In [1]:
# ----------------------------
# 1. Install Dependencies
# ----------------------------
!pip install -q transformers datasets pandas torch
!pip install -q sentencepiece


# ----------------------------
# 2. Import Modules
# ----------------------------
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
import torch
import os

# ----------------------------
# 3. Load and Clean Data
# ----------------------------
df = pd.read_csv("generator_data.csv")
print("Original length:", len(df))
print("Missing values:\n", df.isna().sum())

# Rename columns for clarity
df = df.rename(columns={"target_dna": "input_text", "grna_seq": "target_text"})

# Drop missing or duplicate rows
df = df.dropna(subset=["input_text", "target_text"])
df = df.drop_duplicates(subset=["input_text", "target_text"])

# Save cleaned data
df.to_csv("generator_data_clean.csv", index=False)

# ----------------------------
# 4. Load Dataset
# ----------------------------
dataset = load_dataset("csv", data_files="generator_data_clean.csv")
print("Example:", dataset["train"][0])

# ----------------------------
# 5. Load Tokenizer and Model
# ----------------------------
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ----------------------------
# 6. Preprocessing Function
# ----------------------------
def preprocess(example):
    input_seq = example["input_text"].strip()
    target_seq = example["target_text"].strip()
    model_input = tokenizer("generate gRNA: " + input_seq, padding="max_length", truncation=True, max_length=64)
    label = tokenizer(text_target=target_seq, padding="max_length", truncation=True, max_length=32)
    model_input["labels"] = label["input_ids"]
    return model_input

# Tokenize
tokenized_dataset = dataset.map(preprocess, batched=False)
tokenized_dataset.set_format("torch")

# ----------------------------
# 7. Training Arguments
# ----------------------------

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,        # use 8 if you're hitting memory issues
    num_train_epochs=5,                   # increase epochs if model underfits (loss stuck ~0.8)
    learning_rate=5e-5,                   # explicitly set learning rate
    weight_decay=0.01,                    # help regularize if you're overfitting
    logging_steps=50,                     # lower logging frequency if your dataset is large
    save_strategy="no",                   # keep "no" to avoid disk quota issue
    fp16=torch.cuda.is_available(),
    report_to="none"
)


# ----------------------------
# 8. Define Trainer
# ----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"]
)

# ----------------------------
# 9. Temporary DTensor Patch
# ----------------------------
import builtins
class DTensor: pass
builtins.DTensor = DTensor

# ----------------------------
# 10. Train Model
# ----------------------------
trainer.train()


2025-06-04 20:29:28.452365: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749068968.474384    1747 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749068968.481101    1747 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-04 20:29:28.504579: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Original length: 38864
Missing values:
 input_text     31
target_text     0
dtype: int64


Generating train split: 0 examples [00:00, ? examples/s]

Example: {'input_text': 'CGGCGCTGGTGCCCAGGACGAGGATGGAGATT', 'target_text': 'CGGCGCTGGTGCCCAGGACGAGGATGGAGATT'}


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/33411 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,4.0591
100,1.5114
150,1.132
200,1.0286
250,1.0288
300,0.981
350,0.954
400,0.953
450,0.932
500,0.9425


TrainOutput(global_step=20885, training_loss=0.7529704501476475, metrics={'train_runtime': 1897.6664, 'train_samples_per_second': 88.032, 'train_steps_per_second': 11.006, 'total_flos': 2826190580613120.0, 'train_loss': 0.7529704501476475, 'epoch': 5.0})

In [3]:
# Load model again if needed
# model = T5ForConditionalGeneration.from_pretrained("./results")  # Skip if save_strategy="no"

# Generate from a DNA sequence
dna = "TGCAGATCACGAGGGAAGAGGGGGAAGGGATT"
input_ids = tokenizer("generate gRNA: " + dna, return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)
print("Target DNA:", tokenizer.decode(outputs[0], skip_special_tokens=True))
print("Best gRNA:", 
print("Generated gRNA:", tokenizer.decode(outputs[0], skip_special_tokens=True))



Predicted gRNA: GGCAGATCACGAGGGAAGAGGGAGA


In [4]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("./results")  # or checkpoint path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load shortList.csv
df = pd.read_csv("shortList.csv")
df.columns = df.columns.str.strip()

# Loop through each row and generate gRNA
for idx, row in df.iterrows():
    target = row["Target_sequence"]
    best_grna = row["Guide_sequence"]
    
    input_text = "generate gRNA: " + target
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Target DNA:     {target}")
    print(f"Best gRNA:      {best_grna}")
    print(f"Generated gRNA: {generated}")
    print("-" * 50)


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./results.