In [1]:
# =============================
# 🚀 Google Colab T5 Fine-tuning for Chemistry Correction
# =============================
!pip install transformers datasets sentencepiece accelerate -q

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)


In [4]:
# =============================
# 1. Load Dataset
# =============================
csv_path = "/content/chemistry_corrections_large.csv"  # upload this file to Colab first
df = pd.read_csv(csv_path)

# Keep only needed columns
df = df[["incorrect_statement", "corrected_statement"]].dropna()

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [5]:
# =============================
# 2. Train-Test Split
# =============================
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [6]:
# =============================
# 3. Load Tokenizer and Model
# =============================
model_name = "t5-small"  # small + fast, you can try "t5-base" later
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
# Preprocessing function
def preprocess_function(examples):
    inputs = [f"Correct: {s}" for s in examples["incorrect_statement"]]
    targets = examples["corrected_statement"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1640 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

In [8]:
# =============================
# 4. Load Model
# =============================
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# =============================
# 5. Training Configuration
# =============================
training_args = TrainingArguments(
    output_dir="./t5_chemistry_correction",
    eval_strategy="epoch", # Changed from evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # increase if dataset is big
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
)

In [12]:
# =============================
# 6. Initialize Trainer and Train
# =============================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahmedyassine-sablaoui[0m ([33mbahri-nourhene-istic-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.1304,0.050536
2,0.0378,0.021381
3,0.0269,0.014131
4,0.0206,0.011473
5,0.0188,0.010495


TrainOutput(global_step=1025, training_loss=0.24866902639226215, metrics={'train_runtime': 258.1782, 'train_samples_per_second': 31.761, 'train_steps_per_second': 3.97, 'total_flos': 277450693017600.0, 'train_loss': 0.24866902639226215, 'epoch': 5.0})

In [13]:
# =============================
# 7. Save Fine-Tuned Model
# =============================
output_dir = "/content/fine_tuned_t5_chemistry"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Model saved at:", output_dir)

✅ Model saved at: /content/fine_tuned_t5_chemistry


In [15]:
# =============================
# 8. Quick Test
# =============================
from transformers import pipeline

corrector = pipeline("text2text-generation", model=output_dir, tokenizer=output_dir)

test_input = "An anion is a positively charged ion formed by losing electrons.."
result = corrector(f"Correct: {test_input}", max_length=128, num_beams=4)
print("❌ Incorrect:", test_input)
print("✅ Corrected:", result[0]["generated_text"])

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


❌ Incorrect: An anion is a positively charged ion formed by losing electrons..
✅ Corrected: An anion is a negatively charged ion formed by gaining electrons.


In [16]:
!zip -r fine_tuned_t5_chemistry.zip /content/fine_tuned_t5_chemistry


  adding: content/fine_tuned_t5_chemistry/ (stored 0%)
  adding: content/fine_tuned_t5_chemistry/tokenizer.json (deflated 74%)
  adding: content/fine_tuned_t5_chemistry/model.safetensors (deflated 10%)
  adding: content/fine_tuned_t5_chemistry/config.json (deflated 63%)
  adding: content/fine_tuned_t5_chemistry/tokenizer_config.json (deflated 95%)
  adding: content/fine_tuned_t5_chemistry/special_tokens_map.json (deflated 85%)
  adding: content/fine_tuned_t5_chemistry/generation_config.json (deflated 27%)
  adding: content/fine_tuned_t5_chemistry/spiece.model (deflated 48%)
