In [None]:
!pip install -q transformers datasets peft accelerate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m29.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType
import torch


In [None]:
# Load and extract data from JSON
with open("/content/deepseek_json_20250421_7e1987.json", "r") as f:  # Replace path with your path in the code
    raw_data = json.load(f)

# Extract the actual list under "dataset"
data = raw_data["dataset"]

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Train-test split (80% train, 20% test)
dataset = dataset.train_test_split(test_size=0.2)


In [None]:
model_name = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/8 [00:00<?, ? examples/s]



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q", "v"],  # LoRA applied to attention layers
#     lora_dropout=0.1,
#     bias="none",
#     task_type=TaskType.SEQ_2_SEQ_LM
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()

# Try lowering the rank (e.g., from 8 to 4)
lora_config = LoraConfig(
    r=4,  # Lower rank to reduce overfitting
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)


In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/t5_lora_summary",
    eval_strategy="epoch",  # You can set it to "steps" for more frequent evaluations
    logging_strategy="steps",  # Use 'steps' to log training every certain number of steps
    logging_steps=50,  # Log every 50 steps
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Enable mixed precision on GPU
    logging_dir="/content/logs",  # Specify where to store logs
    logging_first_step=True,  # Log the first step as well
    report_to="tensorboard",  # Use TensorBoard for visualization if needed
    save_steps=100,  # Save model checkpoint every 100 steps
)



In [None]:
import os

# Tell Transformers to ignore W&B
os.environ["WANDB_DISABLED"] = "true"

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,  # After 50 epochs loss did'nt changed
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",  # 👈 This disables W&B and other integrations
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,17.711287
2,No log,17.37961
3,No log,16.739428
4,No log,16.739428
5,No log,15.856818
6,No log,15.192581
7,No log,14.541286
8,No log,13.389382
9,No log,12.534728
10,No log,11.293706


KeyboardInterrupt: 

In [None]:
# Try generating summary from one of the test samples
sample_input = "summarize: " + dataset["test"][0]["article"]
inputs = tokenizer(sample_input, return_tensors="pt", truncation=True, max_length=512).to(model.device)

summary_ids = model.generate(**inputs, max_new_tokens=150)
print("Generated Summary:", tokenizer.decode(summary_ids[0], skip_special_tokens=True))
print("Actual Summary:", dataset["test"][0]["summary"])


In [None]:
!pip install datasets




In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=81997a4ed3e467ad581627be1a23ca36465e2c5f20654e90338dad3130f1b9fe
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Step 1: Load the Pretrained T5-small Model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Step 2: Set up LoRA for Fine-Tuning
lora_config = LoraConfig(
    r=4,  # LoRA rank (you can tune this)
    lora_alpha=32,
    target_modules=["q", "v"],  # Only apply LoRA to attention modules
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Step 3: Load Dataset from JSON
dataset_path = "/content/deepseek_json_20250421_7e1987.json"

# Important: specify the nested field 'dataset'
dataset = load_dataset("json", data_files=dataset_path, field="dataset", split="train")

# (Optional) Split into train and validation if needed
# dataset = dataset.train_test_split(test_size=0.2)
# train_dataset = dataset["train"]
# val_dataset = dataset["test"]
train_dataset = dataset
val_dataset = dataset  # Using same data for simplicity (not ideal)

# Step 4: Preprocess the Dataset
def preprocess_function(examples):
    inputs = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Step 5: Set up Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save every epoch to match evaluation
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=50,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    logging_first_step=True
)

# Step 6: Load ROUGE Metric
rouge = evaluate.load("rouge")
# Compute ROUGE during evaluation
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline between each sentence
    decoded_preds = ["\n".join(pred.strip().split('. ')) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split('. ')) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

# Step 7: Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 8: Start Training
trainer.train()

# Step 9: Final Evaluation (Optional)
print("Final evaluation on the dataset:")
eval_results = trainer.evaluate()
print(eval_results)


trainable params: 147,456 || all params: 60,654,080 || trainable%: 0.2431


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,17.9628,18.287567,25.6425,7.1152,22.3028,23.0031
2,17.9628,15.447701,25.6425,7.1152,22.3028,23.0031
3,17.9628,12.221801,28.2716,8.6899,23.5801,24.8324
4,15.5071,8.007557,28.4377,8.1693,22.8667,23.6343
5,15.5071,1.501908,12.3598,2.2857,9.9337,10.6876
6,15.5071,0.484052,0.0,0.0,0.0,0.0
7,5.0231,0.525792,0.0,0.0,0.0,0.0
8,5.0231,0.610551,0.0,0.0,0.0,0.0
9,5.0231,0.582751,0.0,0.0,0.0,0.0
10,0.7676,0.517315,0.0,0.0,0.0,0.0


Final evaluation on the dataset:


{'eval_loss': 0.13723790645599365, 'eval_rouge1': 45.187, 'eval_rouge2': 24.5045, 'eval_rougeL': 42.2808, 'eval_rougeLsum': 41.9227, 'eval_runtime': 1.2491, 'eval_samples_per_second': 8.006, 'eval_steps_per_second': 2.402, 'epoch': 50.0}


In [None]:
def summarize_text(text, max_input_length=512, max_new_tokens=50):
    model.eval()
    input_text = "summarize: " + text
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    ).to(model.device)

    # Use keyword argument
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=max_new_tokens)

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary


In [None]:
example = (
    "Scientists at MIT have developed a new battery technology that charges in just 5 minutes. "
    "The sodium-ion design avoids rare earth metals making it more sustainable. Testing shows "
    "these batteries maintain 95% capacity after 1000 cycles. Several automakers have already licensed "
    "the technology for electric vehicles. Researchers estimate commercial production could begin within 18 months. "
    "This breakthrough could address key barriers to widespread EV adoption."
)

print("Summary:", summarize_text(example))


Summary: MIT scientists develop sodium-ion battery that could be used in electric vehicles.


In [None]:
# Testing the base and fine tuned :


import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load models (make sure these are defined in your environment)
base_model_name = "t5-small"
base_tokenizer = T5Tokenizer.from_pretrained(base_model_name)
base_model = T5ForConditionalGeneration.from_pretrained(base_model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Your fine-tuned model and tokenizer (should be defined from your training code)
# model = ... (your fine-tuned PEFT model)
# tokenizer = ... (your tokenizer)

def compare_models(text, max_input_len=512, max_output_len=50):
    input_text = "summarize: " + text

    # Tokenize for both models
    inputs_base = base_tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_len).to(base_model.device)
    inputs_ft = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_len).to(model.device)

    # Generate summaries
    with torch.no_grad():
        summary_base = base_model.generate(
            input_ids=inputs_base["input_ids"],
            attention_mask=inputs_base["attention_mask"],
            max_length=max_output_len,
            num_return_sequences=1
        )

        summary_ft = model.generate(
            input_ids=inputs_ft["input_ids"],
            attention_mask=inputs_ft["attention_mask"],
            max_length=max_output_len,
            num_return_sequences=1
        )

    # Decode results
    decoded_base = base_tokenizer.decode(summary_base[0], skip_special_tokens=True)
    decoded_ft = tokenizer.decode(summary_ft[0], skip_special_tokens=True)

    return decoded_base, decoded_ft

# Example usage
example = "Scientists at MIT have developed a new battery technology that charges in just 5 minutes. The sodium-ion design avoids rare earth metals making it more sustainable. Testing shows these batteries maintain 95% capacity after 1000 cycles. Several automakers have already licensed the technology for electric vehicles. Researchers estimate commercial production could begin within 18 months. This breakthrough could address key barriers to widespread EV adoption."

base, fine_tuned = compare_models(example)

print("🔹 Base T5:", base)
print("🔸 Fine-Tuned T5:", fine_tuned)

🔹 Base T5: sodium-ion design avoids rare earth metals making it more sustainable. tests show these batteries maintain 95% capacity after 1000 cycles.
🔸 Fine-Tuned T5: MIT scientists develop sodium-ion battery that could be used in electric vehicles.


In [None]:
from rouge_score import rouge_scorer

# Reference (ground truth)
reference = "MIT researchers create fast-charging sodium-ion battery that could revolutionize electric vehicles."

# Model outputs
base_output = "sodium-ion design avoids rare earth metals making it more sustainable. tests show these batteries maintain 95% capacity after 1000 cycles."
finetuned_output = "MIT scientists develop sodium-ion battery that could be used in electric vehicles."

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute scores for Base T5
base_scores = scorer.score(reference, base_output)

# Compute scores for Fine-Tuned T5
finetuned_scores = scorer.score(reference, finetuned_output)

print("🔹 Base T5 ROUGE Scores:")
print(f"ROUGE-1: {base_scores['rouge1'].fmeasure:.3f}")
print(f"ROUGE-2: {base_scores['rouge2'].fmeasure:.3f}")
print(f"ROUGE-L: {base_scores['rougeL'].fmeasure:.3f}")

print("\n🔸 Fine-Tuned T5 ROUGE Scores:")
print(f"ROUGE-1: {finetuned_scores['rouge1'].fmeasure:.3f}")
print(f"ROUGE-2: {finetuned_scores['rouge2'].fmeasure:.3f}")
print(f"ROUGE-L: {finetuned_scores['rougeL'].fmeasure:.3f}")

🔹 Base T5 ROUGE Scores:
ROUGE-1: 0.176
ROUGE-2: 0.062
ROUGE-L: 0.176

🔸 Fine-Tuned T5 ROUGE Scores:
ROUGE-1: 0.615
ROUGE-2: 0.417
ROUGE-L: 0.615
