# for unzip the file

In [1]:
# import zipfile
# import os

# zip_path = "archive.zip"      # Path to the zip file
# extract_to = "data"   # Folder where files will be extracted

# # Create folder if it does not exist
# os.makedirs(extract_to, exist_ok=True)

# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_to)

# print("Unzipping completed!")


# simple preprocessing step

In [3]:
import pandas as pd
import numpy as np

In [2]:
# df_val = pd.read_csv("data//samsum-validation.csv")
# df_train = pd.read_csv("data//samsum-train.csv")
# df_test = pd.read_csv("data//samsum-test.csv")

In [7]:
df_train.head(4)

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...


In [8]:
df_test.head(3)

Unnamed: 0,id,dialogue,summary
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...


In [9]:
df_val.head(4)

Unnamed: 0,id,dialogue,summary
0,13817023,"A: Hi Tom, are you busy tomorrow’s afternoon?\...",A will go to the animal shelter tomorrow to ge...
1,13716628,Emma: I’ve just fallen in love with this adven...,Emma and Rob love the advent calendar. Lauren ...
2,13829420,Jackie: Madison is pregnant\r\nJackie: but she...,Madison is pregnant but she doesn't want to ta...
3,13819648,Marla: <file_photo>\r\nMarla: look what I foun...,Marla found a pair of boxers under her bed.


In [10]:
def clean_text(text):
    if pd.isna(text):
        return ""
    return text.strip().replace("\n", " ")


In [11]:
for df in [df_train, df_val, df_test]:
    df["dialogue"] = df["dialogue"].apply(clean_text)
    df["summary"] = df["summary"].apply(clean_text)


In [12]:
df_train = df_train.rename(columns={"dialogue": "input_text", "summary": "target_text"})
df_val = df_val.rename(columns={"dialogue": "input_text", "summary": "target_text"})
df_test = df_test.rename(columns={"dialogue": "input_text","summary": "target_text"})  # test has no summary


In [13]:
df_train.to_csv("data/processed_train.csv", index=False)
df_val.to_csv("data/processed_val.csv", index=False)
df_test.to_csv("data/processed_test.csv", index=False)


# Train start from here

In [1]:
import pandas as pd

train_df = pd.read_csv("data/processed_train.csv")
val_df   = pd.read_csv("data/processed_val.csv")
test_df  = pd.read_csv("data/processed_test.csv")

for df in [train_df, val_df, test_df]:
    df["input_text"] = df["input_text"].astype(str)
    df["target_text"] = df["target_text"].astype(str)

    # Replace 'nan', 'None', 'NaN', float nan with empty string
    df["input_text"] = df["input_text"].replace("nan", "").replace("None", "")
    df["target_text"] = df["target_text"].replace("nan", "").replace("None", "")

    df["input_text"] = df["input_text"].fillna("")
    df["target_text"] = df["target_text"].fillna("")



In [2]:
from torch.utils.data import Dataset
import torch

class SummDataset(Dataset):
    def __init__(self, df, tokenizer, max_input=256, max_output=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_input = max_input
        self.max_output = max_output

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        input_text = "summarize: " + str(row["input_text"])
        target_text = str(row["target_text"])

        inputs = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_input
        )

        labels = self.tokenizer(
            target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_output
        )["input_ids"]

        # Replace pad tokens with -100
        labels = [l if l != self.tokenizer.pad_token_id else -100 for l in labels]

        inputs["labels"] = labels

        return {k: torch.tensor(v) for k, v in inputs.items()}


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model_name = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print("T5-base loaded successfully!")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5-base loaded successfully!


In [4]:
# !pip uninstall -y transformers peft
# !pip install transformers==4.44.2 peft==0.12.0 accelerate
# !pip install rouge-score


In [5]:
from torch.utils.data import DataLoader

train_dataset = SummDataset(train_df, tokenizer)
val_dataset = SummDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)


In [6]:
# # from tqdm.auto import tqdm
# # from torch.optim import AdamW
# # from rouge_score import rouge_scorer

# # optimizer = AdamW(model.parameters(), lr=3e-5)
# # scaler = torch.cuda.amp.GradScaler()
# # scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# # accum_steps = 4
# # epochs = 5

# # for epoch in range(epochs):
# #     model.train()
# #     progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")

# #     total_loss = 0

# #     for i, batch in enumerate(progress):
# #         batch = {k: v.to(device) for k, v in batch.items()}

# #         with torch.amp.autocast("cuda"):
# #             outputs = model(**batch)
# #             loss = outputs.loss / accum_steps

# #         scaler.scale(loss).backward()

# #         if (i + 1) % accum_steps == 0:
# #             scaler.step(optimizer)
# #             scaler.update()
# #             optimizer.zero_grad()

# #         total_loss += loss.item()

# #         # -------------------------
# #         # ROUGE every 200 steps
# #         # -------------------------
# #         if i % 200 == 0:
# #             model.eval()
# #             with torch.no_grad():
# #                 summary_ids = model.generate(
# #                     batch["input_ids"][0].unsqueeze(0),
# #                     attention_mask=batch["attention_mask"][0].unsqueeze(0),
# #                     max_new_tokens=64
# #                 )

# #                 pred = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# #                 true = tokenizer.decode(
# #                     [x for x in batch["labels"][0].tolist() if x != -100],
# #                     skip_special_tokens=True
# #                 )

# #                 rouge = scorer.score(true, pred)["rougeL"].fmeasure

# #             model.train()
# #             progress.set_postfix({
# #                 "loss": f"{loss.item():.4f}",
# #                 "rougeL": f"{rouge:.4f}"
# #             })

# #     print(f"Epoch {epoch+1} | Total Loss = {total_loss:.4f}")


# from tqdm.auto import tqdm
# from torch.optim import AdamW
# from rouge_score import rouge_scorer

# optimizer = AdamW(model.parameters(), lr=3e-5)
# scaler = torch.cuda.amp.GradScaler()
# scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# accum_steps = 4
# epochs = 15

# for epoch in range(epochs):
#     model.train()
#     progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")

#     total_loss = 0

#     # -------------------------
#     # TRAINING LOOP
#     # -------------------------
#     for i, batch in enumerate(progress):
#         batch = {k: v.to(device) for k, v in batch.items()}

#         with torch.amp.autocast("cuda"):
#             outputs = model(**batch)
#             loss = outputs.loss / accum_steps

#         scaler.scale(loss).backward()

#         if (i + 1) % accum_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()

#         total_loss += loss.item()

#         # Small training ROUGE sample print every 200 steps
#         if i % 200 == 0:
#             model.eval()
#             with torch.no_grad():
#                 summary_ids = model.generate(
#                     batch["input_ids"][0].unsqueeze(0),
#                     attention_mask=batch["attention_mask"][0].unsqueeze(0),
#                     max_new_tokens=64
#                 )
#                 pred = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#                 true = tokenizer.decode(
#                     [x for x in batch["labels"][0].tolist() if x != -100],
#                     skip_special_tokens=True
#                 )
#                 rouge = scorer.score(true, pred)["rougeL"].fmeasure

#             model.train()
#             progress.set_postfix({
#                 "loss": f"{loss.item():.4f}",
#                 "rougeL": f"{rouge:.4f}"
#             })

#     print(f"\nEpoch {epoch+1} | Training Loss = {total_loss:.4f}")

#     # -------------------------
#     # VALIDATION LOOP
#     # -------------------------
#     model.eval()
#     val_rouge_scores = []
    
#     with torch.no_grad():
#         for batch in tqdm(val_loader, desc="Validation"):
#             input_ids = batch["input_ids"].to(device)
#             attn_mask = batch["attention_mask"].to(device)

#             labels = batch["labels"].tolist()

#             summary_ids = model.generate(input_ids, attention_mask=attn_mask, max_new_tokens=64)

#             for pred_tokens, true_tokens in zip(summary_ids, labels):
#                 pred = tokenizer.decode(pred_tokens, skip_special_tokens=True)
#                 true = tokenizer.decode([x for x in true_tokens if x != -100], skip_special_tokens=True)

#                 score = scorer.score(true, pred)["rougeL"].fmeasure
#                 val_rouge_scores.append(score)

#     avg_val_rouge = sum(val_rouge_scores) / len(val_rouge_scores)
#     print(f"Epoch {epoch+1} | Validation ROUGE-L = {avg_val_rouge:.4f}")



In [16]:
import os
import torch
from tqdm.auto import tqdm
from torch.optim import AdamW
from rouge_score import rouge_scorer

# -----------------------------
# DEVICE FIX
# -----------------------------
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using:", device)

# -----------------------------
# OPTIMIZER, SCALER, ROUGE
# -----------------------------
optimizer = AdamW(model.parameters(), lr=3e-5)
scaler = torch.cuda.amp.GradScaler() if device == "cuda" else None
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

accum_steps = 4
epochs = 10
CHECKPOINT_DIR = "checkpoints"

# Create checkpoint folder
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# ==========================================================
# LOAD CHECKPOINT IF AVAILABLE
# ==========================================================
start_epoch = 0

def load_latest_checkpoint():
    global start_epoch

    folders = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith("epoch_")]
    if not folders:
        print("No checkpoint found. Starting fresh.")
        return

    # Sort descending → try newest first
    folders = sorted(folders, key=lambda x: int(x.split("_")[1]), reverse=True)

    for folder in folders:
        path = os.path.join(CHECKPOINT_DIR, folder)
        model_file = f"{path}/model.pt"

        print(f"\nTrying checkpoint: {folder}")

        try:
            # Try state_dict load
            state = torch.load(model_file, map_location=device)
            model.load_state_dict(state)
            print(f"Loaded state_dict from {folder}")

            # Load optimizer
            optimizer_path = f"{path}/optimizer.pt"
            if os.path.exists(optimizer_path):
                optimizer.load_state_dict(torch.load(optimizer_path, map_location=device))
                print("Loaded optimizer")

            # Load scaler (if CUDA)
            if scaler:
                scaler_path = f"{path}/scaler.pt"
                if os.path.exists(scaler_path):
                    scaler.load_state_dict(torch.load(scaler_path, map_location=device))
                    print("Loaded scaler")

            # Load training state
            state_file = f"{path}/training_state.pt"
            if os.path.exists(state_file):
                ts = torch.load(state_file)
                start_epoch = ts["epoch"] + 1
                print(f"Resuming from epoch {start_epoch}")
            else:
                print("No training_state.pt, starting next epoch")

            return  # SUCCESS → STOP LOOP

        except Exception as e:
            print(f"Failed to load checkpoint {folder}: {e}")

    print("No valid checkpoints found. Starting fresh.")




load_latest_checkpoint()



# ==========================================================
# TRAINING LOOP WITH CHECKPOINT SAVE
# ==========================================================
for epoch in range(start_epoch, epochs):
    model.train()
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    total_loss = 0

    # -------------------------
    # TRAIN LOOP
    # -------------------------
    for i, batch in enumerate(progress):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Only use autocast for CUDA
        if device == "cuda":
            with torch.amp.autocast("cuda"):
                outputs = model(**batch)
                loss = outputs.loss / accum_steps
        else:
            outputs = model(**batch)
            loss = outputs.loss / accum_steps

        if scaler:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        if (i + 1) % accum_steps == 0:
            if scaler:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()

            optimizer.zero_grad()

        total_loss += loss.item()

        # Show ROUGE every 200 steps
        if i % 200 == 0:
            model.eval()
            with torch.no_grad():
                summary_ids = model.generate(
                    batch["input_ids"][0].unsqueeze(0),
                    attention_mask=batch["attention_mask"][0].unsqueeze(0),
                    max_new_tokens=64
                )
                pred = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
                true = tokenizer.decode(
                    [x for x in batch["labels"][0].tolist() if x != -100],
                    skip_special_tokens=True
                )
                rouge = scorer.score(true, pred)["rougeL"].fmeasure

            model.train()
            progress.set_postfix({
                "loss": f"{loss.item():.4f}",
                "rougeL": f"{rouge:.4f}"
            })

    print(f"\nEpoch {epoch+1} | Training Loss = {total_loss:.4f}")


    # -------------------------
    # VALIDATION LOOP
    # -------------------------
    model.eval()
    val_scores = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            labels = batch["labels"].tolist()

            summary_ids = model.generate(input_ids, attention_mask=attn)

            for pred_ids, true_ids in zip(summary_ids, labels):
                pred = tokenizer.decode(pred_ids, skip_special_tokens=True)
                true = tokenizer.decode([x for x in true_ids if x != -100], skip_special_tokens=True)
                val_scores.append(scorer.score(true, pred)["rougeL"].fmeasure)

    avg_rouge = sum(val_scores) / len(val_scores)
    print(f"Epoch {epoch+1} | Validation ROUGE-L = {avg_rouge:.4f}")


    # ======================================================
    # SAVE CHECKPOINT SAFELY
    # ======================================================
    save_path = f"{CHECKPOINT_DIR}/epoch_{epoch}"
    os.makedirs(save_path, exist_ok=True)

    torch.save(model.state_dict(), f"{save_path}/model.pt")
    torch.save(optimizer.state_dict(), f"{save_path}/optimizer.pt")

    if scaler:
        torch.save(scaler.state_dict(), f"{save_path}/scaler.pt")

    torch.save({"epoch": epoch}, f"{save_path}/training_state.pt")

    print(f"Checkpoint saved: {save_path}\n")


Using: cuda

Trying checkpoint: epoch_9
Failed to load checkpoint epoch_9: PytorchStreamReader failed reading zip archive: failed finding central directory

Trying checkpoint: epoch_8


  scaler = torch.cuda.amp.GradScaler() if device == "cuda" else None


Loaded state_dict from epoch_8
Loaded optimizer
Loaded scaler
Resuming from epoch 9


Epoch 10:   0%|          | 0/3683 [00:00<?, ?it/s]


Epoch 10 | Training Loss = nan


Validation:   0%|          | 0/205 [00:00<?, ?it/s]

Epoch 10 | Validation ROUGE-L = 0.3761
Checkpoint saved: checkpoints/epoch_9



In [None]:
# def generate_summary(text):
#     inputs = tokenizer([text], return_tensors="pt", truncation=True, max_length=512).to("cuda")

#     output = model.generate(
#         **inputs,
#         max_new_tokens=128,
#         num_beams=4,
#         length_penalty=2.0,
#         early_stopping=True
#     )

#     return tokenizer.decode(output[0], skip_special_tokens=True)


# for model save

In [8]:
model.save_pretrained("t5_summarizer/")
tokenizer.save_pretrained("t5_summarizer/")
print("Model save sucessfully")

Model save sucessfully


# use save model

In [18]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# --------------------------------------------------
# 1. Load your saved model + tokenizer
# --------------------------------------------------

model_dir = "t5_summarizer/"   # PATH of your saved model folder

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model = model.to(device)
model.eval()

print("Model loaded successfully from:", model_dir)

# --------------------------------------------------
# 2. Summarization Function (Improved)
# --------------------------------------------------
def summarize_text(text):
    """
    Smart summarizer:
    - auto-adjusts summary length based on input size
    - prevents premature stopping
    - avoids repetition
    - ensures output is complete
    """

    # ----------------------------------------
    # 1. Measure input length
    # ----------------------------------------
    input_len = len(tokenizer.encode(text))

    # Auto length selection (optimized)
    if input_len < 80:
        out_len = 30
    elif input_len < 150:
        out_len = 40
    elif input_len < 250:
        out_len = 70
    elif input_len < 350:
        out_len = 110
    else:
        out_len = 130
  # Cap to avoid GPU overload

    # Minimum length (very important)
    min_len = max(30, out_len // 3)

    # ----------------------------------------
    # 2. Tokenize Input
    # ----------------------------------------
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=512   # T5-base limit
    ).to(device)

    # ----------------------------------------
    # 3. Generate Summary with Safe Settings
    # ----------------------------------------
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=out_len,
            min_length=min_len,
            num_beams=4,                 # 4 beams is more stable than 6
            no_repeat_ngram_size=2,      # safer, less cutting
            repetition_penalty=1.3,      # balanced; avoids early cutoff
            length_penalty=0.8,          # allows longer output
            early_stopping=False,
        )

    # ----------------------------------------
    # 4. Decode Output
    # ----------------------------------------
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



Model loaded successfully from: t5_summarizer/


In [19]:

if __name__ == "__main__":
    text = """
   User: My code keeps showing a null pointer exception.
Agent: Which line is causing it?
User: Line 42.
Agent: That means you're accessing an object that wasn't initialized.
User: How do I fix it?
Agent: Add a null check or initialize the object before use.


    """

    print("\n========== SUMMARY ==========\n")
    print(summarize_text(text))
    print("\n=============================\n")



User's code shows a null pointer exception on line 42. Agent advises User to check the object or initialize it before use




In [None]:
the tortoise continued to plod along slowly and steadily, 
eventually crossing the finish line while the rabbit was asleep. 
the rabbit took the lead and, confident in his victory, decided to take a nap.

In [None]:
Slow tortoise challenged rabbit to race. The rabbit took a nap,
believing the turtle was too slow to catch up with him. It eventually 
crossed the finish line while the rabbit was asleep.

In [None]:
your paragraphs should remind your reader that there is a recurrent relationship between your thesis and the information in each paragraph. a working thesis functions like a seed from which your paper.

In [None]:
import pandas as pd

df_test = pd.read_csv("data/processed_test.csv")

df_test["generated_summary"] = df_test["input_text"].apply(summarize_text)

df_test.to_csv("data/test_predictions_t5.csv", index=False)

print("Predictions saved to data/test_predictions_t5.csv")
