In [11]:
%pip install transformers datasets evaluate torch nltk rouge_score

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


## Original Dataset

In [2]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset
dataset = load_dataset("toughdata/quora-question-answer-dataset")
dataset = dataset["train"].train_test_split(test_size=0.2)

## Optional: Preprocessed Dataset

In [21]:
import pandas as pd

# Load preprocessed datasets
df_test_preprocessed = pd.read_csv('/kaggle/input/quora-preprocessed/preprocessed_test_dataset.csv')
df_train_preprocessed = pd.read_csv('/kaggle/input/quora-preprocessed/preprocessed_train_dataset.csv')

from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(df_train_preprocessed)
test_dataset = Dataset.from_pandas(df_test_preprocessed)
dataset_preprocessed= DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


print("Datasets converted back to original format for model training.")

Datasets converted back to original format for model training.


In [22]:
from datasets import Dataset, DatasetDict

# Ensure 'question' and 'answer' columns are of type string
df_train_preprocessed['question'] = df_train_preprocessed['question'].astype(str)
df_train_preprocessed['answer'] = df_train_preprocessed['answer'].astype(str)

df_test_preprocessed['question'] = df_test_preprocessed['question'].astype(str)
df_test_preprocessed['answer'] = df_test_preprocessed['answer'].astype(str)

train_dataset = Dataset.from_pandas(df_train_preprocessed)
test_dataset = Dataset.from_pandas(df_test_preprocessed)

dataset_preprocessed = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(dataset_preprocessed)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 44248
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 11190
    })
})


In [13]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Ensure GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)

# Load the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
pip install sacrebleu


Note: you may need to restart the kernel to use updated packages.


### On original Dataset

In [5]:
from datasets import Dataset
import numpy as np
import nltk
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.metrics import precision_recall_fscore_support

# Define fraction of the dataset to use
fraction = 0.1  # Use 1% of the dataset

# Create smaller subsets of the dataset
train_size = int(len(dataset["train"]) * fraction)
test_size = int(len(dataset["test"]) * fraction)

# Shuffle and select the subset
train_subset = dataset["train"].shuffle(seed=42).select(range(train_size))
test_subset = dataset["test"].shuffle(seed=42).select(range(test_size))

prefix = "answer the question: "

def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    
    labels = tokenizer(text_target=examples["answer"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the subset datasets
tokenized_train_dataset = train_subset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_subset.map(preprocess_function, batched=True)

# Set up Rouge, BLEU, Precision, Recall, and F1 score for evaluation
nltk.download("punkt", quiet=True)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE score
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Compute BLEU score
    decoded_preds_for_bleu = [" ".join(pred.split()) for pred in decoded_preds]
    decoded_labels_for_bleu = [[" ".join(label.split())] for label in decoded_labels]
    bleu_result = bleu_metric.compute(predictions=decoded_preds_for_bleu, references=decoded_labels_for_bleu)

    # Compute Precision, Recall, and F1 on sentence level
    true_labels = [label.split() for label in decoded_labels]
    pred_labels = [pred.split() for pred in decoded_preds]

    # Flatten the lists
    true_labels_flat = [token for sublist in true_labels for token in sublist]
    pred_labels_flat = [token for sublist in pred_labels for token in sublist]
    
    # Ensure the lengths match by truncating to the minimum length
    min_length = min(len(true_labels_flat), len(pred_labels_flat))
    true_labels_flat = true_labels_flat[:min_length]
    pred_labels_flat = pred_labels_flat[:min_length]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, pred_labels_flat, average='weighted', zero_division=0)

    # Combine metrics
    combined_result = {
        **rouge_result,
        "bleu": bleu_result["score"],
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
    return combined_result

# Set up training arguments with logging configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,  # Adjust the logging frequency as needed
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    push_to_hub=False,
    report_to="none",  # You can also set this to "tensorboard" if using TensorBoard
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Map:   0%|          | 0/4512 [00:00<?, ? examples/s]

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Precision,Recall,F1
1,3.3076,3.112738,0.104127,0.020663,0.08366,0.094639,0.00076,0.005032,0.006707,0.004993
2,3.0897,3.092711,0.107191,0.021294,0.086656,0.096488,0.000837,0.005463,0.007766,0.005787




TrainOutput(global_step=1128, training_loss=3.2282187211598066, metrics={'train_runtime': 1208.1143, 'train_samples_per_second': 7.469, 'train_steps_per_second': 0.934, 'total_flos': 546949935267840.0, 'train_loss': 3.2282187211598066, 'epoch': 2.0})

In [24]:
trainer.save_model('./results/final_model')

## On previously imported preprocessed Dataset

In [23]:
from datasets import Dataset
import numpy as np
import nltk
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.metrics import precision_recall_fscore_support

# Define fraction of the dataset to use
fraction = 0.1  # Use 10% of the dataset

# Create smaller subsets of the dataset
train_size = int(len(dataset_preprocessed["train"]) * fraction)
test_size = int(len(dataset_preprocessed["test"]) * fraction)

# Shuffle and select the subset
train_subset = dataset_preprocessed["train"].shuffle(seed=42).select(range(train_size))
test_subset = dataset_preprocessed["test"].shuffle(seed=42).select(range(test_size))

prefix = "answer the question: "

def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    
    labels = tokenizer(text_target=examples["answer"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the subset datasets
tokenized_train_dataset = train_subset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_subset.map(preprocess_function, batched=True)

# Set up Rouge, BLEU, Precision, Recall, and F1 score for evaluation
nltk.download("punkt", quiet=True)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE score
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Compute BLEU score
    decoded_preds_for_bleu = [" ".join(pred.split()) for pred in decoded_preds]
    decoded_labels_for_bleu = [[" ".join(label.split())] for label in decoded_labels]
    bleu_result = bleu_metric.compute(predictions=decoded_preds_for_bleu, references=decoded_labels_for_bleu)

    # Compute Precision, Recall, and F1 on sentence level
    true_labels = [label.split() for label in decoded_labels]
    pred_labels = [pred.split() for pred in decoded_preds]

    # Flatten the lists
    true_labels_flat = [token for sublist in true_labels for token in sublist]
    pred_labels_flat = [token for sublist in pred_labels for token in sublist]
    
    # Ensure the lengths match by truncating to the minimum length
    min_length = min(len(true_labels_flat), len(pred_labels_flat))
    true_labels_flat = true_labels_flat[:min_length]
    pred_labels_flat = pred_labels_flat[:min_length]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, pred_labels_flat, average='weighted', zero_division=0)

    # Combine metrics
    combined_result = {
        **rouge_result,
        "bleu": bleu_result["score"],
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
    return combined_result

# Set up training arguments with logging configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,  # Adjust the logging frequency as needed
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    push_to_hub=False,
    report_to="none",  # You can also set this to "tensorboard" if using TensorBoard
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Map:   0%|          | 0/4424 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Precision,Recall,F1
1,5.0709,4.787854,0.054583,0.012166,0.050109,0.050142,0.007381,0.000966,0.006326,0.001107
2,4.6984,4.738596,0.043129,0.010187,0.040196,0.040377,0.00414,0.000545,0.007778,0.000623




TrainOutput(global_step=1106, training_loss=5.003059306153337, metrics={'train_runtime': 1051.4564, 'train_samples_per_second': 8.415, 'train_steps_per_second': 1.052, 'total_flos': 330384723222528.0, 'train_loss': 5.003059306153337, 'epoch': 2.0})

In [None]:
import torch
torch.cuda.empty_cache()