In [None]:
!pip install evaluate rouge rouge_score

In [None]:
from kaggle_secrets import UserSecretsClient

# Retrieve the Hugging Face API token from Kaggle secrets.
# The secret is looked up under the label "HF_TOKEN"; `hf_token` is then passed
# to the from_pretrained / login / push_to_hub calls in later cells.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
import torch,nltk,spacy,string,transformers,json,evaluate,warnings
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")

In [None]:
# nltk.download('punkt')          # Tokenizer models
# nltk.download('wordnet')        # WordNet lexical database
# nltk.download('omw-1.4')        # Open Multilingual WordNet
# nltk.download('averaged_perceptron_tagger')  # POS tagger
# nltk.download('stopwords')      # Common stop words
# nltk.download('vader_lexicon')  # Sentiment analysis lexicon

In [None]:
# --- Hyperparameters and output location -------------------------------------
Q_LEN = 256   # max token length of the encoded (question, context) input
T_LEN = 32    # max token length of the encoded answer target
BATCH_SIZE = 4
EPOCHS = 5
OUTPUT_DIR = '/kaggle/tmp/'
OUTPUT_MODEL_NAME = 'AQG-finetuned-squad-lite'

# --- Checkpoint, tokenizer and device -----------------------------------------
model_name = "aayeshanakarmi/T5-QG-finetuned-squad"
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
TOKENIZER = T5Tokenizer.from_pretrained(model_name, use_auth_token=hf_token)

# Load the starting checkpoint from the Hugging Face model hub.
# (Alternative kept from the original notebook: start from "t5-small" instead.)
MODEL = T5ForConditionalGeneration.from_pretrained(
    model_name,
    use_auth_token=hf_token,
    return_dict=True,
)
MODEL.to(DEVICE)

# The optimizer is created once here and reused by the training-loop cell.
OPTIMIZER = Adam(MODEL.parameters(), lr=1e-5)

In [None]:
# Loading the data
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open('/kaggle/input/squad-20/train-v2.0.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Extracting context, question, and answers from the dataset

def prepare_data(data, keep_impossible=False):
    """Flatten a SQuAD-style dictionary into a list of QA records.

    Args:
        data: Parsed SQuAD JSON, read as
            ``data["data"][i]["paragraphs"][j]["qas"][k]``.
        keep_impossible: When False (default), questions flagged
            ``is_impossible`` (or carrying no answers) are skipped because
            they have no answer text. When True they are kept with an
            empty-string answer.

    Returns:
        A list of dicts with the keys ``"context"``, ``"question"`` and
        ``"answer"``, where ``"answer"`` is the text of the first listed answer.
    """
    articles = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]

            for qa in paragraph["qas"]:
                answers = qa.get("answers") or []
                # A missing "is_impossible" key is treated as answerable.
                answerable = not qa.get("is_impossible", False) and len(answers) > 0

                if answerable:
                    answer = answers[0]["text"]
                elif keep_impossible:
                    answer = ""
                else:
                    # Without this branch the answer of the previously visited
                    # question was silently reused for unanswerable questions.
                    continue

                articles.append(
                    {"context": context, "question": qa["question"], "answer": answer}
                )

    return articles

In [None]:
# Flatten the raw SQuAD dictionary and wrap the resulting records in a
# DataFrame in one step; the bare name on the last line displays the frame.
data = pd.DataFrame(prepare_data(data))
data

In [None]:
# Optional quick-run subsample (disabled):
# data = data.sample(n=1000, random_state=42)
# data = data.reset_index(drop=True)

# Hold out 10% of the rows as a test set; both parts get a fresh 0..n-1 index.
data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.1, random_state=42)
)

print(f"Data size: {len(data)}")
print(f"Testing data size: {len(test_data)}")

In [None]:
class QA_Dataset(Dataset):
    """Torch dataset that tokenizes (question, context) -> answer examples.

    Each item encodes the question/context pair as the model input and the
    answer as the label sequence, with padded label positions set to -100.

    Args:
        tokenizer: Callable tokenizer whose result provides ``"input_ids"``
            and ``"attention_mask"`` as plain lists.
        dataframe: DataFrame with ``question``, ``context`` and ``answer``
            columns. Rows are addressed by position, so the index does not
            have to be 0..n-1 (e.g. a frame straight out of a split).
        q_len: Maximum/padded token length of the question+context input.
        t_len: Maximum/padded token length of the answer target.
    """

    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        """Return the number of examples."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the tensors for the example at position ``idx``.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``labels`` and
            ``decoder_attention_mask`` as ``torch.long`` tensors.
        """
        # Positional access: label-based `series[idx]` raises KeyError (or picks
        # the wrong row) whenever the frame's index is not 0..n-1.
        question = self.questions.iloc[idx]
        context = self.context.iloc[idx]
        answer = self.answer.iloc[idx]

        # Tokenizing the question and context pair with truncation
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len,
                                            padding="max_length", truncation=True,
                                            add_special_tokens=True)

        # Tokenizing the answer with truncation
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len,
                                          padding="max_length", truncation=True,
                                          add_special_tokens=True)

        # Use the tokenizer's own padding id; fall back to 0 (the value that was
        # previously hard-coded) when the tokenizer does not expose one.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        # Preparing the labels: padding positions are replaced by -100.
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == pad_id] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }


In [None]:
# Dataloader
# Split into train/validation frames and give each a fresh 0..n-1 index so the
# datasets below can address their rows directly.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

# Dataset over the full frame, kept under its original name for any other cell
# that still refers to it. It is no longer used by the loaders.
qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

# One dataset per split. Previously both loaders shared `qa_dataset` with
# RandomSampler(<split>.index); RandomSampler only uses the length of its
# argument, so it yielded positions 0..len(split)-1 of the full frame. The
# split was therefore ignored and every validation row was also a training row.
train_dataset = QA_Dataset(TOKENIZER, train_data, Q_LEN, T_LEN)
val_dataset = QA_Dataset(TOKENIZER, val_data, Q_LEN, T_LEN)

train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [None]:
# Number of batches per epoch in the training and validation loaders.
n_train_batches, n_val_batches = len(train_loader), len(val_loader)
print(n_train_batches, n_val_batches)

In [None]:
import logging

# Suppress log records of severity WARNING and below from here on.
logging.disable(logging.WARNING)

# Keys each batch dict provides; they are forwarded to the model by name.
_BATCH_KEYS = ("input_ids", "attention_mask", "labels", "decoder_attention_mask")

# Per-epoch average losses, read by the plotting cell.
train_losses, val_losses = [], []

for epoch in range(EPOCHS):
    # ---- training pass ----
    MODEL.train()
    train_loss, train_batch_count = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{EPOCHS}"):
        model_inputs = {key: batch[key].to(DEVICE) for key in _BATCH_KEYS}
        outputs = MODEL(**model_inputs)

        # One optimisation step per batch.
        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()

        train_loss += outputs.loss.item()
        train_batch_count += 1

    avg_train_loss = train_loss / train_batch_count
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradients, no parameter updates) ----
    MODEL.eval()
    val_loss, val_batch_count = 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{EPOCHS}"):
            model_inputs = {key: batch[key].to(DEVICE) for key in _BATCH_KEYS}
            outputs = MODEL(**model_inputs)

            val_loss += outputs.loss.item()
            val_batch_count += 1

    avg_val_loss = val_loss / val_batch_count
    val_losses.append(avg_val_loss)

    # One summary line per epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} -> Train loss: {avg_train_loss:.4f}\tValidation loss: {avg_val_loss:.4f}")


In [None]:
# Plot the per-epoch average train/validation losses from the training loop.
epoch_axis = range(1, EPOCHS+1)

plt.figure(figsize=(10, 5))
for loss_series, series_label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
    plt.plot(epoch_axis, loss_series, marker='o', label=series_label)
plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory
# (OUTPUT_DIR followed directly by OUTPUT_MODEL_NAME; OUTPUT_DIR ends with "/").
MODEL.save_pretrained(f'{OUTPUT_DIR}{OUTPUT_MODEL_NAME}')
TOKENIZER.save_pretrained(f'{OUTPUT_DIR}{OUTPUT_MODEL_NAME}')

In [None]:
# Re-read the Hugging Face token from Kaggle secrets. This repeats the lookup
# made near the top of the notebook, so this cell also works when run alone.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token read from Kaggle
# secrets (`hf_token`); nothing needs to be pasted or hard-coded here.
login(token=hf_token)


In [None]:
# Save the fine-tuned model and tokenizer locally, into a folder in the current
# working directory named after the Hub repository pushed to in the next cell.
MODEL.save_pretrained("T5-QuestionAnswering-squad-10")
TOKENIZER.save_pretrained("T5-QuestionAnswering-squad-10")

In [None]:
# Upload the model and tokenizer to the Hub repository
# "T5-QuestionAnswering-squad-10", authenticating with `hf_token` from Kaggle
# secrets (no token has to be pasted here).
# NOTE(review): use_temp_dir=False presumably reuses the local folder of the
# same name written by the previous cell — confirm for the installed
# transformers version.
MODEL.push_to_hub("T5-QuestionAnswering-squad-10", use_auth_token=hf_token, use_temp_dir=False)
TOKENIZER.push_to_hub("T5-QuestionAnswering-squad-10", use_auth_token=hf_token, use_temp_dir=False)

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim.lr_scheduler import StepLR
import warnings
warnings.filterwarnings("ignore")

# Training Parameters
Q_LEN = 512  # max token length of the model input (instruction + question + context)
T_LEN = 128  # max token length of the target answer
BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-4
OUTPUT_DIR = './models/'
OUTPUT_MODEL_NAME = 'T5-answer-generation'

# Model Configuration
model_name = "t5-small"  # any other T5 checkpoint name can be used here
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
TOKENIZER = T5Tokenizer.from_pretrained(model_name)
MODEL = T5ForConditionalGeneration.from_pretrained(
    model_name,
    return_dict=True,
)
MODEL.to(DEVICE)

# Data Preparation
def prepare_data(csv_path):
    """Read the QA CSV and build the model input/target text columns.

    Args:
        csv_path: Path to a CSV file with ``Question``, ``Context`` and
            ``Answer`` columns (read as latin-1).

    Returns:
        DataFrame with two columns: ``input_text`` (instruction prefix,
        question, `` [SEP] ``, context) and ``target_text`` (the answer).
        Rows missing any of the three source fields are dropped.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    df = pd.read_csv(csv_path, encoding='latin-1')  # Adjust encoding as needed

    # A missing Question/Context would turn the concatenation below into NaN,
    # and a missing Answer would stay NaN; the dataset stringifies both to the
    # literal text "nan", so such rows are removed instead of being trained on.
    df = df.dropna(subset=['Question', 'Context', 'Answer']).reset_index(drop=True)

    df['input_text'] = (
        'Instruction: Generate a detailed answer to the question using the provided context. '
        'Provide the answer only without including any additional content. '
        + df['Question'].astype(str) + ' [SEP] ' + df['Context'].astype(str)
    )
    df['target_text'] = df['Answer']
    return df[['input_text', 'target_text']]

# Custom Dataset class
class AnswerGenerationDataset(Dataset):
    """Dataset that turns ``input_text`` / ``target_text`` rows into tensors.

    The two columns are copied out of ``dataframe`` as arrays at construction
    time; each item is tokenized on access, padded to ``q_len`` (input) and
    ``t_len`` (target).
    """

    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len, self.t_len = q_len, t_len
        self.inputs = dataframe['input_text'].values
        self.targets = dataframe['target_text'].values

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def _encode(self, text, max_len):
        """Tokenize ``text`` into fixed-length PyTorch tensors of ``max_len``."""
        return self.tokenizer(
            text,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return input ids/mask, labels and decoder mask for row ``idx``."""
        source = self._encode(str(self.inputs[idx]), self.q_len)
        target = self._encode(str(self.targets[idx]), self.t_len)

        # Labels are the target ids with every padding position set to -100.
        target_ids = target['input_ids']
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        # squeeze() drops the batch dimension added by return_tensors="pt".
        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': labels.squeeze(),
            'decoder_attention_mask': target['attention_mask'].squeeze(),
        }

# Training Loop with Dynamic Learning Rate Adjustment
def train_model(train_loader, val_loader, model, optimizer, scheduler, num_epochs):
    """Train ``model`` and record the average loss of every epoch.

    Each epoch runs one optimisation pass over ``train_loader``, one
    gradient-free pass over ``val_loader``, then steps ``scheduler`` once and
    prints both averages. The model is left in eval mode on return.

    Args:
        train_loader: Iterable of batch dicts with ``input_ids``,
            ``attention_mask``, ``labels`` and ``decoder_attention_mask``.
        val_loader: Iterable of batch dicts with the same keys.
        model: Module called with those four keyword arguments whose output
            exposes ``.loss``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.

    Returns:
        ``(train_losses, val_losses)``: two lists with one average per epoch.

    Raises:
        ZeroDivisionError: If either loader yields no batches.
    """
    # Move batches to the device the given model actually lives on rather than
    # assuming it matches the module-level DEVICE; DEVICE is only the fallback
    # for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    def _forward(batch):
        """Move one batch to ``device`` and return the model output."""
        return model(input_ids=batch['input_ids'].to(device),
                     attention_mask=batch['attention_mask'].to(device),
                     labels=batch['labels'].to(device),
                     decoder_attention_mask=batch['decoder_attention_mask'].to(device))

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        train_batch_count = 0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
            loss = _forward(batch).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batch_count += 1

        avg_train_loss = train_loss / train_batch_count
        train_losses.append(avg_train_loss)

        # Validation phase
        model.eval()
        val_loss = 0
        val_batch_count = 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
                val_loss += _forward(batch).loss.item()
                val_batch_count += 1

        avg_val_loss = val_loss / val_batch_count
        val_losses.append(avg_val_loss)

        # Step scheduler at the end of each epoch
        scheduler.step()

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Average training loss: {avg_train_loss:.4f}")
        print(f"Average validation loss: {avg_val_loss:.4f}")

    return train_losses, val_losses

# Answer Generation Function
def generate_answer(model, tokenizer, question, context, max_length=128, instruction=None):
    """Generate an answer to ``question`` from ``context`` with beam search.

    The prompt is built the same way as the training inputs in this notebook
    (instruction prefix + question + `` [SEP] `` + context). The earlier
    ``"generate answer: "`` prefix never appeared in the training data, so the
    model was being queried with a prompt format it had not been trained on.

    Args:
        model: Seq2seq model providing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        question: Question text.
        context: Passage the answer should be drawn from.
        max_length: Maximum length of the generated sequence.
        instruction: Optional prompt prefix; defaults to the prefix used when
            building the training inputs.

    Returns:
        The decoded answer string with special tokens removed.
    """
    if instruction is None:
        instruction = (
            'Instruction: Generate a detailed answer to the question using the provided context. '
            'Provide the answer only without including any additional content. '
        )

    input_text = f"{instruction}{question} [SEP] {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Send the inputs to wherever the given model lives; the module-level
    # DEVICE is only a fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    inputs = inputs.to(device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Main Function
def main():
    """Run the Quizard pipeline: load, split, train, plot, save, sample.

    Uses the module-level configuration (TOKENIZER, MODEL, Q_LEN, T_LEN,
    BATCH_SIZE, LEARNING_RATE, EPOCHS, OUTPUT_DIR, OUTPUT_MODEL_NAME).
    """
    # Data: read the CSV and hold out 20% for validation.
    qa_frame = prepare_data('/kaggle/input/quizard-dataset-3000/Quizard_custom_dataset.csv')
    train_frame, val_frame = train_test_split(qa_frame, test_size=0.2, random_state=42)

    # Datasets and loaders (only the training loader shuffles).
    train_dataset = AnswerGenerationDataset(TOKENIZER, train_frame, Q_LEN, T_LEN)
    val_dataset = AnswerGenerationDataset(TOKENIZER, val_frame, Q_LEN, T_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Optimisation: Adam with the learning rate multiplied by 0.9 every epoch.
    optimizer = Adam(MODEL.parameters(), lr=LEARNING_RATE)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

    train_losses, val_losses = train_model(
        train_loader, val_loader, MODEL, optimizer, scheduler, EPOCHS
    )

    # Loss curves.
    epoch_axis = range(1, EPOCHS+1)
    plt.figure(figsize=(10, 5))
    for loss_series, series_label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
        plt.plot(epoch_axis, loss_series, marker='o', label=series_label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss Over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Persist model and tokenizer side by side.
    save_path = f'{OUTPUT_DIR}{OUTPUT_MODEL_NAME}'
    MODEL.save_pretrained(save_path)
    TOKENIZER.save_pretrained(save_path)

    # Smoke test on one hand-written example.
    sample_question = "What happens during photosynthesis?"
    sample_context = "The process of photosynthesis occurs in the chloroplasts of plant cells. During photosynthesis, light energy is converted into chemical energy, stored as glucose, and oxygen is released."

    generated_answer = generate_answer(MODEL, TOKENIZER, sample_question, sample_context)
    print("\nSample Generation:")
    print(f"Question: {sample_question}")
    print(f"Context: {sample_context}")
    print(f"Generated Answer: {generated_answer}")

# Run the whole pipeline only when this code executes as the main module.
if __name__ == "__main__":
    main()


In [None]:
# Re-read the Hugging Face token from Kaggle secrets (same lookup as at the top
# of the notebook) so the save/push cells below can run on their own.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
# Save the fine-tuned model and tokenizer locally, into a folder in the current
# working directory named after the Hub repository pushed to in the next cell.
MODEL.save_pretrained("T5-AnswerGeneration-Quizard-5")
TOKENIZER.save_pretrained("T5-AnswerGeneration-Quizard-5")

In [None]:
# Upload the model and tokenizer to the Hub repository
# "T5-AnswerGeneration-Quizard-5", authenticating with `hf_token` from Kaggle
# secrets (no token has to be pasted here).
# NOTE(review): use_temp_dir=False presumably reuses the local folder of the
# same name written by the previous cell — confirm for the installed
# transformers version.
MODEL.push_to_hub("T5-AnswerGeneration-Quizard-5", use_auth_token=hf_token, use_temp_dir=False)
TOKENIZER.push_to_hub("T5-AnswerGeneration-Quizard-5", use_auth_token=hf_token, use_temp_dir=False)

In [None]:
#     Context="The Great Barrier Reef is the world's largest coral reef system, located in the Coral Sea, off the coast of Queensland, Australia. It is composed of over 2,900 individual reefs and 900 islands stretching over 2,300 kilometers. The reef is known for its biodiversity, hosting countless marine species, and is a popular destination for snorkeling and diving enthusiasts. However, it faces threats from climate change, overfishing, and pollution.",
#     Question="What are some of the major threats faced by the Great Barrier Reef?",
#     Answer=""

In [None]:
# Re-read the Hugging Face token from Kaggle secrets so the inference cells
# below can be run without executing the training part of the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token read from Kaggle
# secrets (`hf_token`); nothing needs to be pasted or hard-coded here.
login(token=hf_token)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Pick the device first, then load the tokenizer and the fine-tuned checkpoint
# from the Hugging Face Hub and move the model onto that device.
model_name = "aayeshanakarmi/T5-AnswerGeneration-Quizard-5"  # Hub repository id
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): use_auth_token=True presumably picks up the credentials stored
# by the login() cell above — confirm for the installed transformers version.
tokenizer = T5Tokenizer.from_pretrained(model_name, use_auth_token=True)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    use_auth_token=True,
)
model.to(device)

# Function for generating the answer
def generate_answer(question, context, max_length=128):
    """Generate an answer to ``question`` from ``context`` with beam search.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The prompt
    follows the format used to build the training inputs in this notebook
    (instruction prefix + question + `` [SEP] `` + context) instead of the
    ``"generate answer: "`` prefix, which the training data never contained.
    The single input is no longer padded to 512 tokens: with one sequence there
    is nothing to align, so the padding only added masked positions to process.

    Args:
        question: Question text.
        context: Passage the answer should be drawn from.
        max_length: Maximum length of the generated sequence.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = (
        'Instruction: Generate a detailed answer to the question using the provided context. '
        'Provide the answer only without including any additional content. '
        f"{question} [SEP] {context}"
    )
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the loaded model on one hand-written question/context pair.
sample_question = "What are some of the major threats faced by the Great Barrier Reef?"
sample_context = (
    "The Great Barrier Reef is the world's largest coral reef system, located in the Coral Sea, "
    "off the coast of Queensland, Australia. "
    "It is composed of over 2,900 individual reefs and 900 islands stretching over 2,300 kilometers. "
    "The reef is known for its biodiversity, hosting countless marine species, and is a popular "
    "destination for snorkeling and diving enthusiasts. "
    "However, it faces threats from climate change, overfishing, and pollution."
)

generated_answer = generate_answer(sample_question, sample_context)
print("Generated Answer:", generated_answer)