In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for data_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(data_path)

In [None]:
# Remove any preinstalled bitsandbytes, then install the latest release.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip uninstall bitsandbytes -y

%pip install -U bitsandbytes

In [None]:
# Confirm that bitsandbytes imports cleanly and show which version the kernel picked up.
import bitsandbytes
print(bitsandbytes.__version__)

In [None]:
# Explicit magic: a bare `pip ...` line only works while IPython automagic is enabled.
%pip install --upgrade bitsandbytes

In [None]:
# `%pip` targets the running kernel's environment (a `!pip` subshell may not).
%pip install -U bitsandbytes

In [None]:
# Training stack. `peft` is added because the import cell below uses it and no other cell installs it.
%pip install transformers datasets torch accelerate peft

In [None]:
# Evaluation extras; `%pip` installs into the running kernel's environment.
%pip install transformers datasets torch pandas evaluate rouge_score

In [None]:
# Hub client used for `login` and for pushing the trained model.
%pip install huggingface_hub

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from transformers  import BitsAndBytesConfig
import torch
from torch.utils.data import DataLoader
from huggingface_hub import login
import torch.nn as nn

# Reading dataset

In [None]:
# Load the training CSV from the attached Kaggle dataset.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/introducing-quail-a-comprehensive-reading-compre/train.csv"
)

In [None]:
# Keep only the three text columns the rest of the notebook uses.
df_cleaned = df.loc[:, ["context", "question", "answers"]]

In [None]:
# Keep rows whose question ends with "?" (missing questions are dropped),
# then renumber the rows from 0.
df_cleaned = (
    df_cleaned
    .loc[df_cleaned['question'].str.endswith('?', na=False)]
    .reset_index(drop=True)
)



In [None]:
# Task 1 (question generation): input is the context, target is the question plus the raw answers text.
df_task1 = df_cleaned.assign(
    input_text="generate question: " + df_cleaned["context"],
    target_text="Question: " + df_cleaned["question"] + " Answer: " + df_cleaned["answers"],
)

In [None]:
# Preview the first five task-1 rows.
df_task1.head(n=5)

In [None]:
# Task 2 (question answering): input is question + context, target is the raw answers text.
df_task2 = df_cleaned.assign(
    input_text="answer question: " + df_cleaned["question"] + " context: " + df_cleaned["context"],
    target_text=df_cleaned["answers"],
)

In [None]:
# Stack both task frames (input/target columns only) into one frame with a fresh 0..N-1 index.
df_combined = pd.concat(
    [task_df[["input_text", "target_text"]] for task_df in (df_task1, df_task2)],
    ignore_index=True,
)

In [None]:
# Display the filtered frame (context / question / answers) as a visual sanity check.
df_cleaned

# Reformatting data

In [None]:
def _first_answer_option(raw, default="not enough information"):
    """Return the first answer option stored in one `answers` cell, or `default` if there is none."""
    import ast
    import re

    # Already a sequence: take its first element.
    if isinstance(raw, (list, tuple)):
        return str(raw[0]) if len(raw) else default
    # NaN / None / any other non-text value carries no usable answer.
    if not isinstance(raw, str):
        return default

    # Find the first quoted Python string literal. Matching whole literals (instead of
    # splitting on "'") keeps options such as "don't know" intact, and works whether the
    # options are separated by commas or only by whitespace.
    match = re.search(r"'(?:[^'\\]|\\.)*'" r'|"(?:[^"\\]|\\.)*"', raw)
    if match is None:
        # No quoted option at all: fall back to the bare text inside the brackets.
        bare = raw.strip("[] \t\r\n")
        return bare or default

    literal = match.group(0)
    try:
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid literal (e.g. an odd escape sequence): use the text between the quotes.
        return literal[1:-1]


def reformat_df_cleaned(df_cleaned, task):
    """Return a copy of `df_cleaned` with a `prompt` column built for the given task.

    Args:
        df_cleaned: DataFrame with `context`, `question` and `answers` columns.
            Each `answers` cell is expected to be the text form of a list of options.
        task: "qa_generation" (context -> question + answer) or
            "qa_answering" (context + question -> answer).

    Returns:
        A new DataFrame with the original columns plus `prompt`; the input is not modified.

    Raises:
        ValueError: if `task` is not one of the two supported values.
    """
    df_reformatted = df_cleaned.copy()
    # NOTE(review): only the first listed option is used as the answer. If `answers`
    # holds multiple-choice options, confirm the first one is the intended target.
    df_reformatted['parsed_answer'] = df_reformatted['answers'].apply(_first_answer_option)

    if task == "qa_generation":
        df_reformatted['prompt'] = (
            "### Context:\n" + df_reformatted['context'] + "\n\n" +
            "### Instruction:\nGenerate a question and its answer based on the context.\n\n" +
            "### Output:\n**Question**: " + df_reformatted['question'] + "\n" +
            "**Answer**: " + df_reformatted['parsed_answer']
        )
    elif task == "qa_answering":
        df_reformatted['prompt'] = (
            "### Context:\n" + df_reformatted['context'] + "\n\n" +
            "### Question:\n" + df_reformatted['question'] + "\n\n" +
            "### Instruction:\nProvide the answer to the question based on the context.\n\n" +
            "### Answer:\n" + df_reformatted['parsed_answer']
        )
    else:
        raise ValueError("Task must be 'qa_generation' or 'qa_answering'")

    # The parsed answer is only a helper for building the prompt.
    df_reformatted = df_reformatted.drop(columns=['parsed_answer'])
    return df_reformatted

In [None]:
# 80/20 train/validation split with a fixed seed.
# NOTE(review): the following cell repeats this exact call, so this cell looks redundant — confirm before removing.
train_df, val_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)

In [None]:
# Split once (80/20, fixed seed), then build prompts for both tasks from the same split.
train_df, val_df = train_test_split(df_cleaned, random_state=42, test_size=0.2)
train_df_qa_gen, val_df_qa_gen = (
    reformat_df_cleaned(split_df, task="qa_generation") for split_df in (train_df, val_df)
)
train_df_qa_answer, val_df_qa_answer = (
    reformat_df_cleaned(split_df, task="qa_answering") for split_df in (train_df, val_df)
)

In [None]:
# Wrap each prompt frame in a Hugging Face `Dataset`.
(
    train_dataset_qa_gen,
    val_dataset_qa_gen,
    train_dataset_qa_answer,
    val_dataset_qa_answer,
) = map(
    Dataset.from_pandas,
    (train_df_qa_gen, val_df_qa_gen, train_df_qa_answer, val_df_qa_answer),
)

In [None]:
# Read the Hub token from the environment (None when HF_TOKEN is unset) and log in with it.
access_token = os.environ.get("HF_TOKEN")
login(token=access_token)

# Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1"
)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize the `prompt` field of a batch, padded/truncated to exactly 512 tokens.

    Uses the notebook-level `tokenizer` and requests PyTorch tensors.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    prompts = examples["prompt"]
    return tokenizer(prompts, **tokenizer_options)

In [None]:
# Tokenize every split in batches, dropping the raw text columns afterwards.
(
    train_qa_gen_tokenized,
    val_qa_gen_tokenized,
    train_qa_answer_tokenized,
    val_qa_answer_tokenized,
) = (
    text_split.map(
        tokenize_function,
        batched=True,
        remove_columns=["prompt", "context", "question", "answers"],
    )
    for text_split in (
        train_dataset_qa_gen,
        val_dataset_qa_gen,
        train_dataset_qa_answer,
        val_dataset_qa_answer,
    )
)

In [None]:
# Make every tokenized split return PyTorch tensors.
for tokenized_split in (
    train_qa_gen_tokenized,
    val_qa_gen_tokenized,
    train_qa_answer_tokenized,
    val_qa_answer_tokenized,
):
    tokenized_split.set_format("torch")

In [None]:
# 4-bit quantization settings passed to the model loader.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
import gc

# Collect unreachable Python objects first so the CUDA tensors they hold are released,
# and only then ask PyTorch to hand its cached (now unused) GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

# Quantization

In [None]:
# Load the base model with the quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# LoRA adapter settings: rank-8 adapters on the query and value projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the loaded model with the LoRA configuration.
model = get_peft_model(model=model, peft_config=lora_config)

In [None]:
# Training hyperparameters and locations shared by the cells below.
batch_size, gradient_accumulation_steps = 1, 16
num_train_epochs = 5
learning_rate = 2e-4
output_dir = "./mistral_finetuned"
device = torch.device("cuda", 0)

In [None]:
# AdamW over the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# One DataLoader per split; only the training splits are shuffled. Each loader gets its
# own causal-LM collator (mlm=False).
(
    train_dataloader_qa_gen,
    val_dataloader_qa_gen,
    train_dataloader_qa_answer,
    val_dataloader_qa_answer,
) = (
    DataLoader(
        tokenized_split,
        batch_size=batch_size,
        shuffle=is_training_split,
        collate_fn=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    for tokenized_split, is_training_split in (
        (train_qa_gen_tokenized, True),
        (val_qa_gen_tokenized, False),
        (train_qa_answer_tokenized, True),
        (val_qa_answer_tokenized, False),
    )
)

In [None]:
import gc

# Collect unreachable Python objects first so the CUDA tensors they hold are released,
# and only then ask PyTorch to hand its cached (now unused) GPU blocks back.
gc.collect()
torch.cuda.empty_cache()

# Training model

In [None]:
def _mean_validation_loss(model, val_dataloader):
    """Return the mean loss of `model` over `val_dataloader` (0.0 for an empty loader)."""
    model.eval()
    val_loss = 0.0
    num_val_batches = 0
    with torch.no_grad():
        for val_batch in val_dataloader:
            outputs = model(
                input_ids=val_batch["input_ids"].to(model.device),
                attention_mask=val_batch["attention_mask"].to(model.device),
                labels=val_batch["labels"].to(model.device),
            )
            val_loss += outputs.loss.item()
            num_val_batches += 1
    return val_loss / num_val_batches if num_val_batches else 0.0


def train_model(model, train_dataloader, val_dataloader, output_dir, task_name, repo_name, access_token):
    """Fine-tune `model` with gradient accumulation; validate, save and push after every epoch.

    Relies on the notebook-level `optimizer`, `tokenizer`, `num_train_epochs` and
    `gradient_accumulation_steps`.

    Args:
        model: model to train; batches are moved to `model.device`.
        train_dataloader: yields dicts with `input_ids`, `attention_mask` and `labels`.
        val_dataloader: same batch format, used for the end-of-epoch validation loss.
        output_dir: base directory for local checkpoints.
        task_name: sub-directory of `output_dir` the model and tokenizer are saved to.
        repo_name: Hugging Face Hub repository the model and tokenizer are pushed to.
        access_token: Hub token used for the push.

    A RuntimeError in a training batch or during validation is reported and skipped,
    and a failed Hub push is reported without stopping training.
    """
    num_batches = len(train_dataloader)
    # Optimizer updates per epoch: one per full accumulation window plus one for a
    # trailing partial window (ceiling division), so the progress total matches `step`.
    updates_per_epoch = -(-num_batches // gradient_accumulation_steps)
    total_steps = updates_per_epoch * num_train_epochs
    step = 0
    optimizer.zero_grad()

    for epoch in range(num_train_epochs):
        # Re-enter training mode every epoch: validation and the Hub push below
        # both leave the model in eval mode.
        model.train()
        total_loss = 0.0
        batches_seen = 0

        for batch_idx, batch in enumerate(train_dataloader):
            try:
                input_ids = batch["input_ids"].to(model.device)
                attention_mask = batch["attention_mask"].to(model.device)
                labels = batch["labels"].to(model.device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                # Track the unscaled loss so the logged average is the true per-batch mean.
                total_loss += outputs.loss.item()
                batches_seen += 1

                # Scale only the gradient contribution for accumulation.
                (outputs.loss / gradient_accumulation_steps).backward()

                window_full = (batch_idx + 1) % gradient_accumulation_steps == 0
                last_batch = (batch_idx + 1) == num_batches
                # Also step on the last batch so a partial window is applied instead of
                # leaking its gradients into the next epoch.
                if window_full or last_batch:
                    optimizer.step()
                    optimizer.zero_grad()
                    step += 1
                    if step % 20 == 0:
                        print(f"Epoch {epoch+1}, Step {step}/{total_steps}, Loss: {total_loss / batches_seen:.4f}")

            except RuntimeError as e:
                print(f"Error during training: {e}")
                # Gradients may be partially accumulated after a failed pass; discard
                # the current window so they never reach an optimizer step.
                optimizer.zero_grad()
                torch.cuda.empty_cache()

        try:
            val_loss = _mean_validation_loss(model, val_dataloader)
            print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}")
        except RuntimeError as e:
            print(f"Error during validation: {e}")
            torch.cuda.empty_cache()

        # Per-epoch checkpoint: each epoch overwrites the same local directory.
        local_save_path = f"{output_dir}/{task_name}"
        model.save_pretrained(local_save_path)
        tokenizer.save_pretrained(local_save_path)
        print(f"Model and tokenizer saved locally to {local_save_path}")

        try:
            model.eval()
            model.push_to_hub(repo_name, token=access_token)
            tokenizer.push_to_hub(repo_name, token=access_token)
            print(f"Model and tokenizer successfully pushed to Hugging Face Hub: https://huggingface.co/{repo_name}")
        except Exception as e:
            print(f"Error pushing to Hugging Face Hub: {e}")
            print("Continuing without pushing to Hub...")

# Hugging Face Hub repositories, one per task.
repo_name_qa_gen, repo_name_qa_answer = (
    "selsayed2003/mistral_qa_gen",
    "selsayed2003/mistral_qa_answer",
)

In [None]:
# Fine-tune on the question-generation prompts.
print("Training QA Generation Model...")
train_model(
    model=model,
    train_dataloader=train_dataloader_qa_gen,
    val_dataloader=val_dataloader_qa_gen,
    output_dir=output_dir,
    task_name="mistral_qa_gen",
    repo_name=repo_name_qa_gen,
    access_token=access_token,
)

# Loading the fine-tuned model from the Hugging Face Hub

In [None]:
# `%pip` installs into the running kernel's environment.
%pip install evaluate

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Base checkpoint and the Hub repository holding the adapter to load on top of it.
base_model_id, adapter_id = (
    "mistralai/Mistral-7B-Instruct-v0.1",
    "selsayed2003/mistral_qa_gen",
)

In [None]:
# Reload the base model with the same 4-bit `bnb_config` used for fine-tuning. Without a
# quantization config the 7B base loads at its default precision, which needs far more
# memory and does not match the quantized base the adapter was trained on.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
# Attach the adapter weights stored in the Hub repository.
model = PeftModel.from_pretrained(model, adapter_id)

tokenizer = AutoTokenizer.from_pretrained(adapter_id)

# Evaluating model using cosine Similarity

In [None]:
import torch
import torch.nn.functional as F

def cosine_sim_torch(pred_emb: torch.Tensor, ref_emb: torch.Tensor) -> float:
    """Cosine similarity of two equally shaped vectors, returned as a Python float.

    Both inputs are L2-normalized along their last dimension and the element-wise
    product is summed over all elements, so for 1-D inputs the result is their
    cosine similarity.
    """
    unit_pred, unit_ref = (F.normalize(vec, p=2, dim=-1) for vec in (pred_emb, ref_emb))
    return (unit_pred * unit_ref).sum().item()


In [None]:
from tqdm import tqdm
import torch.nn.functional as F

def test_qa_gen_model_full(model, tokenizer, raw_dataset, dataloader, device,
                           output_marker="### Output:\n", max_new_tokens=64):
    """Generate an output for every evaluation prompt and score it against the reference.

    Each prompt is cut at `output_marker`: the text up to and including the marker is
    given to the model, and the text after it is the reference the generation is scored
    against. Prompts without the marker are passed whole and compared with themselves.

    Args:
        model: causal LM to evaluate.
        tokenizer: tokenizer matching `model`.
        raw_dataset: untokenized examples aligned index-for-index with `dataloader`
            (which must therefore not shuffle); each needs `context`, `question` and
            `answers`, and its `prompt` field is used when present.
        dataloader: yields batches containing `input_ids`.
        device: device the generation inputs are moved to.
        output_marker: text separating the instruction part from the expected output.
        max_new_tokens: generation budget per example.

    Returns:
        A list with one dict per example: `context`, `question`, `reference_answer`,
        `generated_answer` and `cosine_similarity`.
    """
    model.eval()
    results = []
    total = 0
    total_score = 0.0

    for i, batch in enumerate(tqdm(dataloader, desc="Evaluating QA Generation Model")):
        # Decoding with skip_special_tokens drops the special-token padding, so
        # generation never has to continue after a run of pad tokens.
        decoded_prompts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)

        for j, decoded_prompt in enumerate(decoded_prompts):
            idx = i * dataloader.batch_size + j
            if idx >= len(raw_dataset):
                continue
            example = raw_dataset[idx]

            # Prefer the raw prompt: the tokenized copy may have been truncated before
            # the output section.
            full_text = example["prompt"] if "prompt" in example else decoded_prompt

            head, marker, reference = full_text.partition(output_marker)
            if marker:
                # Withhold the expected output so the model cannot simply copy it.
                gen_prompt = head + marker
            else:
                gen_prompt, reference = full_text, full_text

            encoded = tokenizer(gen_prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                # Greedy decoding keeps the evaluation deterministic (a temperature
                # has no effect unless sampling is enabled).
                generated_ids = model.generate(
                    **encoded,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )

            # Keep only the newly generated tokens; the output starts with the prompt.
            prompt_len = encoded["input_ids"].shape[-1]
            gen = tokenizer.decode(generated_ids[0, prompt_len:], skip_special_tokens=True)

            # NOTE(review): the score below is the cosine of the raw token-id vectors,
            # which is position-sensitive and not a semantic similarity. Consider an
            # embedding-based or overlap metric.
            gen_ids = tokenizer(gen, return_tensors="pt", truncation=True, padding=True)["input_ids"].float()
            ref_ids = tokenizer(reference, return_tensors="pt", truncation=True, padding=True)["input_ids"].float()

            # Zero-pad the shorter sequence so both vectors have the same length.
            max_len = max(gen_ids.shape[-1], ref_ids.shape[-1])
            gen_ids = F.pad(gen_ids, (0, max_len - gen_ids.shape[-1]))
            ref_ids = F.pad(ref_ids, (0, max_len - ref_ids.shape[-1]))

            # squeeze(0) removes only the batch axis, so a one-token sequence stays 1-D.
            score = cosine_sim_torch(gen_ids.squeeze(0), ref_ids.squeeze(0))

            total += 1
            total_score += score

            results.append({
                "context": example["context"],
                "question": example["question"],
                "reference_answer": example["answers"],
                "generated_answer": gen.strip(),
                "cosine_similarity": score
            })

    avg_score = total_score / total if total > 0 else 0.0
    print(f"Average Cosine Similarity over {total} examples: {avg_score:.4f}")
    return results


In [None]:
# Evaluate the reloaded model on the question-generation validation split.
results = test_qa_gen_model_full(
    model, tokenizer, val_dataset_qa_gen, val_dataloader_qa_gen, model.device
)

In [None]:
# Spot-check up to five evaluated examples (indices 10-14); the upper bound is clamped
# so a short result list does not raise IndexError.
for r in range(10, min(15, len(results))):
    print("Context:", results[r]["context"])
    print("Question:", results[r]["question"])
    print("Reference:", results[r]["reference_answer"])
    print("Generated:", results[r]["generated_answer"])
    print(f"Cosine Similarity: {results[r]['cosine_similarity']:.4f}")