In [None]:
!pip install faiss-cpu

In [None]:
!pip install -U bitsandbytes -q

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the value stored under the "HF_API_KEY" secret name so the token is
# never written into the notebook itself.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_API_KEY")



In [None]:
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub using the secret read from Kaggle.
login(secret_value_0)

In [None]:
!pip install evaluate


# 1. Imports and Libraries

In [None]:
import pandas as pd
import glob
import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model
import evaluate

# 2. Loading and Extracting Data from JSON Papers

In [None]:

def load_papers(directory="/kaggle/input/assignementdataset"):
    """Load every ``*.json`` paper found directly inside ``directory``.

    Args:
        directory: Folder that holds one JSON document per paper.

    Returns:
        pandas.DataFrame with one row per successfully parsed file and the
        columns ``file`` (file name) and ``text`` (text produced by
        ``extract_text_from_json``). Both columns are present even when no
        file could be loaded.

    Files that cannot be decoded or parsed are reported on stdout and skipped.
    """
    # Path.glob yields entries in arbitrary order; sorting keeps row positions
    # stable between runs.
    json_files = sorted(f for f in Path(directory).glob("*.json") if f.is_file())
    all_data = []
    for file in tqdm(json_files, desc="Loading JSON files"):
        try:
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(file, "r", encoding="utf-8") as f:
                doc = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error parsing {file.name}: {e}")
            continue
        all_data.append({"file": file.name, "text": extract_text_from_json(doc)})
    # Explicit columns keep the schema intact for an empty result.
    return pd.DataFrame(all_data, columns=["file", "text"])

def extract_text_from_json(doc):
    """Flatten a decoded JSON paper into a single string.

    Args:
        doc: Any value returned by ``json.load``.

    Returns:
        * For a dict with a ``body_text`` list: the ``text`` strings of its
          dict entries joined by single spaces.
        * For any other dict: its string values joined by single spaces.
        * For everything else (list, str, number, ``None``): ``""``.
    """
    # A JSON root may be a list, string, number or null; only objects carry
    # named fields, and membership/indexing on the others can raise.
    if not isinstance(doc, dict):
        return ""
    if "body_text" in doc:
        sections = doc["body_text"]
        if not isinstance(sections, (list, tuple)):
            return ""
        # Skip malformed entries instead of failing the whole document.
        return " ".join(
            section["text"]
            for section in sections
            if isinstance(section, dict) and isinstance(section.get("text"), str)
        )
    return " ".join(value for value in doc.values() if isinstance(value, str))


# 3. Splitting the QA Dataset into Training and Testing

In [None]:
def split_qa_dataset(qa_df, test_size=0.2, random_state=42):
    """Split the QA frame into train and test parts with fresh 0..n-1 indexes.

    Args:
        qa_df: Frame of QA pairs to split.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``, each with its index reset and the old index
        dropped.
    """
    parts = train_test_split(qa_df, test_size=test_size, random_state=random_state)
    train_part, test_part = (part.reset_index(drop=True) for part in parts)
    return train_part, test_part

# 4. Building the Vector Retriever for Context Matching

In [None]:
def build_retriever(df, model_name="all-MiniLM-L6-v2"):
    """Embed ``df["text"]`` and index the vectors in a flat L2 FAISS index.

    Args:
        df: Frame with a ``text`` column; missing values are embedded as "".
        model_name: SentenceTransformer model to load.

    Returns:
        ``(model, index)`` where vector ``i`` of ``index`` corresponds to row
        position ``i`` of ``df``. For an empty frame the index is created with
        the model's embedding dimension and contains no vectors.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)
    texts = df["text"].fillna("").tolist()

    if texts:
        encoded = model.encode(texts, show_progress_bar=True, batch_size=16)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(encoded, dtype=np.float32)
        dim = embeddings.shape[1]
    else:
        # Nothing to encode: take the dimension from the model so the index
        # can still be built instead of failing on embeddings.shape[1].
        dim = model.get_sentence_embedding_dimension()
        embeddings = np.empty((0, dim), dtype=np.float32)

    index = faiss.IndexFlatL2(dim)
    if len(embeddings):
        index.add(embeddings)
    return model, index

# 5. Preparing the QA Dataset for Fine-Tuning

In [None]:
def prepare_finetuning_dataset(qa_df):
    """Turn QA rows into a ``datasets.Dataset`` with a ``text`` training prompt.

    Args:
        qa_df: Frame with ``question``, ``context``, ``answer`` and
            ``context_location`` columns.

    Returns:
        Dataset holding the original columns plus ``text``, the formatted
        prompt for each row. Missing cells (None/NaN) are rendered as empty
        strings rather than raising when the context is sliced.
    """
    def _as_text(value):
        # Blank CSV cells reach here as None or NaN; NaN is the only float
        # that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def format_example(row):
        context = _as_text(row['context'])
        prompt = f"""
### Question: {_as_text(row['question'])}
### Context: {context[:2000]}
### Instructions:
Answer: {_as_text(row['answer'])}
Context Snippet: {context}
Location: {_as_text(row['context_location'])}
"""
        return {"text": prompt}

    # preserve_index=False keeps a non-default pandas index from becoming an
    # extra dataset column.
    dataset = Dataset.from_pandas(qa_df, preserve_index=False)
    dataset = dataset.map(format_example)
    return dataset

# 6. Fine-Tuning the Mistral Model with LoRA

In [None]:
# Fine-tune Mistral model
def finetune_mistral(qa_train_df, output_dir="mistral_finetuned"):
    """Fine-tune Mistral-7B-Instruct with LoRA adapters on the QA prompts.

    Args:
        qa_train_df: Training QA frame, formatted by
            ``prepare_finetuning_dataset``.
        output_dir: Directory for checkpoints, the saved adapter and tokenizer.

    Returns:
        ``(model, tokenizer)``: the PEFT-wrapped model after training and the
        tokenizer used to encode the prompts.

    NOTE(review): examples are truncated to 512 tokens while the training
    prompt places up to 2000 characters of context before the answer, so the
    answer section may be cut off for long contexts. Consider a larger
    ``max_length`` or a shorter context slice.
    """
    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # padding="max_length" needs a pad token; reuse EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

    # Apply LoRA for efficient fine-tuning
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)

    # fp16 mixed precision cannot unscale fp16 gradients, so the trainable
    # (adapter) weights are kept in fp32 while the frozen base stays in fp16.
    for param in model.parameters():
        if param.requires_grad and param.dtype != torch.float32:
            param.data = param.data.to(torch.float32)

    # Prepare dataset
    train_dataset = prepare_finetuning_dataset(qa_train_df)

    # Tokenize dataset
    def tokenize_function(examples):
        encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # Causal-LM labels are the input ids; padded positions get -100 so the
        # loss ignores them.
        encoded["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
        return encoded

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Training arguments. No evaluation strategy is passed: the default is
    # already "no", and the keyword name differs between transformers versions.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model, tokenizer


# 7. Retrieving and Generating Answers with the Fine-Tuned Model

In [None]:
# Retrieve and generate answer with fine-tuned model
def retrieve_and_answer(question, paper_df, retriever_model, index, finetuned_model, tokenizer, max_context_length=2000):
    """Retrieve the closest paper for ``question`` and generate a structured answer.

    Args:
        question: Question text.
        paper_df: Frame with ``file`` and ``text`` columns, in the row order
            used to build ``index``.
        retriever_model: Encoder exposing ``encode(list_of_str)``.
        index: Vector index exposing ``search(vectors, k)``.
        finetuned_model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer matching ``finetuned_model``.
        max_context_length: Maximum number of characters of paper text placed
            in the prompt.

    Returns:
        dict with ``paper_title``, ``answer``, ``context_snippet``,
        ``location`` and ``paper_idx``. Fields the model did not produce are
        ``"Not found"``; if the index returns no usable neighbour, every text
        field is ``"Not found"`` and the model is not called.
    """
    question_embedding = retriever_model.encode([question])[0]
    distances, indices = index.search(np.array([question_embedding]).astype(np.float32), k=1)
    paper_idx = int(indices[0][0])

    # A negative position would make iloc silently pick a row from the end of
    # the frame, so anything outside the frame is treated as no result.
    if paper_idx < 0 or paper_idx >= len(paper_df):
        return {
            "paper_title": "Not found",
            "answer": "Not found",
            "context_snippet": "Not found",
            "location": "Not found",
            "paper_idx": paper_idx
        }

    relevant_paper = paper_df.iloc[paper_idx]
    paper_text = relevant_paper["text"]
    # Non-string text (e.g. NaN) cannot be sliced; fall back to empty context.
    context = paper_text[:max_context_length] if isinstance(paper_text, str) else ""

    prompt = f"""
You are a highly capable AI assistant trained in reproductive medicine literature. Your task is to reason carefully and answer the question based on the provided academic context.

### Instructions:
- Read the context below.
- Infer and construct the most accurate answer possible.
- Identify and copy the exact sentence or paragraph that supports your answer.
- Specify its location (e.g., sentence number or paragraph number).

### Format:
Answer: <your well-reasoned answer>
Context Snippet: <exact supporting sentence or paragraph from the context>
Location: <where the snippet is found — e.g., "2nd sentence", "3rd paragraph", etc.>

### Input:
Question: {question}
Context: {context}
"""

    # Follow the model's own device instead of assuming CUDA is present.
    device = getattr(finetuned_model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Leave training mode so dropout does not perturb generation.
    finetuned_model.eval()

    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(device)
    prompt_length = inputs["input_ids"].shape[1]
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    # Greedy decoding: the former temperature argument had no effect because
    # sampling was never enabled, so this keeps the same decoding behaviour
    # explicitly and reproducibly.
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=500,
        do_sample=False,
        pad_token_id=pad_token_id
    )
    # The returned sequence starts with the prompt tokens; decode only the
    # continuation so the "### Format" placeholders are never parsed as output.
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    answer, context_snippet, location = _parse_structured_response(response_text)

    return {
        "paper_title": relevant_paper["file"],
        "answer": answer,
        "context_snippet": context_snippet,
        "location": location,
        "paper_idx": paper_idx
    }


def _parse_structured_response(response_text):
    """Pull the Answer / Context Snippet / Location lines out of generated text.

    The first occurrence of each field wins, so text the model keeps
    generating after its initial reply cannot overwrite it. Missing fields
    are returned as ``"Not found"``.
    """
    found = {"answer:": None, "context snippet:": None, "location:": None}
    for line in response_text.split("\n"):
        stripped = line.strip()
        lowered = stripped.lower()
        for prefix in found:
            if lowered.startswith(prefix):
                if found[prefix] is None:
                    found[prefix] = stripped.split(":", 1)[1].strip()
                break
    return tuple(value if value is not None else "Not found" for value in found.values())


# 8. Evaluating Model Predictions

In [None]:
# Evaluate predictions against true labels
def evaluate_predictions(test_df, paper_df, retriever_model, index, finetuned_model, tokenizer):
    """Run the retrieve-and-answer pipeline over the test set and score it.

    Args:
        test_df: Frame with ``question``, ``paper``, ``answer``, ``context``
            and ``context_location`` columns.
        paper_df, retriever_model, index, finetuned_model, tokenizer:
            Forwarded unchanged to ``retrieve_and_answer``.

    Returns:
        pandas.DataFrame with one row per test question holding the predicted
        and true values plus ``paper_correct`` and the exact-match scores for
        answer, context and location. The expected columns are present even
        when ``test_df`` is empty.

    NOTE(review): ``paper_correct`` compares the predicted ``paper_title``
    with ``row["paper"]`` verbatim. Confirm both use the same naming
    (for example with or without a file extension).
    """
    result_columns = [
        "question",
        "pred_paper", "true_paper", "paper_correct",
        "pred_answer", "true_answer", "answer_em",
        "pred_context", "true_context", "context_em",
        "pred_location", "true_location", "location_em",
    ]
    results = []
    exact_match = evaluate.load("exact_match")

    def score(prediction, reference):
        # The metric only accepts strings; references read from CSV may be
        # numeric or NaN, so both sides are coerced the same way.
        return exact_match.compute(predictions=[str(prediction)], references=[str(reference)])["exact_match"]

    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating test set"):
        question = row["question"]
        true_labels = {
            "paper_title": row["paper"],
            "answer": row["answer"],
            "context_snippet": row["context"],
            "location": row["context_location"]
        }

        pred = retrieve_and_answer(question, paper_df, retriever_model, index, finetuned_model, tokenizer)

        # Evaluate
        paper_correct = pred["paper_title"] == true_labels["paper_title"]
        answer_em = score(pred["answer"], true_labels["answer"])
        context_em = score(pred["context_snippet"], true_labels["context_snippet"])
        location_em = score(pred["location"], true_labels["location"])

        results.append({
            "question": question,
            "pred_paper": pred["paper_title"],
            "true_paper": true_labels["paper_title"],
            "paper_correct": paper_correct,
            "pred_answer": pred["answer"],
            "true_answer": true_labels["answer"],
            "answer_em": answer_em,
            "pred_context": pred["context_snippet"],
            "true_context": true_labels["context_snippet"],
            "context_em": context_em,
            "pred_location": pred["location"],
            "true_location": true_labels["location"],
            "location_em": location_em
        })

    results_df = pd.DataFrame(results, columns=result_columns)
    if results_df.empty:
        # No rows means no meaningful averages; skip the summary.
        print("\nEvaluation Summary: no test examples to evaluate.")
        return results_df

    print("\nEvaluation Summary:")
    print(f"Paper Retrieval Accuracy: {results_df['paper_correct'].mean():.4f}")
    print(f"Answer Exact Match: {results_df['answer_em'].mean():.4f}")
    print(f"Context Exact Match: {results_df['context_em'].mean():.4f}")
    print(f"Location Exact Match: {results_df['location_em'].mean():.4f}")

    return results_df


# 9. Main Execution

In [None]:

if __name__ == "__main__":
    # End-to-end run: load the papers, load and split the QA pairs, fine-tune
    # the generator, index the papers, then score the held-out questions.

    print("Loading paper dataset...")
    paper_df = load_papers()
    print(f"Loaded {len(paper_df)} papers.")
    
    print("Loading QA dataset...")
    qa_df = pd.read_csv("/kaggle/input/assignmentqa/train_data(2).csv")  
    print(f"Loaded {len(qa_df)} QA pairs.")
    
    # Hold out part of the QA pairs so evaluation uses questions the model was
    # not trained on.
    print("Splitting QA dataset...")
    qa_train_df, qa_test_df = split_qa_dataset(qa_df)
    print(f"Training set: {len(qa_train_df)} QA pairs")
    print(f"Test set: {len(qa_test_df)} QA pairs")
    
    print("Fine-tuning Mistral model...")
    finetuned_model, tokenizer = finetune_mistral(qa_train_df)
   
    # The index is built over paper_df, so positions returned by the retriever
    # map back to rows of this same frame.
    print("Building retriever...")
    retriever_model, index = build_retriever(paper_df)
 
    print("Evaluating on test set...")
    results_df = evaluate_predictions(qa_test_df, paper_df, retriever_model, index, finetuned_model, tokenizer)
   
    # Persist the per-question results for later inspection.
    results_df.to_csv("evaluation_results.csv", index=False)
    print("Results saved to evaluation_results.csv")