# 1. Setup & Import Libraries

In [None]:
import pandas as pd
import glob
import json
import numpy as np
import torch

from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
import requests
from sklearn.model_selection import train_test_split
from pathlib import Path


# 2. OpenRouter API Configuration


In [None]:

import os

# The API key is read from the environment so that the secret never lives in
# the notebook source. Set OPENROUTER_API_KEY before running this cell (for
# example through the platform's secret manager or `export OPENROUTER_API_KEY=...`).
# NOTE(review): a key was previously committed in this file; treat it as leaked
# and revoke it.
# When the variable is unset the key is an empty string and the API call in
# retrieve_and_answer will be rejected by the server.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"


# 3. Load and Process Papers

In [None]:

def load_papers(directory="/kaggle/input/assignementdataset"):
    """Load every ``*.json`` file in *directory* into a DataFrame.

    Each file is parsed as JSON and flattened to one string with
    ``extract_text_from_json``. Files that cannot be decoded or parsed are
    reported on stdout and skipped.

    Args:
        directory: Folder that is searched (non-recursively) for JSON files.

    Returns:
        A DataFrame with the columns ``file`` (file name) and ``text``
        (extracted text), one row per successfully parsed file. Both columns
        exist even when no file could be loaded, so callers can always
        access ``df["text"]``.
    """
    # Sort so that row order (and therefore the seeded train/test split) does
    # not depend on the order in which the file system lists the directory.
    json_files = sorted(f for f in Path(directory).glob("*.json") if f.is_file())
    all_data = []
    for file in tqdm(json_files, desc="Loading JSON files"):
        try:
            # Decode explicitly instead of relying on the locale default;
            # "utf-8-sig" also accepts files that start with a byte-order mark.
            with open(file, "r", encoding="utf-8-sig") as f:
                doc = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error parsing {file.name}: {e}")
            continue
        all_data.append({"file": file.name, "text": extract_text_from_json(doc)})
    return pd.DataFrame(all_data, columns=["file", "text"])

def extract_text_from_json(doc):
    """Flatten a parsed JSON document into one space-joined string.

    Args:
        doc: The object produced by ``json.load`` for one paper.

    Returns:
        * If *doc* is a dict whose ``"body_text"`` value is a list (or
          tuple): the ``"text"`` strings of its dict entries, joined by
          single spaces. Entries that are not dicts or whose ``"text"`` is
          not a string are ignored.
        * Otherwise, if *doc* is a dict: all of its top-level string values,
          joined by single spaces.
        * ``""`` for anything that is not a dict.
    """
    # Check the type first: `"body_text" in doc` is also valid for strings and
    # lists, where the subsequent key lookup would raise TypeError.
    if not isinstance(doc, dict):
        return ""

    sections = doc.get("body_text")
    if isinstance(sections, (list, tuple)):
        return " ".join(
            section["text"]
            for section in sections
            if isinstance(section, dict) and isinstance(section.get("text"), str)
        )

    # No usable section list: fall back to the document's own string fields.
    return " ".join(value for value in doc.values() if isinstance(value, str))


# 4. Split Dataset Into Train and Test

In [None]:

def split_dataset(df, test_size=0.2, random_state=42):
    """Randomly partition *df* into a training and a test frame.

    The split is delegated to ``train_test_split`` with the given
    ``test_size`` and ``random_state``; both resulting frames get a fresh
    0-based index (the old index is dropped).

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    train_part, test_part = (part.reset_index(drop=True) for part in parts)
    return train_part, test_part


# 5. Build Sentence Transformer Retriever

In [None]:

def build_retriever(df, model_name="all-MiniLM-L6-v2"):
    """Embed the ``text`` column of *df* and index the vectors with FAISS.

    Args:
        df: DataFrame with a ``text`` column; missing values are embedded as
            empty strings.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A ``(model, index)`` tuple: the loaded sentence-transformer and a
        ``faiss.IndexFlatL2`` holding one vector per row of *df*, in row
        order.

    Raises:
        ValueError: If *df* has no rows. Without any embeddings the vector
            dimension needed to create the index is unknown.
    """
    texts = df["text"].fillna("").tolist()
    # Fail before the (slow) model load when there is nothing to index.
    if not texts:
        raise ValueError("Cannot build a retriever from an empty DataFrame.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=16)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return model, index


# 6. Retrieval and Answer Generation

In [None]:

def _parse_llm_response(response_text):
    """Split the model reply into its answer / context_snippet / location fields."""
    labels = {
        "answer:": "answer",
        "context snippet:": "context_snippet",
        "location:": "location",
    }
    collected = {field: [] for field in labels.values()}
    current = None
    for raw_line in response_text.split("\n"):
        line = raw_line.strip()
        lowered = line.lower()
        for label, field in labels.items():
            if lowered.startswith(label):
                current = field
                # A repeated label restarts the field, so the last one wins.
                collected[field] = [line.split(":", 1)[1].strip()]
                break
        else:
            # Unlabelled line: it continues the field that is currently open,
            # so multi-line answers and snippets are not cut after one line.
            if current is not None and line:
                collected[current].append(line)
    return {
        field: " ".join(part for part in parts if part) or "Not found"
        for field, parts in collected.items()
    }


def retrieve_and_answer(question, df, model, index, max_context_length=2000,
                        request_timeout=60):
    """Retrieve the closest paper for *question* and ask the LLM to answer from it.

    The question is embedded with *model*, the nearest vector is looked up in
    *index*, and the first ``max_context_length`` characters of the matching
    row's text are sent to the OpenRouter chat-completions endpoint.

    Args:
        question: The user's question.
        df: DataFrame with ``file`` and ``text`` columns. The position
            returned by *index* is used with ``df.iloc``, so *df* must be the
            frame (in the same row order) the index was built from.
        model: Object with an ``encode(list_of_str)`` method.
        index: Object with a FAISS-style ``search(vectors, k=...)`` method.
        max_context_length: Maximum number of characters of paper text that
            are placed in the prompt.
        request_timeout: Seconds to wait for the HTTP request before giving
            up.

    Returns:
        A dict with the keys ``paper_title``, ``answer``, ``context_snippet``,
        ``location`` and ``paper_idx`` (a plain ``int``).
        * When the index returns no neighbour, the text fields are
          ``"Not found"`` and ``paper_idx`` is ``-1``.
        * When the API call fails, the failure is printed and ``answer``,
          ``context_snippet`` and ``location`` are ``"Error"``.
        * A field that the reply does not contain is ``"Not found"``.
    """
    question_embedding = model.encode([question])[0]
    query = np.asarray([question_embedding], dtype=np.float32)
    _, indices = index.search(query, k=1)
    paper_idx = int(indices[0][0])

    # FAISS reports -1 when it has no neighbour to return (e.g. empty index);
    # used with iloc that would silently select the last row.
    if paper_idx < 0:
        return {
            "paper_title": "Not found",
            "answer": "Not found",
            "context_snippet": "Not found",
            "location": "Not found",
            "paper_idx": paper_idx
        }

    relevant_paper = df.iloc[paper_idx]
    paper_text = relevant_paper["text"]
    # Missing text (NaN) cannot be sliced; treat it as an empty context.
    context = paper_text[:max_context_length] if isinstance(paper_text, str) else ""

    # Enhanced prompt for better reasoning
    prompt = f"""
You are a highly capable AI assistant trained in reproductive medicine literature. Your task is to reason carefully and answer the question based on the provided academic context. Think step-by-step if needed.

### Instructions:
- Read the context below.
- Infer and construct the most accurate answer possible, even if it's not word-for-word.
- Then identify and copy the exact sentence or paragraph that supports your answer.
- Finally, specify its location (e.g., sentence number or paragraph number).

### Format:
Answer: <your well-reasoned answer>
Context Snippet: <exact supporting sentence or paragraph from the context>
Location: <where the snippet is found — e.g., "2nd sentence", "3rd paragraph", etc.>

### Input:
Question: {question}
Context: {context}
"""

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "mistralai/mixtral-8x7b-instruct",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 500
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(
            OPENROUTER_API_URL,
            headers=headers,
            json=payload,
            timeout=request_timeout,
        )
        response.raise_for_status()
        response_text = response.json()["choices"][0]["message"]["content"]
        if not isinstance(response_text, str):
            raise ValueError("response contained no text content")
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Covers network/HTTP errors, invalid JSON and an unexpected reply shape.
        print(f"API request failed: {e}")
        return {
            "paper_title": relevant_paper["file"],
            "answer": "Error",
            "context_snippet": "Error",
            "location": "Error",
            "paper_idx": paper_idx
        }

    # Parse the LLM output
    parsed = _parse_llm_response(response_text)

    return {
        "paper_title": relevant_paper["file"],
        "answer": parsed["answer"],
        "context_snippet": parsed["context_snippet"],
        "location": parsed["location"],
        "paper_idx": paper_idx
    }


# 7. Evaluate the Retrieval

In [None]:

def evaluate_retrieval(test_df, model, index, question, expected_idx=None):
    """Run one question through ``retrieve_and_answer`` and print a report.

    Args:
        test_df: Frame handed to ``retrieve_and_answer`` as its ``df``.
            NOTE(review): that function looks rows up by the position the
            index returns, so *index* presumably has to be built from this
            same frame - confirm at the call site.
        model: Embedding model forwarded unchanged.
        index: Vector index forwarded unchanged.
        question: Question to evaluate.
        expected_idx: Optional row position of the paper that should be
            retrieved; when given, a "Correct Retrieval" line is printed.

    Returns:
        The result dict produced by ``retrieve_and_answer``.
    """
    result = retrieve_and_answer(question, test_df, model, index)

    report = [
        "\nEvaluation Results:",
        f"Question: {question}",
        f"Retrieved Paper: {result['paper_title']} (Index: {result['paper_idx']})",
        f"Answer: {result['answer']}",
        f"Context Snippet: {result['context_snippet']}",
        f"Location: {result['location']}",
    ]
    if expected_idx is not None:
        report.append(f"Correct Retrieval: {result['paper_idx'] == expected_idx}")

    for report_line in report:
        print(report_line)
    return result


# 8. Main Execution - Loading Data and Running the Model

In [None]:

# Interactive driver: load the papers, split them, index the training part and
# answer one question typed by the user.
# The names bound here (df, train_df, test_df, model, index, ...) are ordinary
# module-level names; the cell that follows reuses train_df, model and index,
# so this block must run first and must keep these names.
if __name__ == "__main__":
    print("Loading papers...")
    df = load_papers()
    print(f"Loaded {len(df)} papers.")

    print("Splitting dataset...")
    # test_df is created here but not used further in this block.
    train_df, test_df = split_dataset(df, test_size=0.2)
    print(f"Training set: {len(train_df)} papers")
    print(f"Test set: {len(test_df)} papers")

    print("Building retriever...")
    # The index is built from train_df only, so retrieval below must also be
    # given train_df for the returned row positions to line up.
    model, index = build_retriever(train_df)

    custom_question = input("\nEnter your question: ")
    print("Processing question...")

    result = retrieve_and_answer(custom_question, train_df, model, index)

    print("\nResults:")
    print(f"Most Relevant Paper: {result['paper_title']}")
    print(f"Answer: {result['answer']}")
    print(f"Context Snippet: {result['context_snippet']}")
    print(f"Location: {result['location']}")


# 9. Input and Running the Model

In [None]:

# Ask a fixed question against the retriever built in the previous cell
# (train_df, model and index must already exist at module level).
custom_question = "What is the effect of environmental factors on fertility rates?"
result = retrieve_and_answer(
    question=custom_question,
    df=train_df,
    model=model,
    index=index,
)

# Output results
print(
    f"Most Relevant Paper: {result['paper_title']}",
    f"Answer: {result['answer']}",
    f"Context Snippet: {result['context_snippet']}",
    f"Location: {result['location']}",
    sep="\n",
)
