# Setup and Imports

In [None]:
import pandas as pd
import json
import os
import uuid
import time
import requests
import re
from sklearn.model_selection import train_test_split
from kaggle_secrets import UserSecretsClient
from datetime import datetime, timedelta


# Define Paths and Constants

In [None]:
# Directory that load_papers scans for the input paper JSON files.
INPUT_DIR = "/kaggle/input/assignementdataset"
# Directory receiving per-paper QA caches, saved prompt inputs, raw API
# responses and the skipped-papers log.
CACHE_DIR = "/kaggle/working/qa_cache/"
os.makedirs(CACHE_DIR, exist_ok=True)  # created up front; no error if it already exists

# The API key is fetched from Kaggle secrets instead of being hard-coded.
user_secrets = UserSecretsClient()
OPENROUTER_API_KEY = user_secrets.get_secret("OPENROUTER_API_KEY_Mixtral")
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model identifiers; generate_qa_with_openrouter only uses the first entry.
MODEL_CONFIG = ["mistralai/mixtral-8x7b-instruct"]

# Client-side throttle: maximum number of requests per rolling minute.
RATE_LIMIT_RPM = 30
# Timestamps of recent requests; read and rebound by enforce_rate_limit.
request_timestamps = []


# Data Loading Utilities

In [None]:
def load_papers(directory):
    """Load every ``*.json`` paper found directly inside *directory*.

    Files are visited in sorted filename order so that the returned list (and
    anything derived from its order, such as a seeded train/test split) is
    reproducible; ``os.listdir`` alone returns entries in arbitrary order.

    Each loaded paper gets a ``"paper_id"`` key derived from its filename with
    the ``.json`` and ``.grobid.tei`` fragments removed.

    Files that cannot be decoded, or whose top-level JSON value is not an
    object, are reported on stdout and skipped.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of dicts, one per successfully loaded paper.
    """
    papers = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        path = os.path.join(directory, filename)
        # Keep the try body limited to the read/parse step that can fail.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                paper = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error loading {filename}: {e}")
            continue
        # A top-level list/scalar cannot carry a "paper_id" key; skip it
        # instead of crashing on the assignment below.
        if not isinstance(paper, dict):
            print(f"Error loading {filename}: expected a JSON object, got {type(paper).__name__}")
            continue
        paper["paper_id"] = filename.replace(".json", "").replace(".grobid.tei", "")
        papers.append(paper)
    return papers

def load_cached_qa(paper_id):
    """Return the cached QA pairs for *paper_id*, or ``None`` if unavailable.

    The cache lives at ``<CACHE_DIR>/<paper_id>.json``. A missing file is a
    normal cache miss. A file that exists but cannot be parsed (for example
    one truncated by an interrupted run) is reported and also treated as a
    miss, so a single bad cache entry does not abort the whole pipeline.

    Args:
        paper_id: Identifier used to build the cache filename.

    Returns:
        The decoded JSON content of the cache file, or ``None``.
    """
    cache_file = os.path.join(CACHE_DIR, f"{paper_id}.json")
    try:
        with open(cache_file, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Ignoring unreadable cache file {cache_file}: {e}")
        return None

def save_cached_qa(paper_id, qa_pairs):
    """Serialize *qa_pairs* as JSON into ``<CACHE_DIR>/<paper_id>.json``.

    Any existing cache file for the same paper is overwritten.
    """
    destination = os.path.join(CACHE_DIR, "{}.json".format(paper_id))
    with open(destination, mode='w') as handle:
        json.dump(qa_pairs, handle)


# Input/Output Debugging Helpers

In [None]:
def save_paper_input(paper_id, paper_data):
    """Dump *paper_data* as indented JSON to ``<CACHE_DIR>/<paper_id>_input.json``.

    Intended as a debugging aid: it records the data extracted from a paper.
    """
    target_name = "{}_input.json".format(paper_id)
    target_path = os.path.join(CACHE_DIR, target_name)
    with open(target_path, mode='w') as handle:
        json.dump(paper_data, handle, indent=2)


# Text Cleaning & Extraction Utilities

In [None]:
def clean_response(content):
    """Normalize raw model output into a single clean line of text.

    Steps, in order:
      1. Whitespace control characters (tab, newline, carriage return,
         vertical tab, form feed) become spaces. This must happen before the
         control-character strip, otherwise the strip deletes them outright
         and words separated only by a newline are glued together.
      2. Remaining C0/C1 control characters and DEL are removed.
      3. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        content: The text to clean.

    Returns:
        The cleaned string.
    """
    content = re.sub(r'[\t\n\r\x0b\x0c]', ' ', content)
    content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)
    content = re.sub(r'\s+', ' ', content)
    return content.strip()

def extract_qa_from_string(content):
    """Best-effort extraction of QA objects from loosely formatted model output.

    The function looks for JSON embedded in surrounding text. Candidates are
    decoded with ``json.JSONDecoder.raw_decode`` starting at each opening
    bracket, so nested objects and brackets inside string values are handled
    correctly (a regex cannot balance brackets).

    Search order:
      1. The first position starting an array of objects (``[`` followed by
         ``{``) that decodes successfully; the decoded list is returned.
      2. The first ``{`` that decodes to an object containing a
         ``"question"`` key; it is returned wrapped in a one-element list.

    Already-decoded input is tolerated: a list is returned unchanged and a
    dict is wrapped if it has a ``"question"`` key.

    Args:
        content: Text that may contain JSON, or an already-decoded list/dict.

    Returns:
        A list of extracted items; empty when nothing usable is found.
    """
    if isinstance(content, list):
        return content
    if isinstance(content, dict):
        return [content] if "question" in content else []
    if not isinstance(content, str):
        return []

    decoder = json.JSONDecoder()

    # Pass 1: an array of objects anywhere in the text.
    for match in re.finditer(r'\[\s*\{', content):
        try:
            parsed, _ = decoder.raw_decode(content, match.start())
        except json.JSONDecodeError:
            continue  # not valid JSON from here; try the next candidate
        if isinstance(parsed, list):
            return parsed

    # Pass 2: a single QA object anywhere in the text.
    for match in re.finditer(r'\{', content):
        try:
            parsed, _ = decoder.raw_decode(content, match.start())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict) and "question" in parsed:
            return [parsed]

    return []


# Fallback QA Generator

In [None]:
def generate_fallback_qa(title, abstract):
    """Build five templated QA pairs when no model-generated pairs are available.

    Every pair uses the same answer (first 200 characters of the abstract)
    and context (first 100 characters), with ``"Abstract"`` as the context
    location; only the question text differs.

    If the abstract is missing — ``None``, not a string, empty/whitespace, or
    the ``"No abstract"`` placeholder — a generic sentence built from the
    title is used instead, so answers are never empty.

    Args:
        title: Paper title interpolated into each question.
        abstract: Abstract text, or a missing-value marker as described above.

    Returns:
        A list of five dicts with keys ``question``, ``answer``,
        ``context_location`` and ``context``.
    """
    if not isinstance(abstract, str) or not abstract.strip() or abstract == "No abstract":
        abstract = f"The study titled '{title}' investigates medical research topics."
    questions = [
        f"What is the primary objective of the study titled '{title}'?",
        f"What key issue does the study titled '{title}' address?",
        f"What is a main finding of the study titled '{title}'?",
        f"What methodology is used in the study titled '{title}'?",
        f"What are the implications of the study titled '{title}'?",
    ]
    return [
        {
            "question": q,
            "answer": abstract[:200],
            "context_location": "Abstract",
            "context": abstract[:100],
        }
        for q in questions
    ]


# Rate Limiting Logic

In [None]:
def enforce_rate_limit(paper_id):
    """Block until another request fits within ``RATE_LIMIT_RPM`` per minute.

    Keeps a rolling one-minute window of request timestamps in the module
    global ``request_timestamps``. When the window is full, sleeps until the
    oldest entry ages out, then records the current request.

    The timestamp recorded is taken *after* any sleep. Recording the
    pre-sleep time would make the request look older than it is and let
    later calls exceed the limit.

    Args:
        paper_id: Only used in the progress message printed while waiting.
    """
    global request_timestamps
    window = timedelta(minutes=1)
    now = datetime.now()
    request_timestamps = [t for t in request_timestamps if now - t < window]
    if len(request_timestamps) >= RATE_LIMIT_RPM:
        wait_time = 60 - (now - request_timestamps[0]).total_seconds()
        if wait_time > 0:
            print(f"Rate limit reached for {paper_id}. Waiting for {wait_time:.1f} seconds.")
            time.sleep(wait_time)
        # Refresh the clock after sleeping, drop the entry we waited out and
        # anything else that expired in the meantime.
        now = datetime.now()
        request_timestamps = [t for t in request_timestamps[1:] if now - t < window]
    request_timestamps.append(now)


# Core QA Generation Function

In [None]:
def _parse_qa_payload(content):
    """Turn cleaned model output into a list of candidate QA objects (possibly empty)."""
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # The JSON may be wrapped in extra text; try to salvage it.
        return extract_qa_from_string(content)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        wrapped = parsed.get("questions")
        if isinstance(wrapped, list):
            return wrapped
        if isinstance(wrapped, str):
            return extract_qa_from_string(wrapped)
        # Some other wrapper key: take the first list that holds objects.
        for value in parsed.values():
            if isinstance(value, list) and any(isinstance(v, dict) for v in value):
                return value
        if "question" in parsed:
            return [parsed]
    return extract_qa_from_string(content)


def generate_qa_with_openrouter(paper, paper_index, max_retries=3, base_delay=60, request_timeout=120):
    """Generate exactly five unique QA pairs for *paper* via the OpenRouter API.

    Flow:
      1. Return the cached result if one with five entries exists.
      2. Extract title, abstract, keywords and section snippets from *paper*
         and save them for debugging.
      3. Try three progressively simpler prompts, each up to *max_retries*
         times, validating that the response yields five unique pairs with
         the keys ``question``, ``answer``, ``context_location``, ``context``.
         Every raw response is logged to ``CACHE_DIR``.
      4. If everything fails, fall back to templated pairs.

    Both successful and fallback results are written to the cache.

    Args:
        paper: Paper dict; must contain ``"paper_id"``. ``"title"``,
            ``"abstract"`` and ``"pdf_parse"`` are optional.
        paper_index: Zero-based position of the paper, used in progress logs.
        max_retries: Attempts per prompt variant.
        base_delay: Base wait in seconds after an HTTP 429; doubled per attempt.
        request_timeout: Timeout in seconds passed to ``requests.post`` so a
            stalled connection cannot hang the run.

    Returns:
        A list of five QA dicts.
    """
    # NOTE(review): the "/43" total in the progress messages is hard-coded;
    # it is only accurate for a 43-paper input set.
    cached_qa = load_cached_qa(paper["paper_id"])
    if cached_qa and len(cached_qa) == 5:
        print(f"Loaded cached QA for {paper['paper_id']} ({paper_index+1}/43 papers processed)")
        return cached_qa

    title = paper.get("title") or "No title"
    abstract = paper.get("abstract")
    if not isinstance(abstract, str) or not abstract.strip():
        abstract = "No abstract"
    pdf_parse = paper.get("pdf_parse")
    if not isinstance(pdf_parse, dict):
        pdf_parse = {}
    if "abstract" in pdf_parse:
        parsed_abstract = " ".join([
            item["text"] for item in (pdf_parse["abstract"] or [])
            if isinstance(item, dict) and isinstance(item.get("text"), str)
        ])[:500]
        # Only override when the parsed abstract has content; an empty
        # one would otherwise wipe out a usable top-level abstract.
        if parsed_abstract.strip():
            abstract = parsed_abstract
    keywords = ", ".join(str(k) for k in (pdf_parse.get("keywords") or []))[:100]
    sections = pdf_parse.get("body_text") or []
    # NOTE(review): entries sharing a section name overwrite each other here,
    # so only the last text per section name is kept — confirm this is intended.
    sections_dict = {item["section"]: item["text"] for item in sections if isinstance(item, dict) and "section" in item and "text" in item}
    sections_str = "\n".join([f"{k}: {v[:400]}" for k, v in sections_dict.items()])[:1500]

    save_paper_input(paper["paper_id"], {"title": title, "abstract": abstract, "keywords": keywords, "sections": sections_dict})

    prompt = f"""You are an expert in medical research. Reason through the paper’s content to identify key insights, focusing on objectives, methods, findings, limitations, and implications. Generate exactly 5 unique question-answer pairs for the given medical research paper. Each pair MUST include:
- A unique question related to the paper’s content (e.g., objectives, methods, findings, limitations, implications).
- An answer based on the paper, derived from your reasoning.
- The context location (e.g., "Abstract", "Objectives", "Search methods").
- The context text used for the answer.

Paper Data:
Title: {title}
Abstract: {abstract}
Keywords: {keywords}
Sections:
{sections_str}

Output MUST be a valid JSON array with exactly 5 objects, containing only the specified fields. Return only the JSON array, with no additional text, keys like "questions", or invalid characters:
[
  {{
    "question": "Question text",
    "answer": "Answer text",
    "context_location": "Section name",
    "context": "Context text"
  }},
  ...
]
"""

    fallback_prompt = f"""You are an expert in medical research. Reason through the abstract to identify key insights, focusing on objectives, findings, and implications. Generate exactly 5 unique question-answer pairs for the medical paper titled "{title}" with abstract: {abstract}. Each pair must include:
- A unique question about the paper’s content (e.g., objectives, findings, implications).
- An answer from the abstract, based on your reasoning.
- Context location: "Abstract".
- Context text from the abstract.

Return only a valid JSON array with exactly 5 objects, no extra text or invalid characters:
[
  {{
    "question": "Question text",
    "answer": "Answer text",
    "context_location": "Abstract",
    "context": "Context text"
  }},
  ...
]
"""

    ultra_simple_prompt = f"""You are an expert in medical research. Reason through the abstract to identify key insights. Generate exactly 5 unique question-answer pairs for the paper titled "{title}" with abstract: {abstract}. Each pair must include:
- A unique question about the paper’s content.
- An answer from the abstract, based on your reasoning.
- Context location: "Abstract".
- Context text from the abstract.

Return: [
  {{"question": "Question 1", "answer": "Answer 1", "context_location": "Abstract", "context": "Context text"}},
  {{"question": "Question 2", "answer": "Answer 2", "context_location": "Abstract", "context": "Context text"}},
  {{"question": "Question 3", "answer": "Answer 3", "context_location": "Abstract", "context": "Context text"}},
  {{"question": "Question 4", "answer": "Answer 4", "context_location": "Abstract", "context": "Context text"}},
  {{"question": "Question 5", "answer": "Answer 5", "context_location": "Abstract", "context": "Context text"}}
]
"""

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    model = MODEL_CONFIG[0]
    required_keys = ["question", "answer", "context_location", "context"]
    for prompt_version, current_prompt in enumerate([prompt, fallback_prompt, ultra_simple_prompt], 1):
        # NOTE(review): "json_object" mode presumably makes the model return an
        # object rather than the bare array the prompt asks for; wrapped
        # arrays are unwrapped by _parse_qa_payload — confirm against the API docs.
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": current_prompt}],
            "response_format": {"type": "json_object"},
            "temperature": 0.7,
            "max_tokens": 5000
        }

        for attempt in range(max_retries):
            enforce_rate_limit(paper["paper_id"])
            # Reset per attempt so the error handler never inspects a
            # response left over from an earlier attempt.
            response = None
            try:
                response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=request_timeout)
                response.raise_for_status()
                response_data = response.json()

                log_file = os.path.join(CACHE_DIR, f"{paper['paper_id']}_{model.replace('/', '-')}_prompt{prompt_version}_attempt{attempt+1}_raw_response.json")
                with open(log_file, 'w') as f:
                    json.dump({
                        "status_code": response.status_code,
                        "headers": dict(response.headers),
                        "body": response_data
                    }, f, indent=2)

                # Tolerate a missing/empty "choices" list and null fields.
                choices = response_data.get("choices") or [{}]
                first_choice = choices[0] if isinstance(choices[0], dict) else {}
                content = (first_choice.get("message") or {}).get("content") or ""
                if not content:
                    print(f"Empty content in response for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1}, status {response.status_code} ({paper_index+1}/43 papers processed)")
                    continue

                usage = response_data.get("usage") or {}
                print(f"Tokens for {paper['paper_id']} with {model}: prompt={usage.get('prompt_tokens', 0)}, completion={usage.get('completion_tokens', 0)} ({paper_index+1}/43 papers processed)")

                content = clean_response(content)
                qa_pairs = _parse_qa_payload(content)

                if not isinstance(qa_pairs, list) or len(qa_pairs) == 0:
                    print(f"Invalid QA pairs for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1}: {qa_pairs} ({paper_index+1}/43 papers processed)")
                    continue

                valid_qa = [
                    qa for qa in qa_pairs
                    if isinstance(qa, dict) and all(k in qa for k in required_keys)
                ]
                if len(valid_qa) != 5:
                    print(f"Expected 5 QA pairs, got {len(valid_qa)} for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1} ({paper_index+1}/43 papers processed)")
                    continue

                seen_questions = set()
                unique_qa = []
                for qa in valid_qa:
                    if qa["question"] not in seen_questions:
                        seen_questions.add(qa["question"])
                        unique_qa.append(qa)

                if len(unique_qa) != 5:
                    print(f"Expected 5 unique QA pairs, got {len(unique_qa)} for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1} ({paper_index+1}/43 papers processed)")
                    continue

                save_cached_qa(paper["paper_id"], unique_qa)
                print(f"Success with {model}, prompt {prompt_version} for {paper['paper_id']}: 5 QA pairs ({paper_index+1}/43 papers processed)")
                return unique_qa
            except requests.exceptions.RequestException as e:
                # Prefer the response attached to the exception. Compare with
                # "is None": a Response object is falsy for 4xx/5xx statuses.
                failed = getattr(e, "response", None)
                if failed is None:
                    failed = response
                status_code = failed.status_code if failed is not None else None
                if status_code == 429 and attempt < max_retries - 1:
                    wait_time = base_delay * (2 ** attempt)
                    retry_after = failed.headers.get("Retry-After")
                    if retry_after:
                        try:
                            wait_time = max(wait_time, int(retry_after))
                        except ValueError:
                            # Retry-After may be an HTTP date rather than
                            # seconds; keep the exponential backoff value.
                            pass
                    print(f"429 rate limit exceeded for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1}. Waiting for {wait_time} seconds at {datetime.now().strftime('%H:%M:%S')} ({paper_index+1}/43 papers processed)")
                    time.sleep(wait_time)
                else:
                    print(f"Request error for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1}, status {status_code}: {e} ({paper_index+1}/43 papers processed)")
                    continue
            except json.JSONDecodeError as e:
                # Reached when the HTTP body itself is not valid JSON.
                print(f"JSON error for {paper['paper_id']} with {model}, prompt {prompt_version}, attempt {attempt+1}: {e} ({paper_index+1}/43 papers processed)")
                continue
            time.sleep(2)

    print(f"All attempts failed for {paper['paper_id']} ({paper_index+1}/43 papers processed)")
    qa_pairs = generate_fallback_qa(title, abstract)
    save_cached_qa(paper["paper_id"], qa_pairs)
    print(f"Generated 5 fallback QA pairs for {paper['paper_id']} ({paper_index+1}/43 papers processed)")
    return qa_pairs


# Execution Loop

In [None]:
def process_papers(batch_size=15, output_dir="/kaggle/working"):
    """Generate QA pairs for every paper and write train/test CSV files.

    Papers from ``INPUT_DIR`` are processed in batches of *batch_size*, with a
    5-second pause after each paper and a 60-second pause between batches.
    Papers that do not yield exactly five QA pairs are skipped and listed in
    ``<CACHE_DIR>/skipped_papers.txt``.

    The collected rows are split 80/20, stratified by paper where possible,
    and written to ``train_data.csv`` and ``test_data.csv`` in *output_dir*.
    The training file contains only the training portion of the split, so no
    test row appears in the training data.

    Args:
        batch_size: Number of papers per batch.
        output_dir: Directory that receives the two CSV files.

    Returns:
        ``(train_df, test_df)``. Both are empty DataFrames when no paper
        produced usable QA pairs.
    """
    papers = load_papers(INPUT_DIR)
    total = len(papers)
    print(f"Loaded {total} papers")

    all_qa_pairs = []
    skipped_papers = []
    for i in range(0, total, batch_size):
        batch = papers[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}: {len(batch)} papers")
        for j, paper in enumerate(batch):
            paper_index = i + j
            print(f"Processing {paper['paper_id']} ({paper_index+1}/{total} papers processed)")
            qa_pairs = generate_qa_with_openrouter(paper, paper_index)
            if len(qa_pairs) != 5:
                print(f"Failed to generate exactly 5 QA pairs for {paper['paper_id']} ({paper_index+1}/{total} papers processed)")
                skipped_papers.append(paper["paper_id"])
                continue
            for qa in qa_pairs:
                all_qa_pairs.append({
                    "question_id": str(uuid.uuid4()),
                    "question": qa["question"],
                    "answer": qa["answer"],
                    "context_location": qa["context_location"],
                    "context": qa["context"],
                    "paper": paper["paper_id"]
                })
            time.sleep(5)  # Reduced per-paper delay
        # Batch delay (skipped after the final batch)
        if i + batch_size < total:
            print(f"Batch delay for 60 seconds at {datetime.now().strftime('%H:%M:%S')}")
            time.sleep(60)

    # Log skipped papers
    if skipped_papers:
        with open(os.path.join(CACHE_DIR, "skipped_papers.txt"), "w") as f:
            f.write("\n".join(skipped_papers))
        print(f"Skipped papers: {skipped_papers}")

    # An empty record list produces a DataFrame without a "paper" column, so
    # bail out before indexing it.
    if not all_qa_pairs:
        print("No papers with exactly 5 QA pairs. Exiting.")
        return pd.DataFrame(), pd.DataFrame()

    qa_df = pd.DataFrame(all_qa_pairs)
    paper_counts = qa_df["paper"].value_counts()
    valid_papers = paper_counts[paper_counts == 5].index  # Exactly 5 QA pairs per paper
    qa_df = qa_df[qa_df["paper"].isin(valid_papers)]

    if len(qa_df) == 0:
        print("No papers with exactly 5 QA pairs. Exiting.")
        return pd.DataFrame(), pd.DataFrame()

    try:
        train_data, test_data = train_test_split(
            qa_df,
            test_size=0.2,
            random_state=42,
            stratify=qa_df["paper"]
        )
    except ValueError as e:
        print(f"Stratification failed: {e}. Splitting without stratification.")
        train_data, test_data = train_test_split(
            qa_df,
            test_size=0.2,
            random_state=42
        )

    test_columns = ["question_id", "question", "paper", "answer", "context_location", "context"]
    test_df = test_data[test_columns].copy()

    # Write the training split, not the unsplit frame, so the test rows are
    # held out of the training file.
    os.makedirs(output_dir, exist_ok=True)
    train_data.to_csv(os.path.join(output_dir, "train_data.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "test_data.csv"), index=False)

    print(f"Completed processing {total} papers")
    return train_data, test_df



# Main Execution

In [None]:
# Execute: run the full pipeline and print a short summary of the two
# DataFrames returned by process_papers.
if __name__ == "__main__":
    train_df, test_df = process_papers()
    print("Train DataFrame shape:", train_df.shape)
    print("Test DataFrame shape:", test_df.shape)
    print("Test columns:", test_df.columns.tolist())

In [None]:
# Read train_data(2).csv from /kaggle/input/assignmentqa into a DataFrame.
train=pd.read_csv("/kaggle/input/assignmentqa/train_data(2).csv")
train  # bare expression; presumably shown as the notebook cell's output

In [None]:
# Read test_data(2).csv from /kaggle/input/assignmentqa into a DataFrame.
test=pd.read_csv("/kaggle/input/assignmentqa/test_data(2).csv")
test  # bare expression; presumably shown as the notebook cell's output