# Install Required Libraries

In [None]:
%pip install ipykernel pandas numpy faiss-cpu openai python-dotenv lxml beautifulsoup4 sentence-transformers


In [None]:
%pip install kagglehub[pandas-datasets]


# Imports

In [None]:
import pandas as pd
import numpy as np
import json
import os
from typing import List, Dict, Tuple
from dotenv import load_dotenv
import re

# Vector store and embeddings
import faiss
from sentence_transformers import SentenceTransformer

# LLM
from openai import OpenAI

# Kaggle
import kagglehub
from kagglehub import KaggleDatasetAdapter

load_dotenv()


# Configuration

In [5]:
CHUNK_SIZE = 300  # Words per chunk
TOP_K = 3  # Number of chunks to retrieve
SAMPLE_SIZE = 500  # Number of movies to use (adjust for your needs)
# API key is read from the environment; the load_dotenv() call in the imports
# cell also exposes values from a local .env file to os.getenv.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Shared OpenAI client used for answer generation
client = OpenAI(api_key=OPENAI_API_KEY)


# Load & Preprocess Data

In [None]:
print("Loading movie plots dataset...")

# Download the Kaggle dataset and locate the CSV inside the returned directory
path = kagglehub.dataset_download("jrobischon/wikipedia-movie-plots")
file_path = os.path.join(path, "wiki_movie_plots_deduped.csv")

# Parse the CSV, drop rows without a title or plot, then take a reproducible
# sample of at most SAMPLE_SIZE rows and renumber the index from zero.
df = (
    pd.read_csv(file_path, encoding='latin-1', on_bad_lines='skip', engine='python')
    .dropna(subset=['Title', 'Plot'])
    .pipe(lambda frame: frame.sample(n=min(SAMPLE_SIZE, len(frame)), random_state=42))
    .reset_index(drop=True)
)

print(f"Loaded {len(df)} movies")
print(f"Sample titles: {df['Title'].iloc[:3].tolist()}")


# Chunking Function

In [None]:
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
    """Break ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whitespace-delimited (``str.split``) and each piece is re-joined
    with single spaces; the final piece may hold fewer words.
    """
    tokens = text.split()
    pieces = (
        ' '.join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    )
    # Keep only pieces that contain something other than whitespace
    return [piece for piece in pieces if piece.strip()]

# Build two parallel lists: every chunk's text, and for each chunk a record
# of which movie it came from and its position within that movie's plot.
chunks_list = []
chunk_metadata = []

for idx, row in df.iterrows():
    title, plot = row['Title'], row['Plot']
    chunks = chunk_text(plot)

    chunks_list.extend(chunks)
    chunk_metadata.extend(
        {'title': title, 'chunk_idx': chunk_idx, 'movie_idx': idx}
        for chunk_idx in range(len(chunks))
    )

print(f"Total chunks created: {len(chunks_list)}")
print(f"Sample chunk: {chunks_list[0][:150]}...")


# Embed & Store with Sentence Transformers

In [None]:
print("\nCreating embeddings with Sentence Transformers...")

# Pre-trained sentence encoder (small and quick to run)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every chunk and hold the result as a float32 array for FAISS
embeddings = np.array(
    model.encode(chunks_list, show_progress_bar=True),
    dtype='float32',
)

print(f"Embedding shape: {embeddings.shape}")
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

# Normalising the vectors in place first makes the inner-product index
# below rank results by cosine similarity.
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f"FAISS index created with {index.ntotal} vectors")

# Alias under which the retrieval function looks up the encoder
model_for_queries = model


# Retrieval Function

In [12]:
def retrieve_chunks(query: str, top_k: int = TOP_K) -> Dict:
    """Find the stored chunks most similar to ``query``.

    The query is embedded with ``model_for_queries``, L2-normalised, and
    searched against the module-level FAISS ``index``.

    Args:
        query: Natural-language question or search text.
        top_k: Maximum number of chunks to return.

    Returns:
        Dict with parallel lists ``chunks``, ``metadata``, ``distances``
        (inner-product scores on normalised vectors, i.e. higher = more
        similar) and ``indices``, plus ``retrieval_quality`` as computed by
        ``evaluate_retrieval``. The lists hold fewer than ``top_k`` entries
        when the index contains fewer vectors than requested.
    """
    # Encoding a one-element list gives the single-row batch that
    # index.search expects, so no unwrap/re-wrap is needed.
    query_embedding = np.array(model_for_queries.encode([query]), dtype='float32')
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with label -1 when fewer than top_k neighbours
    # exist. Used as a Python index, -1 would silently return the LAST chunk,
    # so padded slots are dropped here.
    hits = [
        (int(chunk_id), float(score))
        for chunk_id, score in zip(indices[0], scores[0])
        if chunk_id >= 0
    ]
    hit_ids = [chunk_id for chunk_id, _ in hits]

    results = {
        'chunks': [chunks_list[i] for i in hit_ids],
        'metadata': [chunk_metadata[i] for i in hit_ids],
        'distances': [score for _, score in hits],  # similarity scores (higher = better)
        'indices': hit_ids
    }

    results['retrieval_quality'] = evaluate_retrieval(
        results['chunks'],
        query,
        results['distances']
    )

    return results

def evaluate_retrieval(chunks: List[str], query: str, distances: List[float]) -> Dict:
    """
    Score retrieval quality with lightweight heuristics.

    Args:
        chunks: Texts of the retrieved chunks.
        query: The user query that produced them.
        distances: Scores returned by the index search. Despite the name,
            the index is an inner-product index over L2-normalised vectors,
            so these are cosine similarities: higher = better.

    Returns:
        Dict with:
        - proximity_score: mean similarity clamped to the 0-1 range
        - keyword_coverage: fraction of distinct query words that appear in
          the retrieved chunks
        - overall_retrieval_score: 0.4 * proximity + 0.6 * coverage
        - quality_label: 'Good' (> 0.6), 'Fair' (> 0.4) or 'Poor'
    """
    # Proximity: the mean similarity itself, clamped to [0, 1]. The previous
    # 1 / (1 + x) transform treated the scores as distances and therefore
    # rewarded WORSE matches. No scores at all counts as zero proximity.
    avg_similarity = float(np.mean(distances)) if len(distances) else 0.0
    proximity_score = min(1.0, max(0.0, avg_similarity))

    # Keyword coverage (local name avoids shadowing the chunk_text function)
    query_words = set(re.findall(r'\w+', query.lower()))
    retrieved_text = ' '.join(chunks).lower()
    covered_words = query_words & set(re.findall(r'\w+', retrieved_text))
    keyword_coverage = len(covered_words) / len(query_words) if query_words else 0

    # Overall retrieval score
    overall_score = (proximity_score * 0.4 + keyword_coverage * 0.6)

    return {
        'proximity_score': round(proximity_score, 3),
        'keyword_coverage': round(keyword_coverage, 3),
        'overall_retrieval_score': round(overall_score, 3),
        'quality_label': 'Good' if overall_score > 0.6 else 'Fair' if overall_score > 0.4 else 'Poor'
    }


# LLM Answer Generation with Evaluation

In [13]:
def _parse_llm_json(raw: str) -> Dict:
    """Decode the model reply into a dict, tolerating fences and stray prose."""
    text = (raw or "").replace("```json", "").replace("```", "").strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Recover a JSON object embedded in surrounding text, if there is one
        embedded = re.search(r'\{.*\}', text, flags=re.DOTALL)
        try:
            parsed = json.loads(embedded.group(0)) if embedded else None
        except json.JSONDecodeError:
            parsed = None
    if not isinstance(parsed, dict):
        # Last resort: treat the whole reply as the answer text
        parsed = {'answer': text, 'reasoning': ''}
    return parsed


def generate_answer(query: str, contexts: List[str]) -> Dict:
    """
    Generate an answer with the LLM and attach a heuristic quality evaluation.

    Args:
        query: The user question.
        contexts: Retrieved chunk texts, passed to the model as numbered
            contexts.

    Returns:
        Dict with ``answer`` and ``reasoning`` (strings taken from the
        model's JSON reply; ``reasoning`` is empty when the reply lacks it)
        and ``answer_quality`` as computed by ``evaluate_answer``. If the
        reply cannot be decoded as a JSON object, the raw reply text is
        returned as the answer instead of raising.

    Errors raised by the OpenAI client itself are not caught here.
    """
    context_text = "\n\n".join([f"[Context {i+1}]: {ctx}" for i, ctx in enumerate(contexts)])

    prompt = f"""Based on the following movie plot contexts, answer the question concisely and accurately.

    Question: {query}

    Contexts:
    {context_text}

    Provide a clear, factual answer. If the contexts don't contain relevant information, say so.
    Output Format:
        Responses must be valid JSON per RFC8259. Do not change keys or structure. Format:
            "answer": "natural language answer"
            "reasoning": "short explanation of how the answer was formed, add movie name as reference, do not add context number"

    Example Output
    {{
      "answer": "The movie *2001: A Space Odyssey* features an artificial intelligence system called HAL 9000.",
      "reasoning": "The question asked about AI. I searched the plots, found '2001: A Space Odyssey' with HAL 9000, and used it to form the answer."
    }}

    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500,
        temperature=0.3,
        # JSON mode: ask the API for a syntactically valid JSON object
        # (the prompt already instructs the model to answer in JSON).
        response_format={"type": "json_object"}
    )

    # message.content can be None; _parse_llm_json treats that as empty text
    json_result = _parse_llm_json(response.choices[0].message.content)
    answer = str(json_result.get('answer', ''))
    reasoning = str(json_result.get('reasoning', ''))

    # Evaluate answer quality
    quality = evaluate_answer(answer, contexts, query)

    return {
        'answer': answer,
        'reasoning': reasoning,
        'answer_quality': quality
    }

def evaluate_answer(answer: str, contexts: List[str], query: str) -> Dict:
    """
    Evaluate answer quality using heuristics.

    Args:
        answer: The generated answer text.
        contexts: Retrieved chunk texts the answer should be based on.
        query: The user question.

    Returns:
        Dict with:
        - grounding_score: fraction of distinct answer words found in the contexts
        - length_score: 1.0 for answers of 50-500 characters, else 0.5
        - relevance_score: fraction of distinct query words found in the answer
        - confidence_score: 0.7 if hedging language is present, else 0.95
        - overall_answer_score: 0.3 * grounding + 0.2 * length
          + 0.3 * relevance + 0.2 * confidence
        - quality_label: 'Excellent' (> 0.75), 'Good' (> 0.6) or 'Fair'
    """
    context_combined = ' '.join(contexts).lower()
    answer_lower = answer.lower()

    # 1. Grounding: Check if answer uses words from contexts
    answer_words = set(re.findall(r'\w+', answer_lower))
    context_words = set(re.findall(r'\w+', context_combined))
    grounding_score = len(answer_words & context_words) / len(answer_words) if answer_words else 0

    # 2. Length appropriateness (50-500 characters gets full marks)
    length_score = 1.0 if 50 <= len(answer) <= 500 else 0.5

    # 3. Relevance: Check if query words are in answer
    query_words = set(re.findall(r'\w+', query.lower()))
    relevance_score = len(query_words & answer_words) / len(query_words) if query_words else 0

    # 4. Confidence: Check for hedging language. Terms are matched on word
    # boundaries so that e.g. "Mighty" is not mistaken for "might".
    hedging_words = ['might', 'maybe', 'possibly', 'unclear', 'not found', 'no information']
    hedging_pattern = r'\b(?:' + '|'.join(re.escape(term) for term in hedging_words) + r')\b'
    has_hedging = re.search(hedging_pattern, answer_lower) is not None
    confidence_score = 0.7 if has_hedging else 0.95

    # Overall score
    overall = (grounding_score * 0.3 + length_score * 0.2 +
               relevance_score * 0.3 + confidence_score * 0.2)

    return {
        'grounding_score': round(grounding_score, 3),
        'length_score': round(length_score, 3),
        'relevance_score': round(relevance_score, 3),
        'confidence_score': round(confidence_score, 3),
        'overall_answer_score': round(overall, 3),
        'quality_label': 'Excellent' if overall > 0.75 else 'Good' if overall > 0.6 else 'Fair'
    }


# End-to-End RAG Pipeline

In [14]:
def rag_query(query: str, top_k: int = TOP_K) -> Dict:
    """
    Run one query through retrieval, generation and scoring, printing a report.

    Returns a dict with ``answer``, ``contexts``, ``reasoning`` and a
    ``_metrics`` sub-dict holding the retrieval score, the answer score and
    their 40/60 weighted combination (``system_score``).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"Query: {query}")
    print(banner)

    # Stage 1: fetch the most similar chunks
    retrieved = retrieve_chunks(query, top_k)

    print("\n[RETRIEVAL QUALITY]")
    retrieval_quality = retrieved['retrieval_quality']
    retrieval_score = retrieval_quality['overall_retrieval_score']
    print(f"Score: {retrieval_score:.3f}")
    print(f"Label: {retrieval_quality['quality_label']}")
    titles = [meta['title'] for meta in retrieved['metadata']]
    print(f"Movies: {titles}")

    # Stage 2: have the LLM answer from those chunks
    generated = generate_answer(query, retrieved['chunks'])

    print("\n[ANSWER]")
    print(f"{generated['answer']}")

    print("\n[ANSWER QUALITY]")
    answer_quality = generated['answer_quality']
    answer_score = answer_quality['overall_answer_score']
    print(f"Score: {answer_score:.3f}")
    print(f"Label: {answer_quality['quality_label']}")

    print("\n[REASONING]")
    print(f"{generated['reasoning']}")

    # Stage 3: assemble the compact result (retrieval weighted 0.4, answer 0.6)
    metrics = {
        'retrieval_score': round(retrieval_score, 3),
        'answer_score': round(answer_score, 3),
        'system_score': round(retrieval_score * 0.4 + answer_score * 0.6, 3),
    }
    output = {
        'answer': generated['answer'],
        'contexts': retrieved['chunks'],
        'reasoning': generated['reasoning'],
        '_metrics': metrics,
    }

    print("\n[SYSTEM SCORE]")
    print(f"Overall: {metrics['system_score']:.3f}")

    return output


# Test Queries

In [None]:
# Queries to run through the full pipeline (add more strings to test others)
test_queries = [
    "Tell me about a movie with a flying saucer",
]

# Run every query and keep each rag_query output; `results` is read by the
# save and summary cells below.
results = []
for query in test_queries:
    result = rag_query(query)
    results.append(result)


# Output Structured JSON

In [None]:
# Bundle the run configuration with the per-query results
output_data = {
    'system_config': dict(
        chunk_size=CHUNK_SIZE,
        top_k=TOP_K,
        total_movies=len(df),
        total_chunks=len(chunks_list),
        embedding_method='Sentence Transformers',
        vector_store='FAISS',
    ),
    # Re-key each result so the metrics appear under 'metrics' in the file
    'results': [
        dict(
            answer=r['answer'],
            contexts=r['contexts'],
            reasoning=r['reasoning'],
            metrics=r['_metrics'],
        )
        for r in results
    ],
}

# Write the bundle to disk as indented JSON
with open('rag_results.json', 'w') as f:
    f.write(json.dumps(output_data, indent=2))

print("\n✓ Results saved to rag_results.json")

# Show the first result as an example of the output shape
print(f"\n{'=' * 70}")
print("SAMPLE OUTPUT FORMAT:")
print('=' * 70)
print(json.dumps(output_data['results'][0], indent=2))


# Evaluation Summary

In [None]:
def print_evaluation_summary(results_list: List[Dict]):
    """Print the mean and min-max range of each score across all results.

    Each item of ``results_list`` must carry a ``_metrics`` dict with the keys
    ``retrieval_score``, ``answer_score`` and ``system_score``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("EVALUATION SUMMARY")
    print(rule)

    # (section heading, metric key) in the order the sections are reported
    sections = (
        ("Retrieval Quality", 'retrieval_score'),
        ("Answer Generation Quality", 'answer_score'),
        ("Overall RAG System", 'system_score'),
    )

    # Gather every metric up front, before any section is printed
    scores_by_metric = {
        metric: [entry['_metrics'][metric] for entry in results_list]
        for _, metric in sections
    }

    for heading, metric in sections:
        values = scores_by_metric[metric]
        print(f"\n{heading}:")
        print(f"  Avg Score: {np.mean(values):.3f}")
        print(f"  Range: {np.min(values):.3f} - {np.max(values):.3f}")

    print(f"\nQueries Tested: {len(results_list)}")

# Summarise the scores of every query run in the test-queries cell
print_evaluation_summary(results)


# Interactive Query Function (for notebook testing)

In [18]:
def interactive_rag(query: str) -> None:
    """Run ``rag_query`` on ``query`` and pretty-print the returned dict as JSON."""
    payload = rag_query(query)
    divider = "=" * 70
    print(f"\n{divider}")
    print("JSON OUTPUT:")
    print(divider)
    print(json.dumps(payload, indent=2))


In [None]:
# Try the interactive helper with a free-form query
interactive_rag("tell me about Alice Chicoy")


In [23]:
def rag_pipeline(query: str):
    """Retrieve contexts for ``query`` and generate an answer from them.

    Returns a dict with the keys ``answer``, ``contexts`` and ``reasoning``;
    unlike ``rag_query`` it adds no report printing or metrics of its own.
    """
    retrieved = retrieve_chunks(query)
    generated = generate_answer(query, retrieved["chunks"])
    return dict(
        answer=generated['answer'],
        contexts=retrieved["chunks"],
        reasoning=generated['reasoning'],
    )


In [None]:
# Run the minimal pipeline on a sample question and show the result as JSON
query = "Which movie has an AI system named HAL 9000?"
output = rag_pipeline(query)
print(json.dumps(output, indent=2))
