# In [None]:
# --- NLTK setup (must be first cell in every notebook) ---
import nltk

# Extra directory NLTK should search for its resources.
# NOTE(review): presumably where the tokenizer data used later in this file is
# installed — confirm for your environment.
NLTK_DATA_DIR = "/ml-envs/kharagpur_env/nltk_data"
# Append only when absent so re-running this cell does not add duplicates.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# =========================
# Imports
# =========================
import pandas as pd
import numpy as np

import pathway as pw
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# =========================
# Paths
# =========================
# Dataset root; every input path below is derived from it so that relocating
# the data is a single edit.
DATA_ROOT = "/media/deepaansh-sial/c7e4e807-62df-49d6-a165-9797ceac5277/IIT Kharagpur Data Science Hackathon"

TRAIN_PATH = DATA_ROOT + "/train.csv"
TEST_PATH = DATA_ROOT + "/test.csv"

# Book title -> path of its plain-text file. The titles are the values that
# the "book_name" column is later compared against during retrieval.
BOOK_FILES = {
    title: f"{DATA_ROOT}/Books/{filename}"
    for title, filename in (
        ("In Search of the Castaways", "In search of the castaways.txt"),
        ("The Count of Monte Cristo", "The Count of Monte Cristo.txt"),
    )
}

# Maximum number of nearest chunks retrieved per query.
TOP_K_EVIDENCE = 2  # LOCKED

# =========================
# Load data
# =========================
# Read both CSV splits (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)
# Lower-case the labels so the later comparison against "consistent" does not
# depend on the casing used in the CSV.
train_df["label"] = train_df["label"].str.lower()

print(f"Train: {train_df.shape} Test: {test_df.shape}")

# =========================
# Book loading & chunking
# =========================
from nltk.tokenize import sent_tokenize

def load_book_text(path):
    """Read the file at *path* as UTF-8 text and return its full contents.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``), and any
    carriage return left in the decoded text is replaced by a space.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        raw_text = handle.read()
    return raw_text.replace("\r", " ")

def chunk_text(text, max_tokens=700, overlap_tokens=100):
    """Split *text* into sentence-aligned chunks that overlap by whole sentences.

    Sentences come from ``sent_tokenize``. Each sentence's token count is
    estimated as ``int(word_count * 1.3)``. Sentences are accumulated until
    adding the next one would push the estimate above *max_tokens*; the
    buffer is then emitted as a chunk and the next buffer is seeded with the
    trailing sentences of the previous one.

    Args:
        text: The full text to split.
        max_tokens: Estimated-token budget per chunk. A single sentence that
            exceeds the budget on its own is still kept whole.
        overlap_tokens: Minimum estimated tokens to carry over from the end of
            one chunk into the next. Whole sentences are carried, so the
            actual overlap may exceed this value. ``0`` (or less) disables
            the overlap.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        buffer. Empty when *text* yields no sentences.
    """

    def estimate_tokens(sentence):
        # Rough words -> tokens conversion used consistently for all counts.
        return int(len(sentence.split()) * 1.3)

    sentences = sent_tokenize(text)
    chunks, current, tokens = [], [], 0

    for s in sentences:
        s_tokens = estimate_tokens(s)
        # Flush only a non-empty buffer: when the very first sentence is
        # already over budget there is nothing to emit yet, and flushing
        # would produce a bogus empty chunk.
        if current and tokens + s_tokens > max_tokens:
            chunks.append(" ".join(current))

            # Seed the next buffer with the tail of this one, walking
            # backwards until at least `overlap_tokens` are collected.
            overlap, used = [], 0
            if overlap_tokens > 0:
                for sent in reversed(current):
                    overlap.append(sent)
                    used += estimate_tokens(sent)
                    if used >= overlap_tokens:
                        break
                overlap.reverse()  # restore original sentence order
            current, tokens = overlap, used

        current.append(s)
        tokens += s_tokens

    if current:
        chunks.append(" ".join(current))
    return chunks

# Map each book title to its list of chunk records. Every record carries the
# title, the chunk's position within the book and the chunk text.
book_chunks = {}
for book, path in BOOK_FILES.items():
    text = load_book_text(path)
    chunks = chunk_text(text)
    book_chunks[book] = [
        dict(book=book, chunk_id=position, text=passage)
        for position, passage in enumerate(chunks)
    ]
    print(book, "chunks:", len(chunks))

# =========================
# Embeddings + Pathway table
# =========================
# Single embedding model instance shared by the Pathway UDF below and by the
# retrieval / feature-extraction code.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


@pw.udf
def embed_text(text: str):
    """Encode *text* with ``embedder`` and return the result via ``.tolist()``."""
    vector = embedder.encode(text)
    return vector.tolist()

def build_pathway_table(book_chunks):
    """Flatten per-book chunk records into one Pathway table.

    Args:
        book_chunks: Mapping whose values are lists of dicts, each having
            "book", "chunk_id" and "text" keys.

    Returns:
        The result of ``pw.debug.table_from_pandas`` on a DataFrame with one
        row per chunk and exactly those three columns.
    """
    columns = ("book", "chunk_id", "text")
    rows = [
        {column: chunk[column] for column in columns}
        for chunk_list in book_chunks.values()
        for chunk in chunk_list
    ]
    frame = pd.DataFrame(rows)
    return pw.debug.table_from_pandas(frame)

# Wrap all chunk records (every book) in a single Pathway table.
novel_table = build_pathway_table(book_chunks)

# Keep the three original columns and add an "embedding" column produced by
# the embed_text UDF from each row's text.
novel_table = novel_table.select(
    book=pw.this.book,
    chunk_id=pw.this.chunk_id,
    text=pw.this.text,
    embedding=embed_text(pw.this.text)
)

# Bring the table back into pandas; the retrieval and feature code below
# works on this DataFrame.
# NOTE(review): the row order of the resulting frame is not established here;
# downstream code filters by "book" and does not appear to rely on it.
novel_df = pw.debug.table_to_pandas(novel_table)
# Turn each stored embedding into a NumPy array so they can be stacked with
# np.vstack later.
novel_df["embedding"] = novel_df["embedding"].apply(np.array)

# =========================
# Retrieval
# =========================
def retrieve_evidence(book_name, queries):
    """Return the nearest book chunks for every query string.

    Args:
        book_name: Value matched against the "book" column of ``novel_df``.
        queries: Iterable of query strings; each is encoded with ``embedder``.

    Returns:
        One dict per query, in order, of the form
        ``{"evidence": [{"text": ...}, ...]}`` holding up to
        ``TOP_K_EVIDENCE`` chunks found by cosine-distance nearest-neighbour
        search. When no chunk belongs to *book_name*, every query gets an
        empty evidence list.
    """
    in_book = novel_df["book"] == book_name
    candidates = novel_df[in_book].reset_index(drop=True)
    n_candidates = len(candidates)
    if n_candidates == 0:
        return [{"evidence": []} for _ in queries]

    # Never ask for more neighbours than there are chunks to choose from.
    index = NearestNeighbors(
        metric="cosine",
        n_neighbors=min(TOP_K_EVIDENCE, n_candidates),
    )
    index.fit(np.vstack(candidates["embedding"].values))

    def lookup(query):
        # kneighbors expects a 2-D array, hence the reshape to a single row.
        query_vec = embedder.encode(query).reshape(1, -1)
        _, neighbour_ids = index.kneighbors(query_vec)
        return {
            "evidence": [
                {"text": candidates.iloc[pos]["text"]}
                for pos in neighbour_ids[0]
            ]
        }

    return [lookup(query) for query in queries]

# =========================
# Feature extraction (FINAL)
# =========================
def extract_features(backstory, book_name):
    """Compute similarity features between a backstory and its book.

    The backstory is embedded and compared (cosine similarity) both with the
    evidence chunks returned by ``retrieve_evidence`` and with every chunk of
    the book stored in ``novel_df``.

    Args:
        backstory: Backstory text (a string).
        book_name: Book title used to select chunks in ``novel_df``.

    Returns:
        A dict with these keys:
            max_sim: Highest similarity to any evidence chunk.
            mean_sim: Mean similarity over the evidence chunks.
            sim_percentile: Fraction of the book's chunks whose similarity
                is strictly below ``max_sim``.
            sim_zscore: ``max_sim`` standardised against the distribution of
                similarities over the whole book.
            backstory_len: Whitespace-separated word count of the backstory.

        When no evidence is available (``retrieve_evidence`` returns an empty
        list for a book that has no chunks), the four similarity features are
        ``0.0`` instead of the function failing on an empty ``np.vstack``.
    """
    evidence = retrieve_evidence(book_name, [backstory])[0]["evidence"]

    if not evidence:
        # Unknown book / no chunks: stay consistent with retrieve_evidence's
        # graceful empty result and return neutral similarity features.
        return {
            "max_sim": 0.0,
            "mean_sim": 0.0,
            "sim_percentile": 0.0,
            "sim_zscore": 0.0,
            "backstory_len": len(backstory.split()),
        }

    back_emb = embedder.encode(backstory).reshape(1, -1)
    ev_embs = np.vstack([embedder.encode(ev["text"]) for ev in evidence])
    sims = cosine_similarity(back_emb, ev_embs)[0]

    book_df = novel_df[novel_df["book"] == book_name]
    book_embs = np.vstack(book_df["embedding"].values)
    book_sims = cosine_similarity(back_emb, book_embs)[0]

    # Computed once and reused by three of the features below.
    top_sim = np.max(sims)

    return {
        "max_sim": float(top_sim),
        "mean_sim": float(np.mean(sims)),
        "sim_percentile": float((book_sims < top_sim).mean()),
        "sim_zscore": float(
            # Small epsilon keeps the division finite when all book
            # similarities are identical.
            (top_sim - book_sims.mean()) / (book_sims.std() + 1e-6)
        ),
        "backstory_len": len(backstory.split()),
    }

# =========================
# Build feature dataset
# =========================
# One feature dict per training example, extended with a binary target:
# 1 for the "consistent" label, 0 for anything else.
feature_rows = [
    {
        **extract_features(content, title),
        "label": 1 if label == "consistent" else 0,
    }
    for content, title, label in zip(
        train_df["content"], train_df["book_name"], train_df["label"]
    )
]

feature_df = pd.DataFrame(feature_rows)

# =========================
# Train / validation
# =========================
# Target vector and feature matrix (every column except the target).
y = feature_df["label"]
X = feature_df.drop(columns="label")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

clf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    max_depth=6,
    n_estimators=300,
)
clf.fit(X_train, y_train)

val_predictions = clf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))

# =========================
# Final training + submission
# =========================
# Refit on the full labelled set (train + validation rows) before predicting.
clf.fit(X, y)

# Same feature extraction as for training, applied to every test row.
test_features = [
    extract_features(content, title)
    for content, title in zip(test_df["content"], test_df["book_name"])
]

X_test = pd.DataFrame(test_features)
test_preds = clf.predict(X_test)

# Map the binary prediction back to the textual labels of the submission file.
test_df["label"] = np.where(test_preds == 1, "consistent", "contradict")
test_df[["id", "label"]].to_csv("result.csv", index=False)

# The check mark had been corrupted into mojibake (UTF-8 bytes decoded as
# cp1252); restored to the intended character.
print("result.csv generated ✅")
