# Model Training Notebook

This notebook demonstrates TF-IDF model training for the cross-lingual information retrieval (CLIR) system.

## Steps:
1. Load preprocessed corpus
2. Build TF-IDF vectorizer
3. Train on documents
4. Evaluate model
5. Save model


In [None]:
import sys
from pathlib import Path

# Make the directory above the current working directory importable so the
# `src` package can be resolved from this notebook. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
PROJECT_ROOT = Path().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.retrieval import DocumentRetriever
from src.preprocessing import TextPreprocessor
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt


## Initialize Retriever and Build Index


In [None]:
# Construct the retriever with load_existing=False; per the original author's
# note, this builds the TF-IDF index as part of initialization.
retriever = DocumentRetriever(load_existing=False)

# Report the corpus size.
corpus_size = len(retriever.documents)
print(f"Number of documents: {corpus_size}")

# Report the matrix shape; its second dimension is shown as the feature count.
matrix_shape = retriever.document_vectors.shape
print(f"TF-IDF matrix shape: {matrix_shape}")
print(f"Number of features: {matrix_shape[1]}")


## Test Retrieval


In [None]:
# Smoke-test retrieval with a single sample query.
TOP_K = 5            # number of results to request; also used in the header
PREVIEW_CHARS = 150  # maximum number of characters of each document to show

test_query = "Who is the Prime Minister of India?"
results = retriever.retrieve(test_query, top_k=TOP_K)

print(f"Query: {test_query}\n")
print(f"Top {TOP_K} Results:")

# Each result is unpacked as a (document, score) pair.
shown = 0
for shown, (doc, score) in enumerate(results, 1):
    # Only append an ellipsis when the document was actually truncated.
    suffix = "..." if len(doc) > PREVIEW_CHARS else ""
    print(f"{shown}. Score: {score:.3f}")
    print(f"   {doc[:PREVIEW_CHARS]}{suffix}")
    print()

# Make an empty result set visible instead of printing a bare header.
if shown == 0:
    print("(no documents returned)")
