# Data Preparation Notebook

This notebook demonstrates the data preprocessing steps for the cross-language information retrieval (CLIR) system.

## Steps:
1. Load corpus data
2. Text cleaning and preprocessing
3. Stopword removal
4. Tokenization and lemmatization
5. Save preprocessed data


In [None]:
import sys
from pathlib import Path

# Add the parent of the current working directory to the import path so the
# `src` package imported below can be found. The membership check keeps the
# cell idempotent: re-running it does not append duplicate sys.path entries.
parent_dir = str(Path().resolve().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.preprocessing import TextPreprocessor
import pandas as pd


## Load Corpus Data


In [None]:
# Load English corpus
# The path is relative to the notebook's working directory.
corpus_path = Path("../data/english_corpus.txt")

# One document per line: strip surrounding whitespace once per line and keep
# only the lines that are non-empty after stripping.
with open(corpus_path, 'r', encoding='utf-8') as f:
    documents = [text for text in map(str.strip, f) if text]

print(f"Loaded {len(documents)} documents")

# Preview the first 200 characters of the first document. Guarded because
# indexing documents[0] on an empty corpus would raise IndexError.
if documents:
    print(f"\nSample document:\n{documents[0][:200]}...")
else:
    print(f"\nNo non-empty lines found in {corpus_path}")


## Preprocessing


In [None]:
# Build the preprocessor, configured for English text.
preprocessor = TextPreprocessor(language='english')

# Demo run: only the first 10 loaded documents are preprocessed. Each result
# is collected in `preprocessed_docs`, and a before/after preview (first 100
# elements of each value) is printed, followed by an 80-character rule.
preprocessed_docs = []
for doc in documents[:10]:
    preprocessed = preprocessor.preprocess(doc)
    preprocessed_docs.append(preprocessed)
    print(
        f"Original: {doc[:100]}...",
        f"Preprocessed: {preprocessed[:100]}...",
        "-" * 80,
        sep="\n",
    )
