# TF-IDF with N-gram Tokenization and Model Training

This notebook demonstrates:
1. Manual implementation of TF-IDF with **N-gram tokenization** (from scratch)
2. Adjustable n-gram range parameter (default: unigrams + bigrams)
3. Comparison with unigram-only approach
4. Training machine learning models for text classification
5. Model evaluation with comprehensive metrics

**Key Enhancement:** N-gram tokenization captures word sequences (e.g., "இலங்கை அரசு") as features

**Dataset:** Tamil news articles with categories and processed text

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import math
import pickle
import os
from collections import Counter, defaultdict
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides deprecation and
# model-fitting warnings — confirm this is intended.
warnings.filterwarnings('ignore')

# Show every DataFrame column and allow up to 100 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# Status messages confirming that the setup cell ran.
print("Libraries imported successfully!")
print("N-gram tokenization enabled ✓")

## 2. Load the Cleaned Data

In [None]:
# Load the processed data from previous notebook
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv('output/processed_data.csv')

# Quick overview: dimensions, column names, and a preview of the first rows.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
# Kept as the final expression of the cell so the preview is displayed.
df.head()

In [None]:
# Check data distribution
# Prints the number of rows per category, the total row count, and the
# number of missing values in each column.
print("Category distribution:")
print(df['category'].value_counts())
print(f"\nTotal samples: {len(df)}")
print(f"Missing values:\n{df.isnull().sum()}")

## 3. Prepare Text Data for TF-IDF

We'll use the `cleaned_title` column, which contains preprocessed Tamil text.

In [None]:
# Select the text column and target variable
# Missing titles are replaced with empty strings before conversion to a list.
documents = df['cleaned_title'].fillna('').tolist()
# NOTE(review): labels are taken as-is; missing categories are not filled here.
labels = df['category'].tolist()

# Sanity check: both lists come from the same DataFrame, so the counts match.
print(f"Total documents: {len(documents)}")
print(f"Total labels: {len(labels)}")
# NOTE(review): indexing [0] raises IndexError if the dataset has no rows.
print(f"\nSample document: {documents[0]}")
print(f"Sample label: {labels[0]}")

## 4. TF-IDF Implementation from Scratch with N-grams

### Step 1: N-gram Tokenization

**Key Enhancement:** Instead of splitting only into single words (unigrams), we now generate word n-grams based on `ngram_range`.

- **Unigrams (1-gram):** ["இலங்கை", "அரசு", "தீர்மானம்"]
- **Bigrams (2-gram):** ["இலங்கை அரசு", "அரசு தீர்மானம்"]
- **ngram_range=(1,2):** Both unigrams and bigrams combined

This captures context and word sequences as features.

In [None]:
def generate_ngrams(tokens, ngram_range=(1, 2)):
    """
    Build word n-grams from a list of tokens.

    Every order n from min_n to max_n (inclusive) is emitted in turn: all
    n-grams of the smallest order first, then the next order, and so on.
    Within one order the n-grams follow their position in ``tokens``.
    An order longer than the token list contributes nothing.

    Args:
        tokens: List of words (tokens)
        ngram_range: Tuple (min_n, max_n) giving the inclusive range of
            n-gram orders to generate

    Returns:
        List of n-gram strings, each made of n tokens joined by one space
    """
    lowest, highest = ngram_range
    return [
        ' '.join(tokens[start:start + size])
        for size in range(lowest, highest + 1)
        for start in range(len(tokens) - size + 1)
    ]

def tokenize_with_ngrams(text, ngram_range=(1, 2)):
    """
    Split text on whitespace and expand the resulting words into n-grams.

    Args:
        text: Input string; it is split with ``str.split()``
        ngram_range: Tuple (min_n, max_n) forwarded to ``generate_ngrams``

    Returns:
        List of n-gram strings produced by ``generate_ngrams``
    """
    return generate_ngrams(text.split(), ngram_range)

# Example of n-gram tokenization
# Uses the first document, falling back to a fixed Tamil phrase when that
# document is an empty string.
sample_text = documents[0] if documents[0] else "இலங்கை அரசு தீர்மானம்"
sample_tokens = sample_text.split()

# Show at most the first 10 whitespace-separated tokens.
print("Original tokens (unigrams only):")
print(sample_tokens[:10])

# Show at most the first 15 n-grams; unigrams come before bigrams in the list.
print("\nN-grams with ngram_range=(1, 2):")
ngrams_sample = generate_ngrams(sample_tokens, ngram_range=(1, 2))
print(ngrams_sample[:15])

In [None]:
# Configurable n-gram range parameter
NGRAM_RANGE = (1, 2)  # Default: unigrams + bigrams

# Apply n-gram tokenization to all documents
# The result is one list of n-gram strings per document, in corpus order.
tokenized_docs = [tokenize_with_ngrams(doc, ngram_range=NGRAM_RANGE) for doc in documents]
print(f"Tokenized {len(tokenized_docs)} documents with ngram_range={NGRAM_RANGE}")
# NOTE(review): indexing [0] raises IndexError if there are no documents.
print(f"Sample tokenized document length: {len(tokenized_docs[0])} n-grams")

### Step 2: Build Vocabulary with N-grams

Create a vocabulary including all unique n-grams in the corpus.

In [None]:
# Gather every distinct n-gram in the corpus and sort them so that the
# feature order is deterministic.
vocabulary = sorted({ngram for doc in tokenized_docs for ngram in doc})

# Map each n-gram to its position in the sorted vocabulary (its column index).
word2idx = {ngram: column for column, ngram in enumerate(vocabulary)}

print(f"Vocabulary size with ngram_range={NGRAM_RANGE}: {len(vocabulary)} unique n-grams")
print(f"\nSample vocabulary entries:")
print(vocabulary[:10])

### Step 3: Compute Term Frequency (TF) for N-grams

**Term Frequency (TF)** measures how frequently an n-gram appears in a document.

Formula: `TF(t, d) = (Number of times n-gram t appears in document d) / (Total number of n-grams in document d)`

In [None]:
def compute_tf(tokenized_doc):
    """Compute term frequency for n-grams: TF(t) = count(t) / total_ngrams

    Args:
        tokenized_doc: List of n-gram strings for one document

    Returns:
        Dict mapping each distinct n-gram to its count divided by the
        document length; an empty dict for an empty document
    """
    total_ngrams = len(tokenized_doc)
    # An empty document yields no counts, so the division below never runs
    # with a zero denominator.
    return {
        ngram: occurrences / total_ngrams
        for ngram, occurrences in Counter(tokenized_doc).items()
    }

# One term-frequency dict per tokenized document, in corpus order.
tf_docs = [compute_tf(doc) for doc in tokenized_docs]
print("Term Frequency computed for n-grams")
# Peek at up to five (n-gram, TF) pairs of the first document.
print(f"Sample TF dict entries: {list(tf_docs[0].items())[:5]}")

### Step 4: Compute Inverse Document Frequency (IDF) for N-grams

**Inverse Document Frequency (IDF)** measures how important an n-gram is across the entire corpus.

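Formula: `IDF(t) = log(Total number of documents / (1 + Number of documents containing n-gram t))`

The `+ 1` in the denominator matches the implementation below. As a consequence, an n-gram that appears in every document receives a slightly negative IDF.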

In [None]:
def compute_idf(tokenized_docs, vocabulary):
    """Compute IDF for n-grams: IDF(t) = log(N / (df(t) + 1))

    N is the number of documents and df(t) is the number of documents that
    contain n-gram t at least once. Because of the +1 in the denominator,
    the value is 0 when t occurs in N - 1 documents and negative when t
    occurs in all N documents.

    Args:
        tokenized_docs: List of documents, each a list of n-gram strings
        vocabulary: Iterable of n-grams to compute IDF for

    Returns:
        Dict mapping each vocabulary n-gram to its IDF value, in
        vocabulary order

    Raises:
        ValueError: If tokenized_docs is empty while vocabulary is not,
            since log(0) is undefined
    """
    N = len(tokenized_docs)

    # Count document frequencies in a single pass over the corpus instead of
    # scanning every document once per vocabulary entry. Converting each
    # document to a set makes a repeated n-gram count once per document.
    doc_freq = Counter()
    for doc in tokenized_docs:
        doc_freq.update(set(doc))

    # Counter returns 0 for n-grams that never occur, matching df(t) = 0.
    return {word: math.log(N / (doc_freq[word] + 1)) for word in vocabulary}

print("Computing IDF for n-grams...")
# One IDF value per vocabulary entry, computed over the whole corpus.
idf_dict = compute_idf(tokenized_docs, vocabulary)
print(f"IDF computed for {len(idf_dict)} n-grams")
# Peek at up to five (n-gram, IDF) pairs.
print(f"Sample IDF values: {list(idf_dict.items())[:5]}")

### Step 5: Compute TF-IDF Weights for N-grams

**TF-IDF** combines both TF and IDF to get the final weight for each n-gram in each document.

Formula: `TF-IDF(t, d) = TF(t, d) × IDF(t)`

In [None]:
def compute_tfidf(tf_dict, idf_dict):
    """Compute TF-IDF for n-grams: TF-IDF(t, d) = TF(t, d) × IDF(t)

    Args:
        tf_dict: Dict mapping n-grams of one document to their TF values
        idf_dict: Dict mapping n-grams to their IDF values

    Returns:
        Dict with the same keys as tf_dict; an n-gram missing from idf_dict
        is treated as having an IDF of 0
    """
    return {
        ngram: frequency * idf_dict.get(ngram, 0)
        for ngram, frequency in tf_dict.items()
    }

# One TF-IDF dict per document, built from its TF dict and the shared IDF dict.
tfidf_docs = [compute_tfidf(tf, idf_dict) for tf in tf_docs]
print("TF-IDF weights computed for n-grams")

### Step 6: Create TF-IDF Matrix with N-grams

Convert the TF-IDF dictionaries into a matrix representation where:
- Rows represent documents
- Columns represent n-grams in vocabulary
- Values are TF-IDF weights

In [None]:
def create_tfidf_matrix(tfidf_docs, vocabulary, word2idx):
    """Create TF-IDF matrix: (n_documents, n_vocabulary)

    Args:
        tfidf_docs: List of dicts, one per document, mapping n-grams to
            TF-IDF weights
        vocabulary: Collection of n-grams; only its length is used, to set
            the number of columns
        word2idx: Dict mapping each n-gram to its column index

    Returns:
        Dense NumPy array with one row per document and one column per
        vocabulary entry; cells with no weight stay 0
    """
    shape = (len(tfidf_docs), len(vocabulary))
    matrix = np.zeros(shape)

    for row, weights in enumerate(tfidf_docs):
        for ngram, weight in weights.items():
            # N-grams without a column index are skipped.
            if ngram not in word2idx:
                continue
            matrix[row, word2idx[ngram]] = weight

    return matrix

print("Creating TF-IDF matrix with n-grams...")
# Dense array with one row per document and one column per vocabulary n-gram.
# NOTE(review): a dense array of this shape may be large for big corpora.
tfidf_matrix_custom = create_tfidf_matrix(tfidf_docs, vocabulary, word2idx)
print(f"TF-IDF Matrix: {tfidf_matrix_custom.shape}")
print(f"  - Documents: {tfidf_matrix_custom.shape[0]}")
print(f"  - N-gram features: {tfidf_matrix_custom.shape[1]}")

In [None]:
# Heat map of a small corner of the matrix: the first 10 documents (rows)
# against the first 20 vocabulary columns.
plt.figure(figsize=(12, 6))
plt.imshow(tfidf_matrix_custom[:10, :20], aspect='auto', cmap='YlOrRd')
plt.colorbar(label='TF-IDF Weight')
plt.xlabel('N-gram Feature Index')
plt.ylabel('Document Index')
plt.title(f'TF-IDF Matrix Visualization (with N-grams, range={NGRAM_RANGE})')
plt.tight_layout()
plt.show()

## 5. TF-IDF Matrix Statistics

In [None]:
# Display TF-IDF matrix statistics
print(f"TF-IDF Matrix Statistics (with N-grams {NGRAM_RANGE}):")
print(f"  Shape: {tfidf_matrix_custom.shape}")
# Mean and max are taken over every cell, including the zero entries.
print(f"  Mean: {tfidf_matrix_custom.mean():.6f}")
print(f"  Max: {tfidf_matrix_custom.max():.6f}")
# Sparsity = percentage of cells that are exactly zero.
print(f"  Sparsity: {(tfidf_matrix_custom == 0).sum() / tfidf_matrix_custom.size * 100:.2f}%")
print(f"\nNote: N-gram features = {tfidf_matrix_custom.shape[1]}")

## 6. Compare Unigrams vs N-grams (Feature Count)

Before training models, let's compare the vocabulary size difference between unigrams and n-grams.