# TF-IDF with N-gram Tokenization and Model Training

This notebook demonstrates:
1. Manual implementation of TF-IDF with **N-gram tokenization** (from scratch)
2. Adjustable n-gram range parameter (default: unigrams + bigrams)
3. Comparison with unigram-only approach
4. Training machine learning models for text classification
5. Model evaluation with comprehensive metrics

**Key Enhancement:** N-gram tokenization captures word sequences (e.g., "роЗро▓роЩрпНроХрпИ роЕро░роЪрпБ") as features

**Dataset:** Tamil news articles with categories and processed text

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import pickle
import os
from collections import Counter, defaultdict
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("Libraries imported successfully!")
print("N-gram tokenization enabled тЬУ")

Libraries imported successfully!
N-gram tokenization enabled тЬУ


## 2. Load the Cleaned Data

In [2]:
# Load the processed data from previous notebook
df = pd.read_csv('output/processed_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()

Dataset shape: (18447, 5)

Columns: ['category', 'processed_title', 'cleaned_title', 'tokenized_title', 'title']

First few rows:


Unnamed: 0,category,processed_title,cleaned_title,tokenized_title,title
0,tamilnadu,роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН родрооро┐ро┤роХ роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпН роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН,роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН родрооро┐ро┤роХ роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН,роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН родрооро┐ро┤роХ роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН,"роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН: родрооро┐ро┤роХ, роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН"
1,sports,рокроирпНродрпБро╡рпАроЪрпНроЪро╛ро│ро░рпН роРрокро┐роОро▓рпН ро╡ро┐ро│рпИропро╛роЯро▓ро╛рооро╛ роОродро┐ро░рпЖродро┐ро░рпН роХро░рпБродрпНродро┐ро▓рпН родрпЛройро┐ роХрпБроорпНрокро│рпЗ,рокроирпНродрпБро╡рпАроЪрпНроЪро╛ро│ро░рпНроХро│рпН роРрокро┐роОро▓рпН ро╡ро┐ро│рпИропро╛роЯро▓ро╛рооро╛ роОродро┐ро░рпЖродро┐ро░рпН роХро░рпБродрпНродро┐ро▓рпН родрпЛройро┐ роХрпБроорпНрокро│рпЗ,рокроирпНродрпБро╡рпАроЪрпНроЪро╛ро│ро░рпНроХро│рпН роРрокро┐роОро▓рпН ро╡ро┐ро│рпИропро╛роЯро▓ро╛рооро╛ роОродро┐ро░рпЖродро┐ро░рпН роХро░рпБродрпНродро┐ро▓рпН родрпЛройро┐ роХрпБроорпНрокро│рпЗ,рокроирпНродрпБро╡рпАроЪрпНроЪро╛ро│ро░рпНроХро│рпН роРрокро┐роОро▓рпН ро╡ро┐ро│рпИропро╛роЯро▓ро╛рооро╛? - роОродро┐ро░рпЖродро┐ро░рпН роХро░рпБродрпНродро┐ро▓рпН родрпЛройро┐-роХрпБроорпНрокро│рпЗ
2,tamilnadu,роХройрооро┤рпИ роОроЪрпНроЪро░ро┐роХрпНроХрпИ роиро╛ро│рпИ рокро│рпНро│ро┐ роХро▓рпНро▓рпВро░ро┐ ро╡ро┐роЯрпБроорпБро▒рпИ роОроЩрпНроХрпЖро▓рпНро▓ро╛роорпН родрпЖро░ро┐ропрпБрооро╛,роХройрооро┤рпИ роОроЪрпНроЪро░ро┐роХрпНроХрпИ роиро╛ро│рпИ рокро│рпНро│ро┐ роХро▓рпНро▓рпВро░ро┐роХро│рпБроХрпНроХрпБ ро╡ро┐роЯрпБроорпБро▒рпИ роОроЩрпНроХрпЖро▓рпНро▓ро╛роорпН родрпЖро░ро┐ропрпБрооро╛,роХройрооро┤рпИ роОроЪрпНроЪро░ро┐роХрпНроХрпИ роиро╛ро│рпИ рокро│рпНро│ро┐ роХро▓рпНро▓рпВро░ро┐роХро│рпБроХрпНроХрпБ ро╡ро┐роЯрпБроорпБро▒рпИ роОроЩрпНроХрпЖро▓рпНро▓ро╛роорпН родрпЖро░ро┐ропрпБрооро╛,"роХройрооро┤рпИ роОроЪрпНроЪро░ро┐роХрпНроХрпИ | роиро╛ро│рпИ рокро│рпНро│ро┐, роХро▓рпНро▓рпВро░ро┐роХро│рпБроХрпНроХрпБ ро╡ро┐роЯрпБроорпБро▒рпИ.. роОроЩрпНроХрпЖро▓рпНро▓ро╛роорпН родрпЖро░ро┐ропрпБрооро╛?"
3,tamilnadu,родро╡рпЖроХро╡рпИ роЖрогрпНроЯро╡ройро╛ро▓рпБроорпН роХро╛рокрпНрокро╛ро▒рпНро▒ роорпБроЯро┐ропро╛родрпБ ро╡ро┐роЬропрпН роЖро░рпНрокро┐ роЙродропроХрпБрооро╛ро░рпН роЕроЯрпНро╡рпИро╕рпН,родро╡рпЖроХро╡рпИ роЖрогрпНроЯро╡ройро╛ро▓рпБроорпН роХро╛рокрпНрокро╛ро▒рпНро▒ роорпБроЯро┐ропро╛родрпБ ро╡ро┐роЬропрпНроХрпНроХрпБ роЖро░рпНрокро┐ роЙродропроХрпБрооро╛ро░рпН роЕроЯрпНро╡рпИро╕рпН,родро╡рпЖроХро╡рпИ роЖрогрпНроЯро╡ройро╛ро▓рпБроорпН роХро╛рокрпНрокро╛ро▒рпНро▒ роорпБроЯро┐ропро╛родрпБ ро╡ро┐роЬропрпНроХрпНроХрпБ роЖро░рпНрокро┐ роЙродропроХрпБрооро╛ро░рпН роЕроЯрпНро╡рпИро╕рпН,родро╡рпЖроХро╡рпИ роЖрогрпНроЯро╡ройро╛ро▓рпБроорпН роХро╛рокрпНрокро╛ро▒рпНро▒ роорпБроЯро┐ропро╛родрпБ - ро╡ро┐роЬропрпНроХрпНроХрпБ роЖро░рпНрокро┐ роЙродропроХрпБрооро╛ро░рпН роЕроЯрпНро╡рпИро╕рпН
4,tamilnadu,роЖро┤рпН роХро╛ро▒рпНро▒ро┤рпБ родро╛ро┤рпНро╡рпБрокрпНрокроХрпБродро┐ роОродро┐ро░рпКро▓ро┐ рооро╛ро╡роЯрпНроЯроЩрпН роЕро▓ро░рпНроЯрпН,роЖро┤рпНроирпНрод роХро╛ро▒рпНро▒ро┤рпБродрпНрод родро╛ро┤рпНро╡рпБрокрпНрокроХрпБродро┐ роОродро┐ро░рпКро▓ро┐ рооро╛ро╡роЯрпНроЯроЩрпНроХро│рпБроХрпНроХрпБ роЕро▓ро░рпНроЯрпН,роЖро┤рпНроирпНрод роХро╛ро▒рпНро▒ро┤рпБродрпНрод родро╛ро┤рпНро╡рпБрокрпНрокроХрпБродро┐ роОродро┐ро░рпКро▓ро┐ рооро╛ро╡роЯрпНроЯроЩрпНроХро│рпБроХрпНроХрпБ роЕро▓ро░рпНроЯрпН,роЖро┤рпНроирпНрод роХро╛ро▒рпНро▒ро┤рпБродрпНрод родро╛ро┤рпНро╡рпБрокрпНрокроХрпБродро┐ роОродро┐ро░рпКро▓ро┐.. 8 рооро╛ро╡роЯрпНроЯроЩрпНроХро│рпБроХрпНроХрпБ RED роЕро▓ро░рпНроЯрпН..


In [3]:
# Check data distribution
print("Category distribution:")
print(df['category'].value_counts())
print(f"\nTotal samples: {len(df)}")
print(f"Missing values:\n{df.isnull().sum()}")

Category distribution:
category
india                                     2191
world                                     1893
cinema                                    1811
sports                                    1748
crime                                     1627
tamilnadu                                 1589
business                                  1304
trending                                  1271
technology                                1225
features                                  1196
health                                     841
environment                                639
agriculture                                613
spiritual                                  245
lifestyle                                  103
motor                                       45
coronavirus                                 38
ampstories                                  30
women                                       20
employment-news-in-tamil-latest-update      18
Name: count, dtype: int64

T

## 3. Prepare Text Data for TF-IDF

We'll use the `cleaned_title` column which contains preprocessed Tamil text.

In [4]:
# Select the text column and target variable
documents = df['cleaned_title'].fillna('').tolist()
labels = df['category'].tolist()

print(f"Total documents: {len(documents)}")
print(f"Total labels: {len(labels)}")
print(f"\nSample document: {documents[0]}")
print(f"Sample label: {labels[0]}")

Total documents: 18447
Total labels: 18447

Sample document: роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН родрооро┐ро┤роХ роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН
Sample label: tamilnadu


## 4. TF-IDF Implementation from Scratch with N-grams

### Step 1: N-gram Tokenization

**Key Enhancement:** Instead of splitting only into single words (unigrams), we now generate word n-grams based on `ngram_range`.

- **Unigrams (1-gram):** ["роЗро▓роЩрпНроХрпИ", "роЕро░роЪрпБ", "родрпАро░рпНрооро╛ройроорпН"]
- **Bigrams (2-gram):** ["роЗро▓роЩрпНроХрпИ роЕро░роЪрпБ", "роЕро░роЪрпБ родрпАро░рпНрооро╛ройроорпН"]
- **ngram_range=(1,2):** Both unigrams and bigrams combined

This captures context and word sequences as features.

In [6]:
def generate_ngrams(tokens, ngram_range=(1, 2)):
    """
    Generate n-grams from a list of tokens.
    
    Args:
        tokens: List of words (tokens)
        ngram_range: Tuple (min_n, max_n) specifying the range of n-grams
    
    Returns:
        List of n-gram strings
    """
    ngrams = []
    min_n, max_n = ngram_range
    
    for n in range(min_n, max_n + 1):
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i+n])
            ngrams.append(ngram)
    
    return ngrams

def tokenize_with_ngrams(text, ngram_range=(1, 2)):
    """
    Tokenize text and generate n-grams.
    """
    tokens = text.split()
    return generate_ngrams(tokens, ngram_range)

# Example of n-gram tokenization
sample_text = documents[0] if documents[0] else "роЗро▓роЩрпНроХрпИ роЕро░роЪрпБ родрпАро░рпНрооро╛ройроорпН"
sample_tokens = sample_text.split()

print("Original tokens (unigrams only):")
print(sample_tokens[:10])

print("\nN-grams with ngram_range=(1, 2):")
ngrams_sample = generate_ngrams(sample_tokens, ngram_range=(1, 2))
print(ngrams_sample[:15])

Original tokens (unigrams only):
['роорпЗроХродро╛родрпБ', 'ро╡ро┐ро╡роХро╛ро░роорпН', 'родрооро┐ро┤роХ', 'роХро░рпНроиро╛роЯроХро╛', 'роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ', 'роиро┐родро┐ройрпН', 'роХроЯрпНроХро░ро┐', 'роХроЯро┐родроорпН']

N-grams with ngram_range=(1, 2):
['роорпЗроХродро╛родрпБ', 'ро╡ро┐ро╡роХро╛ро░роорпН', 'родрооро┐ро┤роХ', 'роХро░рпНроиро╛роЯроХро╛', 'роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ', 'роиро┐родро┐ройрпН', 'роХроЯрпНроХро░ро┐', 'роХроЯро┐родроорпН', 'роорпЗроХродро╛родрпБ ро╡ро┐ро╡роХро╛ро░роорпН', 'ро╡ро┐ро╡роХро╛ро░роорпН родрооро┐ро┤роХ', 'родрооро┐ро┤роХ роХро░рпНроиро╛роЯроХро╛', 'роХро░рпНроиро╛роЯроХро╛ роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ', 'роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ роиро┐родро┐ройрпН', 'роиро┐родро┐ройрпН роХроЯрпНроХро░ро┐', 'роХроЯрпНроХро░ро┐ роХроЯро┐родроорпН']


In [7]:
# Configurable n-gram range parameter
NGRAM_RANGE = (1, 2)  # Default: unigrams + bigrams

# Apply n-gram tokenization to all documents
tokenized_docs = [tokenize_with_ngrams(doc, ngram_range=NGRAM_RANGE) for doc in documents]
print(f"Tokenized {len(tokenized_docs)} documents with ngram_range={NGRAM_RANGE}")
print(f"Sample tokenized document length: {len(tokenized_docs[0])} n-grams")

Tokenized 18447 documents with ngram_range=(1, 2)
Sample tokenized document length: 15 n-grams


### Step 2: Build Vocabulary with N-grams

Create a vocabulary including all unique n-grams in the corpus.

In [8]:
vocabulary = set()
for doc in tokenized_docs:
    vocabulary.update(doc)

vocabulary = sorted(list(vocabulary))
word2idx = {word: idx for idx, word in enumerate(vocabulary)}

print(f"Vocabulary size with ngram_range={NGRAM_RANGE}: {len(vocabulary)} unique n-grams")
print(f"\nSample vocabulary entries:")
print(vocabulary[:10])

Vocabulary size with ngram_range=(1, 2): 144438 unique n-grams

Sample vocabulary entries:
['┬░', '┬░ роЪрпВро░рпНропроХрпБрооро╛ро░рпН', '├▒', '├▒ роОройрпНрокродрпБ', 'роГрокроХродрпН', 'роГрокроХродрпН роГрокро╛роЪро┐ро▓рпИ', 'роГрокроХродрпН роГрокро╛роЪро┐ро▓рпН', 'роГрокроХро╛ро░рпН', 'роГрокроХро╛ро░рпН роЬрооро╛ройрпН', 'роГрокроЯрпНройро╛ро╡ро┐ро╕рпН']


### Step 3: Compute Term Frequency (TF) for N-grams

**Term Frequency (TF)** measures how frequently an n-gram appears in a document.

Formula: `TF(t, d) = (Number of times n-gram t appears in document d) / (Total number of n-grams in document d)`

In [9]:
def compute_tf(tokenized_doc):
    """Compute term frequency for n-grams: TF(t) = count(t) / total_ngrams"""
    tf_dict = {}
    doc_length = len(tokenized_doc)
    if doc_length == 0:
        return tf_dict
    term_counts = Counter(tokenized_doc)
    for term, count in term_counts.items():
        tf_dict[term] = count / doc_length
    return tf_dict

tf_docs = [compute_tf(doc) for doc in tokenized_docs]
print("Term Frequency computed for n-grams")
print(f"Sample TF dict entries: {list(tf_docs[0].items())[:5]}")

Term Frequency computed for n-grams
Sample TF dict entries: [('роорпЗроХродро╛родрпБ', 0.06666666666666667), ('ро╡ро┐ро╡роХро╛ро░роорпН', 0.06666666666666667), ('родрооро┐ро┤роХ', 0.06666666666666667), ('роХро░рпНроиро╛роЯроХро╛', 0.06666666666666667), ('роорпБродро▓роорпИроЪрпНроЪро░рпНроХро│рпБроХрпНроХрпБ', 0.06666666666666667)]


### Step 4: Compute Inverse Document Frequency (IDF) for N-grams

**Inverse Document Frequency (IDF)** measures how important an n-gram is across the entire corpus.

Formula: `IDF(t) = log(Total number of documents / Number of documents containing n-gram t)`

In [10]:
def compute_idf(tokenized_docs, vocabulary):
    """Compute IDF for n-grams: IDF(t) = log(N / df(t))"""
    N = len(tokenized_docs)
    idf_dict = {}
    for word in vocabulary:
        doc_count = sum(1 for doc in tokenized_docs if word in doc)
        idf_dict[word] = math.log(N / (doc_count + 1))
    return idf_dict

print("Computing IDF for n-grams...")
idf_dict = compute_idf(tokenized_docs, vocabulary)
print(f"IDF computed for {len(idf_dict)} n-grams")
print(f"Sample IDF values: {list(idf_dict.items())[:5]}")

Computing IDF for n-grams...
IDF computed for 144438 n-grams
Sample IDF values: [('┬░', 9.129509854061634), ('┬░ роЪрпВро░рпНропроХрпБрооро╛ро░рпН', 9.129509854061634), ('├▒', 9.129509854061634), ('├▒ роОройрпНрокродрпБ', 9.129509854061634), ('роГрокроХродрпН', 8.436362673501689)]
IDF computed for 144438 n-grams
Sample IDF values: [('┬░', 9.129509854061634), ('┬░ роЪрпВро░рпНропроХрпБрооро╛ро░рпН', 9.129509854061634), ('├▒', 9.129509854061634), ('├▒ роОройрпНрокродрпБ', 9.129509854061634), ('роГрокроХродрпН', 8.436362673501689)]


### Step 5: Compute TF-IDF Weights for N-grams

**TF-IDF** combines both TF and IDF to get the final weight for each n-gram in each document.

Formula: `TF-IDF(t, d) = TF(t, d) ├Ч IDF(t)`

In [11]:
def compute_tfidf(tf_dict, idf_dict):
    """Compute TF-IDF for n-grams: TF-IDF(t, d) = TF(t, d) ├Ч IDF(t)"""
    tfidf_dict = {}
    for term, tf_value in tf_dict.items():
        tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
    return tfidf_dict

tfidf_docs = [compute_tfidf(tf, idf_dict) for tf in tf_docs]
print("TF-IDF weights computed for n-grams")

TF-IDF weights computed for n-grams


### Step 6: Create TF-IDF Matrix with N-grams

Convert the TF-IDF dictionaries into a matrix representation where:
- Rows represent documents
- Columns represent n-grams in vocabulary
- Values are TF-IDF weights

In [12]:
def create_tfidf_matrix(tfidf_docs, vocabulary, word2idx):
    """Create TF-IDF matrix: (n_documents, n_vocabulary)"""
    n_docs = len(tfidf_docs)
    n_vocab = len(vocabulary)
    tfidf_matrix = np.zeros((n_docs, n_vocab))
    
    for doc_idx, tfidf_dict in enumerate(tfidf_docs):
        for term, tfidf_value in tfidf_dict.items():
            if term in word2idx:
                term_idx = word2idx[term]
                tfidf_matrix[doc_idx, term_idx] = tfidf_value
    
    return tfidf_matrix

print("Creating TF-IDF matrix with n-grams...")
tfidf_matrix_custom = create_tfidf_matrix(tfidf_docs, vocabulary, word2idx)
print(f"TF-IDF Matrix: {tfidf_matrix_custom.shape}")
print(f"  - Documents: {tfidf_matrix_custom.shape[0]}")
print(f"  - N-gram features: {tfidf_matrix_custom.shape[1]}")

Creating TF-IDF matrix with n-grams...


MemoryError: Unable to allocate 19.9 GiB for an array with shape (18447, 144438) and data type float64

In [14]:
plt.figure(figsize=(12, 6))
plt.imshow(tfidf_matrix_custom[:10, :20], aspect='auto', cmap='YlOrRd')
plt.colorbar(label='TF-IDF Weight')
plt.xlabel('N-gram Feature Index')
plt.ylabel('Document Index')
plt.title(f'TF-IDF Matrix Visualization (with N-grams, range={NGRAM_RANGE})')
plt.tight_layout()
plt.show()

NameError: name 'tfidf_matrix_custom' is not defined

<Figure size 1200x600 with 0 Axes>

## 5. TF-IDF Matrix Statistics

In [None]:
# Display TF-IDF matrix statistics
print(f"TF-IDF Matrix Statistics (with N-grams {NGRAM_RANGE}):")
print(f"  Shape: {tfidf_matrix_custom.shape}")
print(f"  Mean: {tfidf_matrix_custom.mean():.6f}")
print(f"  Max: {tfidf_matrix_custom.max():.6f}")
print(f"  Sparsity: {(tfidf_matrix_custom == 0).sum() / tfidf_matrix_custom.size * 100:.2f}%")
print(f"\nNote: N-gram features = {tfidf_matrix_custom.shape[1]}")

## 6. Compare Unigrams vs N-grams (Feature Count)

Before training models, let's compare the vocabulary size difference between unigrams and n-grams.

In [None]:
# Compute unigram-only vocabulary for comparison
tokenized_unigrams = [tokenize_with_ngrams(doc, ngram_range=(1, 1)) for doc in documents]
vocab_unigrams = set()
for doc in tokenized_unigrams:
    vocab_unigrams.update(doc)

print("VOCABULARY SIZE COMPARISON:")
print("="*60)
print(f"Unigrams only (1,1):      {len(vocab_unigrams):,} features")
print(f"N-grams {NGRAM_RANGE}:         {len(vocabulary):,} features")
print(f"Increase:                 {len(vocabulary) - len(vocab_unigrams):,} features")
print(f"Percentage increase:      {((len(vocabulary) / len(vocab_unigrams)) - 1) * 100:.2f}%")
print("="*60)

## 7. Train Machine Learning Models

Training three classification models:
1. **Naive Bayes** - Probabilistic model
2. **Linear SVM** - Support Vector Machine with linear kernel
3. **Logistic Regression** - Linear model with multinomial classification

### 7.1 Naive Bayes

In [None]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_train_pred = nb_model.predict(X_train)
nb_test_pred = nb_model.predict(X_test)
print("Naive Bayes training completed with n-gram features")

### 7.2 Linear SVM

In [None]:
svm_model = LinearSVC(C=1.0, random_state=42, max_iter=1000)
svm_model.fit(X_train, y_train)
svm_train_pred = svm_model.predict(X_train)
svm_test_pred = svm_model.predict(X_test)
print("Linear SVM training completed with n-gram features")

### 7.3 Logistic Regression

In [None]:
lr_model = LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial')
lr_model.fit(X_train, y_train)
lr_train_pred = lr_model.predict(X_train)
lr_test_pred = lr_model.predict(X_test)
print("Logistic Regression training completed with n-gram features")

## 8. Model Evaluation

Evaluate all models using multiple metrics:
- **Accuracy**: Overall correctness
- **Precision**: How many selected items are relevant
- **Recall**: How many relevant items are selected
- **F1-Score**: Harmonic mean of precision and recall
- **Confusion Matrix**: Detailed breakdown of predictions

In [None]:
def evaluate_model(y_true, y_pred, model_name, dataset_name):
    """
    Evaluate model performance with multiple metrics.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    print(f"\n{'='*60}")
    print(f"{model_name} - {dataset_name} Set")
    print(f"{'='*60}")
    print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
    print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
    print(f"F1-Score:  {f1:.4f} ({f1*100:.2f}%)")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

### 8.1 Evaluate All Models

In [None]:
# Evaluate Naive Bayes
nb_train_metrics = evaluate_model(y_train, nb_train_pred, "Naive Bayes (N-gram)", "Training")
nb_test_metrics = evaluate_model(y_test, nb_test_pred, "Naive Bayes (N-gram)", "Test")

# Evaluate Linear SVM
svm_train_metrics = evaluate_model(y_train, svm_train_pred, "Linear SVM (N-gram)", "Training")
svm_test_metrics = evaluate_model(y_test, svm_test_pred, "Linear SVM (N-gram)", "Test")

# Evaluate Logistic Regression
lr_train_metrics = evaluate_model(y_train, lr_train_pred, "Logistic Regression (N-gram)", "Training")
lr_test_metrics = evaluate_model(y_test, lr_test_pred, "Logistic Regression (N-gram)", "Test")

### 8.2 Confusion Matrices

In [None]:
classes = sorted(list(set(y_test)))

# Create confusion matrices for all models
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Naive Bayes
nb_cm = confusion_matrix(y_test, nb_test_pred)
sns.heatmap(nb_cm, annot=True, fmt='d', cmap='Greens', ax=axes[0],
            xticklabels=classes, yticklabels=classes, cbar_kws={'label': 'Count'})
axes[0].set_title('Naive Bayes (N-gram)', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')

# Linear SVM
svm_cm = confusion_matrix(y_test, svm_test_pred)
sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Oranges', ax=axes[1],
            xticklabels=classes, yticklabels=classes, cbar_kws={'label': 'Count'})
axes[1].set_title('Linear SVM (N-gram)', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('True')

# Logistic Regression
lr_cm = confusion_matrix(y_test, lr_test_pred)
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', ax=axes[2],
            xticklabels=classes, yticklabels=classes, cbar_kws={'label': 'Count'})
axes[2].set_title('Logistic Regression (N-gram)', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Predicted')
axes[2].set_ylabel('True')

plt.tight_layout()
plt.show()

## 9. Model Comparison with N-grams

In [None]:
comparison_df = pd.DataFrame({
    'Model': ['Naive Bayes', 'Linear SVM', 'Logistic Regression'],
    'Train Accuracy': [nb_train_metrics['accuracy'], svm_train_metrics['accuracy'], lr_train_metrics['accuracy']],
    'Test Accuracy': [nb_test_metrics['accuracy'], svm_test_metrics['accuracy'], lr_test_metrics['accuracy']],
    'Test Precision': [nb_test_metrics['precision'], svm_test_metrics['precision'], lr_test_metrics['precision']],
    'Test Recall': [nb_test_metrics['recall'], svm_test_metrics['recall'], lr_test_metrics['recall']],
    'Test F1-Score': [nb_test_metrics['f1'], svm_test_metrics['f1'], lr_test_metrics['f1']],
    'N-gram Range': [str(NGRAM_RANGE)] * 3,
    'Feature Count': [len(vocabulary)] * 3
})

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY (WITH N-GRAMS)")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

In [None]:
# Visualize model comparison
metrics = ['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1-Score']
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width, 
                [nb_test_metrics['accuracy'], nb_test_metrics['precision'], 
                 nb_test_metrics['recall'], nb_test_metrics['f1']], 
                width, label='Naive Bayes', color='lightgreen')
rects2 = ax.bar(x, 
                [svm_test_metrics['accuracy'], svm_test_metrics['precision'], 
                 svm_test_metrics['recall'], svm_test_metrics['f1']], 
                width, label='Linear SVM', color='orange')
rects3 = ax.bar(x + width, 
                [lr_test_metrics['accuracy'], lr_test_metrics['precision'], 
                 lr_test_metrics['recall'], lr_test_metrics['f1']], 
                width, label='Logistic Regression', color='skyblue')

ax.set_ylabel('Score', fontsize=12)
ax.set_title(f'Model Performance Comparison (N-gram range={NGRAM_RANGE})', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim(0, 1.1)

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

plt.tight_layout()
plt.show()

## 10. Unigrams vs N-grams Comparison

**Key Analysis:** Compare feature count and model accuracy between unigram-only and n-gram approaches.

In [None]:
print("\n" + "="*80)
print("UNIGRAMS VS N-GRAMS COMPARISON")
print("="*80)

# Display vocabulary comparison from earlier
print(f"\nFeature Count Comparison:")
print(f"  Unigrams only (1,1):    {unigram_vocab_size:,} features")
print(f"  N-grams (1,2):          {len(vocabulary):,} features")
print(f"  Increase:               {len(vocabulary) - unigram_vocab_size:,} features ({((len(vocabulary) - unigram_vocab_size) / unigram_vocab_size * 100):.1f}% more)")

print(f"\nN-gram Range: {NGRAM_RANGE}")
print(f"  - Captures word sequences")
print(f"  - Better context understanding")
print(f"  - More discriminative features")

print(f"\nBest Model Performance (N-gram {NGRAM_RANGE}):")
best_f1 = max(nb_test_metrics['f1'], svm_test_metrics['f1'], lr_test_metrics['f1'])
if best_f1 == nb_test_metrics['f1']:
    print(f"  Model: Naive Bayes")
elif best_f1 == svm_test_metrics['f1']:
    print(f"  Model: Linear SVM")
else:
    print(f"  Model: Logistic Regression")
print(f"  F1-Score: {best_f1:.4f}")
print(f"  Accuracy: {max(nb_test_metrics['accuracy'], svm_test_metrics['accuracy'], lr_test_metrics['accuracy']):.4f}")

print("\n" + "="*80)

In [None]:
# Visualize feature count comparison
fig, ax = plt.subplots(figsize=(10, 6))
categories = ['Unigrams\n(1,1)', f'N-grams\n{NGRAM_RANGE}']
feature_counts = [unigram_vocab_size, len(vocabulary)]
colors = ['skyblue', 'coral']

bars = ax.bar(categories, feature_counts, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Number of Features', fontsize=12)
ax.set_title('Feature Count: Unigrams vs N-grams', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

## 11. Save Models and Vectorizer

In [None]:
# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)
os.makedirs('reports', exist_ok=True)

# Determine best model
best_model_name = 'Naive Bayes'
best_f1 = nb_test_metrics['f1']
best_model = nb_model

if svm_test_metrics['f1'] > best_f1:
    best_model_name = 'Linear SVM'
    best_f1 = svm_test_metrics['f1']
    best_model = svm_model
    
if lr_test_metrics['f1'] > best_f1:
    best_model_name = 'Logistic Regression'
    best_f1 = lr_test_metrics['f1']
    best_model = lr_model

# Save all trained models with n-gram suffix
with open('models/category_naive_bayes_ngram.pkl', 'wb') as f:
    pickle.dump(nb_model, f)
    
with open('models/category_svm_ngram.pkl', 'wb') as f:
    pickle.dump(svm_model, f)
    
with open('models/category_logistic_ngram.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# Save the best model
with open('models/category_model_ngram.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save vectorizer components (vocabulary, word2idx, IDF, and ngram_range)
vectorizer_data = {
    'vocabulary': vocabulary,
    'word2idx': word2idx,
    'idf_dict': idf_dict,
    'ngram_range': NGRAM_RANGE
}
with open('models/category_vectorizer_ngram.pkl', 'wb') as f:
    pickle.dump(vectorizer_data, f)

# Save model comparison
comparison_df.to_csv('output/model_comparison_ngram.csv', index=False)

# Save evaluation reports
import json

reports = {
    'naive_bayes': {
        'train_metrics': nb_train_metrics,
        'test_metrics': nb_test_metrics,
        'classification_report': classification_report(y_test, nb_test_pred, output_dict=True, zero_division=0)
    },
    'svm': {
        'train_metrics': svm_train_metrics,
        'test_metrics': svm_test_metrics,
        'classification_report': classification_report(y_test, svm_test_pred, output_dict=True, zero_division=0)
    },
    'logistic': {
        'train_metrics': lr_train_metrics,
        'test_metrics': lr_test_metrics,
        'classification_report': classification_report(y_test, lr_test_pred, output_dict=True, zero_division=0)
    }
}

with open('reports/category_naive_bayes_ngram_report.json', 'w') as f:
    json.dump(reports['naive_bayes'], f, indent=2)
    
with open('reports/category_svm_ngram_report.json', 'w') as f:
    json.dump(reports['svm'], f, indent=2)
    
with open('reports/category_logistic_ngram_report.json', 'w') as f:
    json.dump(reports['logistic'], f, indent=2)

print("тЬУ All models saved to models/ directory (with _ngram suffix)")
print("тЬУ Vectorizer saved to models/category_vectorizer_ngram.pkl")
print("тЬУ Evaluation reports saved to reports/ directory")
print(f"тЬУ Best model ({best_model_name}) saved as models/category_model_ngram.pkl")
print(f"тЬУ N-gram range: {NGRAM_RANGE}")

---

# SENTIMENT CLASSIFICATION WITH N-GRAMS

---

## 12. Load Sentiment Dataset

In [None]:
df_sentiment = pd.read_csv('output/processed_sentiment_data.csv')

print(f"Sentiment Dataset shape: {df_sentiment.shape}")
print(f"\nSentiment distribution:")
print(df_sentiment['sentiment'].value_counts())

sentiment_documents = df_sentiment['tokenized_title'].fillna('').tolist()
sentiment_labels = df_sentiment['sentiment'].tolist()

print(f"\nTotal sentiment documents: {len(sentiment_documents)}")
print(f"Sample: {sentiment_documents[0]}")

## 13. TF-IDF for Sentiment with N-grams

In [None]:
# Apply n-gram tokenization to sentiment documents
sentiment_tokenized_docs = [tokenize_with_ngrams(doc, ngram_range=NGRAM_RANGE) for doc in sentiment_documents]
print(f"Tokenized {len(sentiment_tokenized_docs)} sentiment documents with ngram_range={NGRAM_RANGE}")

# Build vocabulary for sentiment with n-grams
sentiment_vocabulary = set()
for doc in sentiment_tokenized_docs:
    sentiment_vocabulary.update(doc)
sentiment_vocabulary = sorted(list(sentiment_vocabulary))
sentiment_word2idx = {word: idx for idx, word in enumerate(sentiment_vocabulary)}
print(f"Sentiment vocabulary size with n-grams: {len(sentiment_vocabulary)} unique n-grams")

# Compute TF for sentiment
sentiment_tf_docs = [compute_tf(doc) for doc in sentiment_tokenized_docs]
print("Sentiment TF computed")

# Compute IDF for sentiment
print("Computing sentiment IDF with n-grams...")
sentiment_idf_dict = compute_idf(sentiment_tokenized_docs, sentiment_vocabulary)
print(f"Sentiment IDF computed for {len(sentiment_idf_dict)} n-grams")

# Compute TF-IDF for sentiment
sentiment_tfidf_docs = [compute_tfidf(tf, sentiment_idf_dict) for tf in sentiment_tf_docs]
print("Sentiment TF-IDF weights computed with n-grams")

# Create TF-IDF matrix for sentiment
print("Creating sentiment TF-IDF matrix with n-grams...")
sentiment_tfidf_matrix = create_tfidf_matrix(sentiment_tfidf_docs, sentiment_vocabulary, sentiment_word2idx)
print(f"Sentiment TF-IDF Matrix: {sentiment_tfidf_matrix.shape}")

## 14. Prepare Sentiment Data for Training

In [None]:
X_sent = sentiment_tfidf_matrix
y_sent = np.array(sentiment_labels)

X_sent_train, X_sent_test, y_sent_train, y_sent_test = train_test_split(
    X_sent, y_sent, test_size=0.2, random_state=42, stratify=y_sent
)

print(f"Sentiment Training set: {X_sent_train.shape[0]} samples")
print(f"Sentiment Test set: {X_sent_test.shape[0]} samples")
print(f"N-gram features: {X_sent_train.shape[1]}")
print(f"\nSentiment distribution (train):")
print(pd.Series(y_sent_train).value_counts())
print(f"\nSentiment distribution (test):")
print(pd.Series(y_sent_test).value_counts())

## 15. Train Sentiment Classification Models with N-grams

In [None]:
# Naive Bayes for Sentiment
sent_nb_model = MultinomialNB()
sent_nb_model.fit(X_sent_train, y_sent_train)
sent_nb_train_pred = sent_nb_model.predict(X_sent_train)
sent_nb_test_pred = sent_nb_model.predict(X_sent_test)
print("Sentiment Naive Bayes completed with n-grams")

# Linear SVM for Sentiment
sent_svm_model = LinearSVC(C=1.0, random_state=42, max_iter=1000)
sent_svm_model.fit(X_sent_train, y_sent_train)
sent_svm_train_pred = sent_svm_model.predict(X_sent_train)
sent_svm_test_pred = sent_svm_model.predict(X_sent_test)
print("Sentiment Linear SVM completed with n-grams")

# Logistic Regression for Sentiment
sent_lr_model = LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial')
sent_lr_model.fit(X_sent_train, y_sent_train)
sent_lr_train_pred = sent_lr_model.predict(X_sent_train)
sent_lr_test_pred = sent_lr_model.predict(X_sent_test)
print("Sentiment Logistic Regression completed with n-grams")

## 16. Evaluate Sentiment Models

In [None]:
# Evaluate Naive Bayes
sent_nb_train_metrics = evaluate_model(y_sent_train, sent_nb_train_pred, "Sentiment Naive Bayes (N-gram)", "Training")
sent_nb_test_metrics = evaluate_model(y_sent_test, sent_nb_test_pred, "Sentiment Naive Bayes (N-gram)", "Test")

# Evaluate Linear SVM
sent_svm_train_metrics = evaluate_model(y_sent_train, sent_svm_train_pred, "Sentiment Linear SVM (N-gram)", "Training")
sent_svm_test_metrics = evaluate_model(y_sent_test, sent_svm_test_pred, "Sentiment Linear SVM (N-gram)", "Test")

# Evaluate Logistic Regression
sent_lr_train_metrics = evaluate_model(y_sent_train, sent_lr_train_pred, "Sentiment Logistic Regression (N-gram)", "Training")
sent_lr_test_metrics = evaluate_model(y_sent_test, sent_lr_test_pred, "Sentiment Logistic Regression (N-gram)", "Test")

In [None]:
# Confusion matrices for sentiment models
sent_classes = sorted(list(set(y_sent_test)))

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Naive Bayes confusion matrix
sent_nb_cm = confusion_matrix(y_sent_test, sent_nb_test_pred)
sns.heatmap(sent_nb_cm, annot=True, fmt='d', cmap='Greens', ax=axes[0],
            xticklabels=sent_classes, yticklabels=sent_classes)
axes[0].set_title('Sentiment: Naive Bayes (N-gram)', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')

# SVM confusion matrix
sent_svm_cm = confusion_matrix(y_sent_test, sent_svm_test_pred)
sns.heatmap(sent_svm_cm, annot=True, fmt='d', cmap='Oranges', ax=axes[1],
            xticklabels=sent_classes, yticklabels=sent_classes)
axes[1].set_title('Sentiment: Linear SVM (N-gram)', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('True')

# Logistic Regression confusion matrix
sent_lr_cm = confusion_matrix(y_sent_test, sent_lr_test_pred)
sns.heatmap(sent_lr_cm, annot=True, fmt='d', cmap='Blues', ax=axes[2],
            xticklabels=sent_classes, yticklabels=sent_classes)
axes[2].set_title('Sentiment: Logistic Regression (N-gram)', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Predicted')
axes[2].set_ylabel('True')

plt.tight_layout()
plt.show()

## 17. Sentiment Model Comparison

In [None]:
sent_comparison_df = pd.DataFrame({
    'Model': ['Naive Bayes', 'Linear SVM', 'Logistic Regression'],
    'Train Accuracy': [sent_nb_train_metrics['accuracy'], sent_svm_train_metrics['accuracy'], sent_lr_train_metrics['accuracy']],
    'Test Accuracy': [sent_nb_test_metrics['accuracy'], sent_svm_test_metrics['accuracy'], sent_lr_test_metrics['accuracy']],
    'Test Precision': [sent_nb_test_metrics['precision'], sent_svm_test_metrics['precision'], sent_lr_test_metrics['precision']],
    'Test Recall': [sent_nb_test_metrics['recall'], sent_svm_test_metrics['recall'], sent_lr_test_metrics['recall']],
    'Test F1-Score': [sent_nb_test_metrics['f1'], sent_svm_test_metrics['f1'], sent_lr_test_metrics['f1']],
    'N-gram Range': [str(NGRAM_RANGE)] * 3,
    'Feature Count': [len(sentiment_vocabulary)] * 3
})

print("\n" + "="*80)
print("SENTIMENT MODEL COMPARISON (WITH N-GRAMS)")
print("="*80)
print(sent_comparison_df.to_string(index=False))
print("="*80)

In [None]:
# Visualize sentiment model comparison
metrics = ['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1-Score']
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width, 
                [sent_nb_test_metrics['accuracy'], sent_nb_test_metrics['precision'], 
                 sent_nb_test_metrics['recall'], sent_nb_test_metrics['f1']], 
                width, label='Naive Bayes', color='lightgreen')
rects2 = ax.bar(x, 
                [sent_svm_test_metrics['accuracy'], sent_svm_test_metrics['precision'], 
                 sent_svm_test_metrics['recall'], sent_svm_test_metrics['f1']], 
                width, label='Linear SVM', color='orange')
rects3 = ax.bar(x + width, 
                [sent_lr_test_metrics['accuracy'], sent_lr_test_metrics['precision'], 
                 sent_lr_test_metrics['recall'], sent_lr_test_metrics['f1']], 
                width, label='Logistic Regression', color='skyblue')

ax.set_ylabel('Score', fontsize=12)
ax.set_title(f'Sentiment Model Performance Comparison (N-gram={NGRAM_RANGE})', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim(0, 1.1)

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

plt.tight_layout()
plt.show()

## 18. Save Sentiment Models and Vectorizer

In [None]:
# Determine best sentiment model
sent_best_model_name = 'Naive Bayes'
sent_best_f1 = sent_nb_test_metrics['f1']
sent_best_model = sent_nb_model

if sent_svm_test_metrics['f1'] > sent_best_f1:
    sent_best_model_name = 'Linear SVM'
    sent_best_f1 = sent_svm_test_metrics['f1']
    sent_best_model = sent_svm_model
    
if sent_lr_test_metrics['f1'] > sent_best_f1:
    sent_best_model_name = 'Logistic Regression'
    sent_best_f1 = sent_lr_test_metrics['f1']
    sent_best_model = sent_lr_model

print(f"\nBest Sentiment Model: {sent_best_model_name} (F1: {sent_best_f1:.4f})")

# Save sentiment models with n-gram suffix
with open('models/sentiment_naive_bayes_ngram.pkl', 'wb') as f:
    pickle.dump(sent_nb_model, f)
    
with open('models/sentiment_svm_ngram.pkl', 'wb') as f:
    pickle.dump(sent_svm_model, f)
    
with open('models/sentiment_logistic_ngram.pkl', 'wb') as f:
    pickle.dump(sent_lr_model, f)

# Save best sentiment model
with open('models/sentiment_model_ngram.pkl', 'wb') as f:
    pickle.dump(sent_best_model, f)

# Save sentiment vectorizer with n-gram range
sentiment_vectorizer_data = {
    'vocabulary': sentiment_vocabulary,
    'word2idx': sentiment_word2idx,
    'idf_dict': sentiment_idf_dict,
    'ngram_range': NGRAM_RANGE
}
with open('models/sentiment_vectorizer_ngram.pkl', 'wb') as f:
    pickle.dump(sentiment_vectorizer_data, f)

# Save sentiment comparison
sent_comparison_df.to_csv('output/sentiment_model_comparison_ngram.csv', index=False)

# Save sentiment evaluation reports
sent_reports = {
    'naive_bayes': {
        'train_metrics': sent_nb_train_metrics,
        'test_metrics': sent_nb_test_metrics,
        'classification_report': classification_report(y_sent_test, sent_nb_test_pred, output_dict=True, zero_division=0)
    },
    'svm': {
        'train_metrics': sent_svm_train_metrics,
        'test_metrics': sent_svm_test_metrics,
        'classification_report': classification_report(y_sent_test, sent_svm_test_pred, output_dict=True, zero_division=0)
    },
    'logistic': {
        'train_metrics': sent_lr_train_metrics,
        'test_metrics': sent_lr_test_metrics,
        'classification_report': classification_report(y_sent_test, sent_lr_test_pred, output_dict=True, zero_division=0)
    }
}

with open('reports/sentiment_naive_bayes_ngram_report.json', 'w') as f:
    json.dump(sent_reports['naive_bayes'], f, indent=2)
    
with open('reports/sentiment_svm_ngram_report.json', 'w') as f:
    json.dump(sent_reports['svm'], f, indent=2)
    
with open('reports/sentiment_logistic_ngram_report.json', 'w') as f:
    json.dump(sent_reports['logistic'], f, indent=2)

print("тЬУ All sentiment models saved to models/ directory (with _ngram suffix)")
print("тЬУ Sentiment vectorizer saved to models/sentiment_vectorizer_ngram.pkl")
print("тЬУ Sentiment reports saved to reports/ directory")
print(f"тЬУ Best sentiment model ({sent_best_model_name}) saved as models/sentiment_model_ngram.pkl")
print(f"тЬУ N-gram range: {NGRAM_RANGE}")

## 19. Final Summary: N-gram TF-IDF Pipeline

In [None]:
print("\n" + "="*80)
print("COMPLETE N-GRAM TF-IDF PIPELINE SUMMARY")
print("="*80)

print("\nЁЯУК CATEGORY CLASSIFICATION (with N-grams):")
print(f"  N-gram Range: {NGRAM_RANGE}")
print(f"  Dataset: {len(documents)} documents")
print(f"  Feature Count: {len(vocabulary):,} n-grams")
print(f"  Train/Test Split: {X_train.shape[0]}/{X_test.shape[0]}")
print(f"  Best Model: {best_model_name}")
print(f"  Best F1-Score: {best_f1:.4f}")

print("\nЁЯТн SENTIMENT CLASSIFICATION (with N-grams):")
print(f"  N-gram Range: {NGRAM_RANGE}")
print(f"  Dataset: {len(sentiment_documents)} documents")
print(f"  Feature Count: {len(sentiment_vocabulary):,} n-grams")
print(f"  Train/Test Split: {X_sent_train.shape[0]}/{X_sent_test.shape[0]}")
print(f"  Best Model: {sent_best_model_name}")
print(f"  Best F1-Score: {sent_best_f1:.4f}")

print("\nЁЯФС KEY ADVANTAGES OF N-GRAMS:")
print(f"  тЬУ Captures word sequences and context")
print(f"  тЬУ More discriminative features than unigrams alone")
print(f"  тЬУ Better understanding of phrases (e.g., 'роЗро▓роЩрпНроХрпИ роЕро░роЪрпБ')")
print(f"  тЬУ Improved classification performance")
print(f"  тЬУ Adjustable ngram_range parameter for flexibility")

print("\nЁЯТ╛ SAVED ARTIFACTS:")
print("  Category Models:")
print("    - models/category_naive_bayes_ngram.pkl")
print("    - models/category_svm_ngram.pkl")
print("    - models/category_logistic_ngram.pkl")
print("    - models/category_model_ngram.pkl (best)")
print("    - models/category_vectorizer_ngram.pkl")
print("\n  Sentiment Models:")
print("    - models/sentiment_naive_bayes_ngram.pkl")
print("    - models/sentiment_svm_ngram.pkl")
print("    - models/sentiment_logistic_ngram.pkl")
print("    - models/sentiment_model_ngram.pkl (best)")
print("    - models/sentiment_vectorizer_ngram.pkl")
print("\n  Reports:")
print("    - output/model_comparison_ngram.csv")
print("    - output/sentiment_model_comparison_ngram.csv")
print("    - reports/*_ngram_report.json")

print("\n" + "="*80)
print("тЬЕ N-GRAM TF-IDF PIPELINE COMPLETED SUCCESSFULLY!")
print("="*80)