## Text Classification with N-grams and Preprocessing
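This notebook classifies news headlines into four categories (COMEDY, SPORTS, CRIME, EDUCATION) using bag-of-n-gram counts and a Multinomial Naive Bayes classifier. It compares 1-gram, 1–2-gram, and 1–3-gram features on the raw headlines, then repeats the 1–2-gram model on headlines preprocessed with spaCy (lemmatized, with stop words and punctuation removed).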

In [None]:
# -----------------------------------
# Imports and Initial Setup
# -----------------------------------
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import spacy

# Load the spaCy English pipeline once at module level; `preprocess` (defined
# in a later cell) reuses this shared `nlp` object for every headline.
# The "en_core_web_sm" model package must already be installed, otherwise
# spacy.load fails (e.g. install with `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

In [None]:
# -----------------------------------
# Text Preprocessing Function
# -----------------------------------
def preprocess(text):
    """Reduce `text` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; every remaining
    token contributes its lemma, in original order.

    Args:
        text: The string to process (a single headline in this notebook).

    Returns:
        The kept lemmas joined by single spaces. Empty string if no token
        survives the filter.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no topical signal for the classifier.
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

In [None]:
# -----------------------------------
# Load and Prepare Dataset
# -----------------------------------
# The file is read in JSON Lines mode (lines=True: one JSON record per line);
# only the two columns used by the rest of the notebook are kept.
df = pd.read_json('News_Category_Dataset.json', lines=True)[['headline', 'category']]

# Select relevant categories. This list is the single source of truth for both
# the filtering here and the label encoding below.
selected_categories = ['COMEDY', 'SPORTS', 'CRIME', 'EDUCATION']
df_new = df[df['category'].isin(selected_categories)]

# Fail early with a readable message if a requested category has no rows;
# otherwise the per-category sampling below would fail on an empty frame
# with a much less helpful pandas error.
missing_categories = set(selected_categories) - set(df_new['category'])
if missing_categories:
    raise ValueError(
        f"Categories not found in dataset: {sorted(missing_categories)}"
    )

# Balance the dataset: downsample every category to the size of the smallest
# one. The fixed random_state makes the sample reproducible across runs.
min_samples = df_new['category'].value_counts().min()
df_balanced = pd.concat([
    df_new[df_new['category'] == cat].sample(min_samples, random_state=2022)
    for cat in selected_categories
])

# Encode labels as integers in the order of `selected_categories`
# (COMEDY=0, SPORTS=1, CRIME=2, EDUCATION=3). Deriving the map from the list
# keeps the two from drifting apart, which would silently yield NaN labels.
category_map = {cat: idx for idx, cat in enumerate(selected_categories)}
df_balanced['category_num'] = df_balanced['category'].map(category_map)

In [None]:
# -----------------------------------
# Original Text Classification (1-gram)
# -----------------------------------
# Stratified 80/20 split of the raw headlines; the fixed random_state makes
# the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['headline'],
    df_balanced['category_num'],
    test_size=0.2,
    random_state=2023,
    stratify=df_balanced['category_num'],
)

# Baseline model: unigram counts fed into multinomial Naive Bayes.
pipeline_bow = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 1))),
    ('classifier', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipeline_bow.fit(X_train, y_train).predict(X_test)
print(
    "=== Classification Report: 1-gram ===",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# -----------------------------------
# N-gram Model (1 to 2 grams)
# -----------------------------------
# Unigram + bigram counts fed into multinomial Naive Bayes.
# NOTE: this cell uses whatever X_train / X_test / y_train / y_test are
# currently bound in the kernel; when the notebook is run top to bottom that
# is the raw-headline split created in the 1-gram cell.
pipeline_bigrams = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipeline_bigrams.fit(X_train, y_train).predict(X_test)
print(
    "=== Classification Report: 1-2 grams ===",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# -----------------------------------
# N-gram Model (1 to 3 grams)
# -----------------------------------
# Unigram + bigram + trigram counts fed into multinomial Naive Bayes.
# NOTE: this cell uses whatever X_train / X_test / y_train / y_test are
# currently bound in the kernel; when the notebook is run top to bottom that
# is the raw-headline split created in the 1-gram cell.
pipeline_trigrams = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipeline_trigrams.fit(X_train, y_train).predict(X_test)
print(
    "=== Classification Report: 1-3 grams ===",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# -----------------------------------
# Preprocess Headlines
# -----------------------------------
# Run every headline through `preprocess` (lemmatize, drop stop words and
# punctuation) and store the result next to the raw text.
df_balanced['preprocessed_txt'] = df_balanced['headline'].map(preprocess)

# Train/Test Split on Preprocessed Data.
# Same test_size, random_state and stratify labels as the raw-headline split.
# NOTE: this rebinds X_train / X_test / y_train / y_test. Re-running one of
# the earlier n-gram cells after this point would train on preprocessed text,
# so re-run the notebook from the top rather than out of order.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_txt'],
    df_balanced['category_num'],
    test_size=0.2,
    random_state=2023,
    stratify=df_balanced['category_num'],
)

In [None]:
# -----------------------------------
# Classification on Preprocessed Text (1-2 grams)
# -----------------------------------
# Unigram + bigram counts fed into multinomial Naive Bayes, same architecture
# as the raw-text 1-2 gram model so the two reports can be compared.
# NOTE: this cell uses whatever X_train / X_test / y_train / y_test are
# currently bound in the kernel; when the notebook is run top to bottom that
# is the preprocessed-text split.
pipeline_preprocessed = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipeline_preprocessed.fit(X_train, y_train).predict(X_test)
print(
    "=== Classification Report: Preprocessed Text (1-2 grams) ===",
    classification_report(y_test, y_pred),
    sep="\n",
)