In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IIT Madras BS: Machine Learning Project (MLP)
## Comment Category Prediction Challenge
**Student:** Muhammad Bilal | **Roll Number:** 23F3001344

---
### **Project Roadmap & Methodology**
This notebook implements an end-to-end ML pipeline following the official project milestones:
* **M1:** Advanced EDA, handling 73% missing data, and Baseline Logistic Regression.
* **M2:** Stochastic Gradient Descent (SGD) with Hyperparameter Tuning.
* **M3:** Dimensionality Reduction (SVD) and Diversified Models (KNN, Linear SVM, Naive Bayes).
* **M4:** Advanced Boosting (XGBoost, LightGBM), Multi-Layer Perceptron (MLP), and Tuned Ensembling.
* **M5:** Final Insights, Error Analysis, and Leaderboard Optimization.

In [None]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from scipy.sparse import hstack
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides solver convergence and deprecation
# warnings raised by the models below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to all figures drawn afterwards.
plt.style.use('ggplot')

In [None]:
# 1. Load Data
# Competition files: labelled training rows, unlabelled test rows and the submission template.
train = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/train.csv')
test = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/test.csv')
sample_sub = pd.read_csv('/kaggle/input/comment-category-prediction-challenge/Sample.csv')

# 2. Preprocessing & Imputation
# Missing comments become the literal token "missing" so the vectorizers always receive
# strings; missing race / religion / gender values are flagged with the sentinel -1.
# The same rules are applied to both frames in one loop so they cannot drift apart.
for frame in (train, test):
    frame['comment'] = frame['comment'].fillna("missing")
    for col in ['race', 'religion', 'gender']:
        frame[col] = frame[col].fillna(-1)

# 3. Visualization: Label Distribution
plt.figure(figsize=(10, 5))
sns.countplot(x='label', data=train, palette='magma')
plt.title('M1: Class Distribution (Checking for Imbalance)')
plt.show()

# 4. Statistical Text Analysis (Chi-Square)
# One-vs-rest chi-square test per category: every TF-IDF term is scored against the
# boolean target "row belongs to this label". `chi2` is imported in the notebook's
# import cell together with the other dependencies.
tfidf_eda = TfidfVectorizer(max_features=2000, stop_words='english')
X_eda = tfidf_eda.fit_transform(train['comment'])
features = tfidf_eda.get_feature_names_out()
for label in sorted(train['label'].unique()):
    chi2score = chi2(X_eda, train['label'] == label)[0]
    # argsort is ascending, so the last five indices are the five highest-scoring terms
    # (printed from the weakest to the strongest of those five).
    top_indices = np.argsort(chi2score)[-5:]
    print(f"Category {label} Keywords: {[features[i] for i in top_indices]}")

In [None]:
# Feature Engineering: word-level TF-IDF over unigrams and bigrams, capped at 5000 terms.
# NOTE(review): the vocabulary and IDF weights are learned from all training rows before
# the split below, so validation text influences the features — confirm this is acceptable
# for the scores reported on the validation split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=5000)
X_tfidf = tfidf.fit_transform(train['comment'])
X_test_tfidf = tfidf.transform(test['comment'])

# Stratified 80/20 train-validation split (fixed seed for repeatability)
y_all = train['label']
X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf,
    y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=42,
)

# Baseline: class-weighted logistic regression, scored on the validation split
baseline = LogisticRegression(class_weight='balanced', max_iter=1000)
baseline.fit(X_train, y_train)
baseline_val_preds = baseline.predict(X_val)
print(f"M1 Baseline Accuracy: {accuracy_score(y_val, baseline_val_preds):.4f}")

In [None]:
# SGD with Hyperparameter Tuning
# Hinge-loss linear classifier trained with SGD; only the regularisation strength
# `alpha` is tuned, using 3-fold cross-validation scored by macro-F1.
sgd = SGDClassifier(loss='hinge', class_weight='balanced', random_state=42)
param_grid_sgd = {'alpha': [0.0001, 0.001, 0.01]}
# n_iter equals the number of candidate values, so every alpha is evaluated; seeding the
# search makes the candidate sampling order repeatable from run to run.
grid_sgd = RandomizedSearchCV(
    sgd,
    param_grid_sgd,
    n_iter=3,
    cv=3,
    scoring='f1_macro',
    random_state=42,
)
grid_sgd.fit(X_train, y_train)

print(f"Best SGD Params: {grid_sgd.best_params_}")
# Score the refit best estimator on the hold-out split so this milestone reports a
# validation accuracy comparable with the M1 and M3 cells.
print(f"M2 Tuned SGD Accuracy: {accuracy_score(y_val, grid_sgd.predict(X_val)):.4f}")

In [None]:
# MILESTONE 3: Dimensionality Reduction & Diversified Models
# 1. Dimensionality Reduction (SVD)
# Project the sparse TF-IDF matrix onto 100 dense components; the projection is learned
# on the training split only and then applied to the validation split.
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_val_svd = svd.transform(X_val)

# 2. Multinomial Naive Bayes (Requirement)
# Note: NB works directly on TF-IDF (must be non-negative)
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
print(f"M3 Naive Bayes Accuracy: {accuracy_score(y_val, nb_model.predict(X_val)):.4f}")

# 3. K-Nearest Neighbors (Requirement)
# KNN is trained on the SVD-reduced features, so it must also be scored on the
# SVD-reduced validation matrix (X_val_svd), not on the raw TF-IDF one.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_svd, y_train)
print(f"M3 KNN Accuracy: {accuracy_score(y_val, knn.predict(X_val_svd)):.4f}")

# 4. Support Vector Machine (Requirement)
svm = LinearSVC(class_weight='balanced', random_state=42).fit(X_train, y_train)

print(f"M3 SVM Accuracy: {accuracy_score(y_val, svm.predict(X_val)):.4f}")

In [None]:
# --- NEW CELL 7: Advanced Feature Engineering ---
from scipy.sparse import hstack

# 1. Extract Meta Features (Capital letters, Punctuation, Length)
def _meta_frame(df):
    """Return the unscaled meta features of `df['comment']` as a DataFrame.

    Columns, one row per comment and indexed like `df`:
      'len'  - number of characters in the comment
      'caps' - number of characters matching [A-Z]
      'punc' - number of '!', '?' and '.' characters
    """
    comments = df['comment'].astype(str)
    return pd.DataFrame(
        {
            'len': comments.str.len(),
            'caps': comments.str.count(r'[A-Z]'),
            'punc': comments.str.count(r'[!?.]'),
        },
        index=df.index,
    )


def get_meta(df, scaler=None):
    """Build max-abs-scaled meta features for the comments in `df`.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'comment' column.
    scaler : fitted scaler with a `.transform` method, optional
        When given, its already-learned scale is applied, so different data sets
        (train / test) end up on the same scale. When omitted, a new MaxAbsScaler is
        fitted on `df` itself, which is the original single-argument behaviour.

    Returns
    -------
    Array with one row per comment and three columns (len, caps, punc).
    """
    meta = _meta_frame(df)
    if scaler is None:
        return MaxAbsScaler().fit_transform(meta)
    return scaler.transform(meta)


# 2. Advanced Text Vectorization (Words + Characters)
# Character n-grams catch slang and misspellings
# NOTE(review): as with the earlier TF-IDF cell, the vectorizers are fitted on all training
# rows before the validation split is made — confirm this is acceptable for validation scores.
vec = FeatureUnion([
    ('word', TfidfVectorizer(ngram_range=(1, 2), max_features=10000, sublinear_tf=True)),
    ('char', TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=10000, sublinear_tf=True))
])

X_text_adv = vec.fit_transform(train['comment'])
X_test_adv = vec.transform(test['comment'])

# 3. Combine everything
# Learn the meta-feature scale once, from the training rows, and reuse it for the test rows.
# Fitting a separate scaler on the test set would divide by the test-set maxima and put the
# test features on a different scale from the one the models are trained on.
meta_scaler = MaxAbsScaler().fit(_meta_frame(train))
# CSR output keeps the stacked matrices row-sliceable for the split and the models.
X_total = hstack([X_text_adv, get_meta(train, meta_scaler)], format='csr')
X_test_total = hstack([X_test_adv, get_meta(test, meta_scaler)], format='csr')

X_tr, X_va, y_tr, y_va = train_test_split(X_total, train['label'], test_size=0.2, stratify=train['label'], random_state=42)

In [None]:
# --- NEW CELL 8: Training the Power Models ---

# Wrapping the linear SVM in a 3-fold calibrator gives it probability outputs,
# which the soft-voting ensemble in the next cell requires.
svm_base = LinearSVC(class_weight='balanced', random_state=42)
cal_svm = CalibratedClassifierCV(svm_base, cv=3).fit(X_tr, y_tr)

# Gradient-boosted trees: XGBoost (depth-6 trees) and LightGBM (balanced class weights),
# both with 150 estimators and a 0.1 learning rate.
xgb_pwr = XGBClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=6, random_state=42
).fit(X_tr, y_tr)

lgbm_pwr = LGBMClassifier(
    n_estimators=150, learning_rate=0.1, class_weight='balanced', random_state=42
).fit(X_tr, y_tr)

# Validation accuracy of each individual model, reported after all three are trained.
for tag, fitted_model in (("SVM", cal_svm), ("XGB", xgb_pwr), ("LGBM", lgbm_pwr)):
    print(f"{tag} Accuracy: {accuracy_score(y_va, fitted_model.predict(X_va)):.4f}")

In [None]:
# --- NEW CELL 9: The Soft-Voting Ensemble ---
# Soft voting combines the members' predicted class probabilities instead of their hard
# labels. Fitting the VotingClassifier trains its own copies of the three estimators on
# X_tr / y_tr.
voting_members = [('svm', cal_svm), ('xgb', xgb_pwr), ('lgbm', lgbm_pwr)]
ensemble_final = VotingClassifier(estimators=voting_members, voting='soft')
ensemble_final.fit(X_tr, y_tr)

ensemble_val_preds = ensemble_final.predict(X_va)
print(f"Final Ensemble Validation Accuracy: {accuracy_score(y_va, ensemble_val_preds):.4f}")

In [None]:
# --- UPDATED CELL 10 ---
# Predict the test set with the soft-voting ensemble on the combined ("Total") test features.
final_preds = ensemble_final.predict(X_test_total)

# IDs are taken from the sample submission file and paired positionally with the predictions.
# NOTE(review): this assumes Sample.csv lists rows in the same order as test.csv — confirm.
submission = sample_sub[['ID']].assign(label=final_preds)

submission.to_csv('booster_submission.csv', index=False)
print("Booster Submission Saved! Ready for Leaderboard.")