Imports & Path Setup

In [16]:
import os, sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# 1. Setup paths
# The repository root is taken to be two directory levels above the current
# working directory. It is prepended to sys.path only when absent, so
# re-running this cell never adds a duplicate entry.
# NOTE(review): presumably this makes project-local packages importable — confirm.
two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
repo_root = os.path.abspath(two_levels_up)
if repo_root not in sys.path:
    sys.path[:0] = [repo_root]

In [17]:
# Locations of the processed train / validation CSV files, resolved against
# the repository root.
train_csv, val_csv = (
    os.path.join(repo_root, rel_path)
    for rel_path in ('data/processed/train.csv', 'data/processed/val.csv')
)

In [18]:
# 2. Load data
# Both CSVs must provide a 'comment_text' column (model input) and a
# 'toxic' column (target label).
train = pd.read_csv(train_csv)
val   = pd.read_csv(val_csv)

# read_csv parses empty text fields as NaN, and TfidfVectorizer cannot
# process NaN documents. Coerce the text column to str with missing
# comments replaced by '' so the vectorizer always receives strings.
X_train, y_train = train['comment_text'].fillna('').astype(str), train['toxic']
X_val,   y_val   = val  ['comment_text'].fillna('').astype(str), val  ['toxic']

In [19]:
# 3. Build pipeline with char-ngrams
# Stage 'tfidf' turns each comment into TF-IDF weights over character
# n-grams; stage 'clf' fits a class-weighted logistic regression on them.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,
        analyzer='char_wb',    # character n-grams built only inside word boundaries
        ngram_range=(3,5),     # character 3-grams up to 5-grams
        max_features=20000     # cap the vocabulary size
    )),
    ('clf', LogisticRegression(
        C=1.0,
        max_iter=1000,
        class_weight='balanced',   # reweight classes inversely to their frequency
        # n_jobs is intentionally omitted: it has no effect for a binary
        # target with the default solver and is deprecated in recent
        # scikit-learn releases.
        random_state=42            # only consulted by the sag/saga/liblinear solvers
    ))
])

In [20]:
# 4. Train & evaluate
# Fit the whole pipeline on the training split, predict the validation
# split, then print a per-class precision / recall / F1 report.
y_pred = pipe.fit(X_train, y_train).predict(X_val)
val_report = classification_report(y_val, y_pred, digits=4)
print("Char-ngram TF-IDF + LR report:\n", val_report, sep="\n")

Char-ngram TF-IDF + LR report:

              precision    recall  f1-score   support

           0     0.9867    0.9431    0.9644     28856
           1     0.6209    0.8797    0.7280      3059

    accuracy                         0.9370     31915
   macro avg     0.8038    0.9114    0.8462     31915
weighted avg     0.9516    0.9370    0.9417     31915

