In [6]:
import re
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib
import os


In [7]:
train_path = "data/train.parquet"
test_path  = "data/test.parquet"

train_df = pd.read_parquet(train_path)
test_df  = pd.read_parquet(test_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

train_df = train_df.dropna(subset=["text", "label"]).copy()
test_df  = test_df.dropna(subset=["text", "label"]).copy()

print("After dropna:")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (2936206, 8)
Test shape: (734039, 8)
After dropna:
Train shape: (2936206, 8)
Test shape: (734039, 8)


In [8]:
print("Unique labels:", train_df["label"].nunique())
display(train_df["label"].value_counts())


Unique labels: 3


label
Neutral            1507697
Slightly Biased     831621
Highly Biased       596888
Name: count, dtype: int64

In [9]:
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)   # collapse whitespace
    return text

X_train = train_df["text"].astype(str).map(clean_text)
y_train = train_df["label"]

X_test  = test_df["text"].astype(str).map(clean_text)
y_test  = test_df["label"]

print("Train samples:", len(X_train))
print("Test samples:", len(X_test))
print("Example text:", X_train.iloc[0][:200])


Train samples: 2936206
Test samples: 734039
Example text: http://twitpic.com/6996z - Todays is Katie's birthday. This is what dad got her...(for the weekend


In [10]:
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 1),   # ✅ unigrams first (fast)
        max_features=30000,   # ✅ reduce features for speed
        min_df=5              # ✅ ignore rare words
    )),
    ("clf", LogisticRegression(
        solver="saga",        # ✅ supports multiclass + sparse + can be faster
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1,
        verbose=1
    ))
])

print(model)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=30000, min_df=5,
                                 stop_words='english')),
                ('clf',
                 LogisticRegression(class_weight='balanced', max_iter=1000,
                                    n_jobs=-1, solver='saga', verbose=1))])


In [11]:
print("Starting training...")
model.fit(X_train, y_train)
print("Training done!")


Starting training...




Epoch 1, change: 1
Epoch 2, change: 0.18550032
Epoch 3, change: 0.13342202
Epoch 4, change: 0.080878038
Epoch 5, change: 0.062664543
Epoch 6, change: 0.05670166
Epoch 7, change: 0.044103118
Epoch 8, change: 0.035301526
Epoch 9, change: 0.020614973
Epoch 10, change: 0.037809432
Epoch 11, change: 0.018289194
Epoch 12, change: 0.010621955
Epoch 13, change: 0.010203473
Epoch 14, change: 0.010101023
Epoch 15, change: 0.0062356083
Epoch 16, change: 0.0089830536
Epoch 17, change: 0.0060612349
Epoch 18, change: 0.0026581263
Epoch 19, change: 0.0017270819
Epoch 20, change: 0.0012780312
Epoch 21, change: 0.00097545237
Epoch 22, change: 0.00083280889
Epoch 23, change: 0.00054937167
Epoch 24, change: 0.00028750912
Epoch 25, change: 0.00041992446
Epoch 26, change: 0.00021588791
Epoch 27, change: 0.0002226491
Epoch 28, change: 0.00024482373
Epoch 29, change: 0.00011933025
convergence after 30 epochs took 83 seconds
Training done!


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7958078521713424
                 precision    recall  f1-score   support

  Highly Biased       0.75      0.82      0.78    148777
        Neutral       0.90      0.86      0.88    377055
Slightly Biased       0.65      0.66      0.66    208207

       accuracy                           0.80    734039
      macro avg       0.77      0.78      0.77    734039
   weighted avg       0.80      0.80      0.80    734039



In [14]:
joblib.dump(model, "models/tfidf_logreg.joblib")
print("Model saved.")


Model saved.
