In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [3]:
# Load the prepared dataset from the project's data directory.
df = pd.read_csv(filepath_or_buffer="../data/final_dataset.csv")

# Print (rows, columns), then preview the first few records.
print(df.shape)
df.head()


(9418, 2)


Unnamed: 0,text,label
0,konrad lorenz won the nobel prize,1
1,"She had everything she ever wanted, but she wa...",0
2,new york city has way fewer international arri...,1
3,The online gaming industry is a rapidly growin...,0
4,george washington carver invented peanut butter,1


In [4]:
# Inputs are the "text" column; targets are the "label" column.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 7534
Test size: 1884


In [5]:
# TF-IDF over unigrams and bigrams, dropping English stop words and keeping
# at most 20,000 features.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)

# Fit on the training texts only, then reuse the fitted vectorizer for the test
# texts so nothing from the held-out set influences the vocabulary or weights.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Display (n_train_samples, n_features) as the cell output.
X_train_tfidf.shape


(7534, 20000)

In [6]:
# Baseline 1: logistic regression on the TF-IDF features.
# class_weight="balanced" reweights the classes inversely to their frequency,
# and max_iter is raised so the solver has room to converge.
# n_jobs is intentionally not passed: LogisticRegression only parallelises
# across classes in one-vs-rest mode, so it does nothing for this two-class
# target, and recent scikit-learn releases appear to deprecate it (TODO confirm version).
lr = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)

lr.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test split.
y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression Results:\n")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Results:

              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1166
           1       0.61      0.70      0.65       718

    accuracy                           0.72      1884
   macro avg       0.71      0.71      0.71      1884
weighted avg       0.73      0.72      0.72      1884



In [7]:
# Baseline 2: random forest on the same TF-IDF features.
# 200 trees capped at depth 30, balanced class weights, all CPU cores,
# and a fixed seed so the fitted forest is repeatable.
rf = RandomForestClassifier(
    class_weight="balanced", max_depth=30, n_estimators=200, n_jobs=-1, random_state=42
).fit(X_train_tfidf, y_train)

# Score the forest on the held-out test split.
y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest Results:\n")
print(classification_report(y_test, y_pred_rf))


Random Forest Results:

              precision    recall  f1-score   support

           0       0.82      0.58      0.68      1166
           1       0.54      0.80      0.64       718

    accuracy                           0.66      1884
   macro avg       0.68      0.69      0.66      1884
weighted avg       0.71      0.66      0.66      1884



Observations:

Logistic Regression gives the better overall balance (accuracy 0.72, macro F1 0.71, class-1 recall 0.70) and remains interpretable.

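Random Forest captures non-linear patterns but is heavier; here it reaches higher class-1 recall (0.80) at the cost of lower class-1 precision (0.54) and lower accuracy (0.66).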

Both models serve as baselines before deep learning.

In [8]:
# Persist both classifiers together with the fitted vectorizer; the vectorizer
# is needed to transform new text the same way before calling either model.
MODEL_DIR = Path("../trained_models")

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise this cell fails on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(lr, MODEL_DIR / "logreg.pkl")
joblib.dump(rf, MODEL_DIR / "rf.pkl")
joblib.dump(tfidf, MODEL_DIR / "tfidf.pkl")

print("Models saved")


Models saved
