In [2]:
import pandas as pd
# Load the dataset into `df`. The path is relative, so the CSV must sit in the
# notebook's working directory. Later cells read the "cleaned_text" and "label"
# columns from this frame.
df = pd.read_csv(filepath_or_buffer='merged_cleaned_dataset_balanced.csv')

## Vectorisation of text

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF representation of the cleaned text, capped at 100 vocabulary
# terms (max_features keeps the terms with the highest corpus frequency).
# NOTE(review): the vectoriser is fitted on every row of `df`, i.e. before the
# train/test split performed in a later cell, so held-out documents contribute
# to the vocabulary and IDF weights. Consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=100)

# astype(str) guards against non-string cells (e.g. NaN) reaching the tokenizer.
X = tfidf.fit_transform(df.loc[:, "cleaned_text"].astype(str))

print("TF-IDF Shape:", X.shape)

# Dense copy of the sparse matrix, labelled with the learned vocabulary terms,
# used here only to preview the first rows.
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=X.toarray(), columns=feature_names)

print(tfidf_df.head(5))


TF-IDF Shape: (93552, 100)
   add  amp  article   as  attack  back  believe     bitch  bla     black  \
0  0.0  0.0      0.0  0.0     0.0   0.0      0.0  0.399436  0.0  0.814838   
1  0.0  0.0      0.0  0.0     0.0   0.0      0.0  0.000000  0.0  0.000000   
2  0.0  0.0      0.0  0.0     0.0   0.0      0.0  0.000000  0.0  0.000000   
3  0.0  0.0      0.0  0.0     0.0   0.0      0.0  0.000000  0.0  0.000000   
4  0.0  0.0      0.0  0.0     0.0   0.0      0.0  0.000000  0.0  0.000000   

   ...  user  white  wiki  wikipedia     woman  word  work  world  wrong  year  
0  ...   0.0    0.0   0.0        0.0  0.000000   0.0   0.0    0.0    0.0   0.0  
1  ...   0.0    0.0   0.0        0.0  0.000000   0.0   0.0    0.0    0.0   0.0  
2  ...   0.0    0.0   0.0        0.0  0.542115   0.0   0.0    0.0    0.0   0.0  
3  ...   0.0    0.0   0.0        0.0  0.441293   0.0   0.0    0.0    0.0   0.0  
4  ...   0.0    0.0   0.0        0.0  0.000000   0.0   0.0    0.0    0.0   0.0  

[5 rows x 100 columns]


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [5]:
# Target vector: the "label" column of the loaded dataset.
y = df["label"]

# 🔹 Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# (and therefore the partition) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# np.bincount reports how many samples carry each non-negative integer label
# (position i holds the count of label i), giving a quick class-balance check.
print("Train class distribution:", np.bincount(y_train))
print("Test class distribution:", np.bincount(y_test))


Train class distribution: [38683 36158]
Test class distribution: [9647 9064]


In [6]:

# NOTE(review): this repeats the split from the previous cell with identical
# arguments and the same seed, so it rebinds the four names to the same
# partition; the cell looks redundant.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training various traditional models

In [7]:
# Logistic-regression baseline on the TF-IDF features.
# max_iter is raised to 1000 to give the lbfgs optimiser room to converge.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000, solver="lbfgs").fit(X_train, y_train)

# Score on the held-out split.
y_pred = model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
# target_names are applied to the sorted class labels in order
# (presumably 0 -> "Absent", 1 -> "Present"; confirm against the label encoding).
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred, target_names=["Absent", "Present"]),
)


Logistic Regression Accuracy: 0.855432633210411
Classification Report:
               precision    recall  f1-score   support

      Absent       0.82      0.93      0.87      9647
     Present       0.91      0.78      0.84      9064

    accuracy                           0.86     18711
   macro avg       0.86      0.85      0.85     18711
weighted avg       0.86      0.86      0.85     18711



In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random-forest baseline: 50 trees, seeded so the bootstrap samples and feature
# choices are reproducible. fit() returns the estimator, so it is chained here.
rf_model = RandomForestClassifier(random_state=42, n_estimators=50).fit(X_train, y_train)

# Score on the held-out split.
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
# target_names are applied to the sorted class labels in order
# (presumably 0 -> "Absent", 1 -> "Present"; confirm against the label encoding).
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_rf, target_names=["Absent", "Present"]),
)


Random Forest Accuracy: 0.8619528619528619
Classification Report:
               precision    recall  f1-score   support

      Absent       0.82      0.94      0.88      9647
     Present       0.92      0.78      0.85      9064

    accuracy                           0.86     18711
   macro avg       0.87      0.86      0.86     18711
weighted avg       0.87      0.86      0.86     18711



In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Linear-kernel SVM baseline on the same TF-IDF split as the other models.
# Only the kernel is set explicitly:
#   * decision_function_shape stays at its default ("ovr"); scikit-learn
#     ignores it for binary targets, which is what this notebook trains on.
#   * random_state is omitted because SVC only consults it when
#     probability=True (it seeds the shuffling for probability estimates).
#   * verbose is left off: inside a notebook it only emits a "[LibSVM]" tag
#     that runs straight into the first printed line of the cell output.
# None of these affect the fitted model or its predictions.
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_svm = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# target_names are applied to the sorted class labels in order
# (presumably 0 -> "Absent", 1 -> "Present"; confirm against the label encoding).
print("Classification Report:\n", classification_report(y_test, y_pred_svm, target_names=["Absent", "Present"]))


[LibSVM]SVM Accuracy: 0.8606701940035273
Classification Report:
               precision    recall  f1-score   support

      Absent       0.82      0.94      0.87      9647
     Present       0.92      0.78      0.84      9064

    accuracy                           0.86     18711
   macro avg       0.87      0.86      0.86     18711
weighted avg       0.87      0.86      0.86     18711

