In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Load the cleaned CSV
df = pd.read_csv("cleaned_fake_news.csv")

#Drop rows with missing content
df = df.dropna(subset=["content"])

#Split into features and labels
X = df["content"]
y = df["label"]

#Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

#Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

#1. Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Logistic Regression Results:
Accuracy: 0.9905127625931782

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4284
           1       0.99      0.99      0.99      4570

    accuracy                           0.99      8854
   macro avg       0.99      0.99      0.99      8854
weighted avg       0.99      0.99      0.99      8854


Confusion Matrix:
 [[4254   30]
 [  54 4516]]


In [2]:
#2. Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB   
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
print("Multinomial Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, mnb_pred))
print(classification_report(y_test, mnb_pred))
print(confusion_matrix(y_test, mnb_pred))

Multinomial Naive Bayes Results:
Accuracy: 0.9447707250960018
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      4284
           1       0.96      0.94      0.95      4570

    accuracy                           0.94      8854
   macro avg       0.94      0.95      0.94      8854
weighted avg       0.94      0.94      0.94      8854

[[4084  200]
 [ 289 4281]]


In [4]:
#3. Support Vector Machine model
from sklearn.svm import SVC
svm = SVC(kernel='linear')  # Linear kernel preferred for text data
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print("Support Vector Machine Results:")
print("Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))
print(confusion_matrix(y_test, svm_pred))

Support Vector Machine Results:
Accuracy: 0.9950304946916648
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4284
           1       1.00      0.99      1.00      4570

    accuracy                           1.00      8854
   macro avg       0.99      1.00      1.00      8854
weighted avg       1.00      1.00      1.00      8854

[[4272   12]
 [  32 4538]]


In [5]:
#4. Random Forest model
from sklearn.ensemble import RandomForestClassifier 
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
print(confusion_matrix(y_test, rf_pred))

Random Forest Results:
Accuracy: 0.9985317370679919
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284
           1       1.00      1.00      1.00      4570

    accuracy                           1.00      8854
   macro avg       1.00      1.00      1.00      8854
weighted avg       1.00      1.00      1.00      8854

[[4281    3]
 [  10 4560]]
