In [7]:
# Mount Google Drive into the Colab filesystem so files under /content/drive
# can be read with ordinary paths.
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [9]:
# Location of the cleaned news CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/fake-news-classifier/data/processed/all_cleaned_news.csv"
df = pd.read_csv(DATA_PATH)

# Report the (rows, columns) shape, then preview the first records.
print(df.shape)
df.head()


(55138, 3)


Unnamed: 0,text,label,source
0,Says the Annies List political group supports ...,FAKE,liar
1,When did the decline of coal start? It started...,REAL,liar
2,"Hillary Clinton agrees with John McCain ""by vo...",REAL,liar
3,Health care reform legislation is likely to ma...,FAKE,liar
4,The economic turnaround started at the end of ...,REAL,liar


In [10]:
# Dimensions of the loaded frame, displayed as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(55138, 3)

In [11]:
# Print a concise summary of the frame: column names, non-null counts, dtypes
# and memory usage. `info` is a method and has to be called; the bare attribute
# `df.info` only echoes the bound-method repr instead of the summary.
df.info()

In [12]:
# Encode the string labels as integers for scikit-learn: FAKE -> 0, REAL -> 1.
label_map = {"FAKE": 0, "REAL": 1}
df["label_num"] = df["label"].map(label_map)

# Series.map leaves NaN for every value that is not a key of the mapping.
# Fail here, naming the offending labels, instead of later inside the
# stratified split or the model fit with a less helpful error.
unmapped_labels = df.loc[df["label_num"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels not covered by label_map: {sorted(map(str, unmapped_labels))}"
    )

In [13]:
# Hold out 20% of the rows as a test set. Stratifying on the numeric label keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
texts = df["text"]
labels = df["label_num"]
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)


In [14]:
# Fit the TF-IDF vocabulary and IDF weights on the training texts only, then
# reuse the fitted vectorizer to transform the test texts, so nothing from the
# test split influences the features.
vectorizer = TfidfVectorizer(
    stop_words="english",   # drop common English function words
    ngram_range=(1, 2),     # unigrams and bigrams
    max_features=5000,      # cap the vocabulary size
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [15]:
# Linear baseline on the TF-IDF features; fit() returns the estimator itself,
# so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out split.
# target_names follows the numeric label order (0 = FAKE, 1 = REAL).
print("Logistic Regression Results:")
lr_report = classification_report(y_test, y_pred_lr, target_names=["FAKE", "REAL"])
print(lr_report)


Logistic Regression Results:
              precision    recall  f1-score   support

        FAKE       0.91      0.93      0.92      5594
        REAL       0.92      0.90      0.91      5434

    accuracy                           0.91     11028
   macro avg       0.91      0.91      0.91     11028
weighted avg       0.91      0.91      0.91     11028



In [None]:
# Tree-ensemble model on the same TF-IDF features: 100 trees with a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

# Per-class precision / recall / F1 on the held-out split.
# target_names follows the numeric label order (0 = FAKE, 1 = REAL).
print("Random Forest Results:")
rf_report = classification_report(y_test, y_pred_rf, target_names=["FAKE", "REAL"])
print(rf_report)


In [None]:
def plot_cm(y_true, y_pred, model_name, class_names=("FAKE", "REAL")):
    """Draw an annotated confusion-matrix heatmap for one model's predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Text placed in front of "Confusion Matrix" in the title.
        class_names: Tick labels for both axes, given in the sorted order of
            the label values (the order ``confusion_matrix`` uses when no
            explicit ``labels`` are passed). Defaults to ``("FAKE", "REAL")``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    cm = confusion_matrix(y_true, y_pred)
    tick_labels = list(class_names)

    # Draw on a dedicated figure/axes instead of whatever pyplot currently
    # treats as the active axes, so repeated calls never paint over each other.
    _, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',          # integer counts in each cell
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"{model_name} Confusion Matrix")
    plt.show()

# One confusion matrix per trained model, evaluated against the same test labels.
for model_name, y_pred in [
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
]:
    plot_cm(y_test, y_pred, model_name)
