In [44]:
import pandas as pd
import numpy as np

In [45]:
data = "WELFake_Dataset.csv"
df = pd.read_csv(data)
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [46]:
df.shape


(72134, 4)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [48]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [49]:
df['label'].value_counts()


label
1    37106
0    35028
Name: count, dtype: int64

In [50]:
df["title"] = df["title"].fillna("")
df["text"] = df["text"].fillna("")
df["content"] = df["title"] + " " + df["text"]

In [51]:
df = df[df["content"].str.strip().astype(bool)].reset_index(drop=True)

In [52]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,Russians steal research on Trump in hack of U....
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,WATCH: Giuliani Demands That Democrats Apolog...
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,Migrants Refuse To Leave Train At Refugee Camp...
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,Trump tussle gives unpopular Mexican leader mu...


Preprocessing

In [53]:
import re
import string

def preprocess_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = " ".join(text.split())

    return text



In [54]:
df["clean"] = df["content"].apply(preprocess_text)

In [55]:
df[["content", "clean"]].head()

Unnamed: 0,content,clean
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforcement on high alert following threat...
1,Did they post their votes for Hillary already?,did they post their votes for hillary already
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbelievable obama’s attorney general says mos...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",bobby jindal raised hindu uses story of christ...
4,SATAN 2: Russia unvelis an image of its terrif...,satan russia unvelis an image of its terrifyin...


Feature extraction

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=10000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.95,
    sublinear_tf=True,
)

X = vectorizer.fit_transform(df["clean"])
y = df["label"]


In [57]:
df.shape

(72134, 6)

train-test split

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [59]:
X_train.shape[0]

57707

In [60]:
X_test.shape[0]

14427

Model training 


1. logistic regression

In [61]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1500, C=1.5, solver="saga", random_state=42, n_jobs=-1)

lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

In [62]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)



accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)

y_proba_lr = lr.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba_lr)

In [63]:
print(accuracy)
print(precision)
print(recall)
print(f1)
print(y_proba_lr)
print(roc_auc)


0.9598669161987939
0.9554046858359957
0.9671203341867672
0.9612268130985067
[0.85198397 0.53400918 0.0144927  ... 0.99315422 0.78197417 0.01112536]
0.9921777445039794


In [64]:
from sklearn.model_selection import cross_val_score

cv = cross_val_score(lr, X_train, y_train, cv=5, scoring="f1", n_jobs=-1)
print(cv.mean(), "±", cv.std())

0.9612029007802295 ± 0.0016710767704381654


In [65]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix)

print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=["Real", "Fake"]))

[[6671  335]
 [ 244 7177]]
              precision    recall  f1-score   support

        Real       0.96      0.95      0.96      7006
        Fake       0.96      0.97      0.96      7421

    accuracy                           0.96     14427
   macro avg       0.96      0.96      0.96     14427
weighted avg       0.96      0.96      0.96     14427



2. training Linear svm model

In [66]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svm = CalibratedClassifierCV(
    LinearSVC(C=1.0, max_iter=2000, random_state=42),
    cv=3
)

svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1:", f1_score(y_test, y_pred_svm))

y_proba_svm = svm.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba_svm))

Accuracy: 0.9654813890621751
Precision: 0.9622112431566298
Recall: 0.9710281633203073
F1: 0.9665995975855131
ROC-AUC: 0.9936403867045565


3. Training decision tree model

In [69]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=50, min_samples_split=5, min_samples_leaf=3, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

In [70]:
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1:", f1_score(y_test, y_pred_svm))

y_proba_svm = svm.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba_svm))

Accuracy: 0.9654813890621751
Precision: 0.9622112431566298
Recall: 0.9710281633203073
F1: 0.9665995975855131
ROC-AUC: 0.9936403867045565


4. training Random Forest model

In [72]:
from sklearn.ensemble import (RandomForestClassifier)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=80,
    min_samples_split=4,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

In [73]:
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1:", f1_score(y_test, y_pred_svm))

y_proba_svm = svm.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba_svm))

Accuracy: 0.9654813890621751
Precision: 0.9622112431566298
Recall: 0.9710281633203073
F1: 0.9665995975855131
ROC-AUC: 0.9936403867045565
