In [1]:
import pandas as pd

data = pd.read_csv('FakeNewsNet.csv')
print("Shape of the dataset: ", data.shape)
data.head()

Shape of the dataset:  (23196, 5)


Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [2]:
data.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [3]:
data = data.drop(["news_url", "tweet_num"], axis=1)
data['source_domain'] = data['source_domain'].fillna('Unknown')
data.isnull().sum()

title            0
source_domain    0
real             0
dtype: int64

In [4]:
data['real'].value_counts()

real
1    17441
0     5755
Name: count, dtype: int64

In [5]:
real_data = data[data['real'] == 1]
fake_data = data[data['real'] == 0]

real_sample = real_data.sample(n=len(fake_data), random_state=42)

balanced_data = pd.concat([real_sample, fake_data])
balanced_data.head()

Unnamed: 0,title,source_domain,real
4894,Fergie And Josh Duhamel Split After 8 Years Of...,www.huffingtonpost.com,1
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,www.hollywoodreporter.com,1
594,"Prince William Says He Is Still ""Working On"" a...",www.brides.com,1
28,WATCH: Kendall Jenner’s first-ever Adidas ad,www.channel24.co.za,1
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",www.msn.com,1


In [6]:
balanced_data['real'].value_counts()

real
1    5755
0    5755
Name: count, dtype: int64

In [7]:
balanced_data = balanced_data.drop(['source_domain'], axis=1)
balanced_data.head()

Unnamed: 0,title,real
4894,Fergie And Josh Duhamel Split After 8 Years Of...,1
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,1
594,"Prince William Says He Is Still ""Working On"" a...",1
28,WATCH: Kendall Jenner’s first-ever Adidas ad,1
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",1


In [8]:
import nltk, string

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/skakibahammed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# def transform_text(text):
#   text = text.lower()
#   text = nltk.word_tokenize(text)

#   y = []
#   for i in text:
#     if i.isalnum():
#       y.append(i)

#   text = y[:]
#   y.clear()

#   for i in text:
#     if i not in nltk.corpus.stopwords.words('english') and i not in string.punctuation:
#       y.append(i)

#   text = y[:]
#   y.clear()

#   for i in text:
#     y.append(nltk.stem.PorterStemmer().stem(i))

#   return " ".join(y)

# balanced_data['transformed_text'] = balanced_data['title'].apply(transform_text)

# balanced_data.head()

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000,stop_words='english')

# x = tfidf.fit_transform(balanced_data['transformed_text']).toarray()
x = tfidf.fit_transform(balanced_data['title']).toarray()
y = balanced_data['real'].values

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [13]:
models = {
  "Logistic Regression": LogisticRegression(),
  "K-Nearest Neighbors": KNeighborsClassifier(),
  "Support Vector Machine": SVC(),
  "Decision Tree": DecisionTreeClassifier(),
  "Random Forest": RandomForestClassifier(),
  "XGBoost": XGBClassifier()
}

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

results_list = []

for name, model in models.items():
  model.fit(X_train, Y_train)
  
  Y_pred = model.predict(X_test)
  
  accuracy = accuracy_score(Y_test, Y_pred)
  precision = precision_score(Y_test, Y_pred)
  recall = recall_score(Y_test, Y_pred)
  f1 = f1_score(Y_test, Y_pred)
  
  results_list.append({
    "Model": name,
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-Score": f1
  })

In [15]:
results_df = pd.DataFrame(results_list)

results_df.set_index('Model', inplace=True)

results_df = results_df.round(4)

print("Model Performance Comparison:")
print(results_df)

Model Performance Comparison:
                        Accuracy  Precision  Recall  F1-Score
Model                                                        
Logistic Regression       0.7837     0.7678  0.8265    0.7961
K-Nearest Neighbors       0.6620     0.6228  0.8580    0.7217
Support Vector Machine    0.7854     0.7746  0.8180    0.7957
Decision Tree             0.7037     0.7485  0.6327    0.6857
Random Forest             0.7459     0.7704  0.7160    0.7422
XGBoost                   0.7450     0.7173  0.8265    0.7681
