## Loading all necessary libraries and modules

In [1]:
import pandas as pd
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Downloading necessary NLTK data

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/skakibahammed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skakibahammed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the dataset

In [3]:
data = pd.read_csv('FakeNewsNet.csv')
data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


## Preprocessing

In [4]:
data = data.drop(["news_url", "tweet_num"], axis=1)
data['source_domain'] = data['source_domain'].fillna('Unknown')

## Balancing the data

In [5]:
real_data = data[data['real'] == 1]
fake_data = data[data['real'] == 0]
real_sample = real_data.sample(n=len(fake_data), random_state=42)
balanced_data = pd.concat([real_sample, fake_data])

balanced_data = balanced_data.drop(['source_domain'], axis=1)

## Text transformation

In [6]:
def transform_text(text):
  text = text.lower()
  text = nltk.word_tokenize(text)
  y = [i for i in text if i.isalnum()]
  text = [i for i in y if i not in nltk.corpus.stopwords.words('english') and i not in string.punctuation]
  y = [nltk.stem.PorterStemmer().stem(i) for i in text]
  return " ".join(y)

balanced_data['transformed_text'] = balanced_data['title'].apply(transform_text)

## Model Training and Evaluation

In [7]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
preprocessor = ColumnTransformer(transformers=[("text", tfidf, 'transformed_text')])

In [8]:
x = balanced_data[['transformed_text']]
y = balanced_data['real'].values

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=2, stratify=y)

In [10]:
models = {
  "Logistic Regression": LogisticRegression(),
  "K-Nearest Neighbors": KNeighborsClassifier(),
  "Support Vector Machine": SVC(),
  "Decision Tree": DecisionTreeClassifier(),
  "Random Forest": RandomForestClassifier(),
  "XGBoost": XGBClassifier()
}

In [11]:
from sklearn.model_selection import GridSearchCV

param_grids = {
  "Logistic Regression": {
    'model__C': [0.1, 1.0, 10],
    'model__solver': ['liblinear']
  },
  "K-Nearest Neighbors": {
    'model__n_neighbors': [5, 7, 9],
    'model__weights': ['uniform', 'distance']
  },
  "Support Vector Machine": {
    'model__C': [0.1, 1.0, 10],
    'model__gamma': ['scale', 'auto'],
    'model__kernel': ['rbf']
  },
  "Decision Tree": {
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10]
  },
  "Random Forest": {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None]
  },
  "XGBoost": {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1],
    'model__max_depth': [3, 5, 7]
  }
}

In [12]:
results_list = []

for name, model in models.items():
  pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
  ])

  param_grid = param_grids[name]

  print(f"Running GridSearchCV for {name}...")
  grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
  )
  
  grid_search.fit(X_train, Y_train)

  best_model = grid_search.best_estimator_
  
  Y_pred = best_model.predict(X_test)
  
  accuracy = accuracy_score(Y_test, Y_pred)
  precision = precision_score(Y_test, Y_pred)
  recall = recall_score(Y_test, Y_pred)
  f1 = f1_score(Y_test, Y_pred)
  
  results_list.append({
    "Model": name,
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-Score": f1,
    "Best Params": grid_search.best_params_
  })

Running GridSearchCV for Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Running GridSearchCV for K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Running GridSearchCV for Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Running GridSearchCV for Decision Tree...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Running GridSearchCV for Random Forest...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Running GridSearchCV for XGBoost...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [13]:
results_df = pd.DataFrame(results_list)
results_df = results_df.round(4)
print("\nFinal Model Performance after GridSearchCV:")
print(results_df)


Final Model Performance after GridSearchCV:
                    Model  Accuracy  Precision  Recall  F1-Score  \
0     Logistic Regression    0.7750     0.7567  0.8106    0.7827   
1     K-Nearest Neighbors    0.6151     0.6063  0.6568    0.6305   
2  Support Vector Machine    0.7815     0.7576  0.8280    0.7912   
3           Decision Tree    0.6877     0.6388  0.8636    0.7344   
4           Random Forest    0.7268     0.6810  0.8532    0.7574   
5                 XGBoost    0.7372     0.7062  0.8123    0.7556   

                                         Best Params  
0    {'model__C': 1.0, 'model__solver': 'liblinear'}  
1  {'model__n_neighbors': 7, 'model__weights': 'd...  
2  {'model__C': 1.0, 'model__gamma': 'scale', 'mo...  
3  {'model__max_depth': 20, 'model__min_samples_s...  
4  {'model__max_depth': 20, 'model__n_estimators'...  
5  {'model__learning_rate': 0.1, 'model__max_dept...  
