Complete Text Mining and Classification Pipeline for Fake News Detection
Module: LD7185 - Programming for AI

This script covers:
1. Data loading and exploration
2. Text preprocessing
3. Feature extraction (TF-IDF)
4. Rule-based classification
5. Machine learning models (Naive Bayes, Logistic Regression, Random Forest)
6. Model evaluation and visualization

Author: Okoh Collins
Date: December 2025

In [8]:
!python -m pip install pandas
!python -m pip install numpy
!python -m pip install matplotlib
!python -m pip install seaborn
!python -m pip install nltk
!python -m pip install re
!python -m pip install scikit-learn
!python -m pip install wordcloud



ERROR: Could not find a version that satisfies the requirement re (from versions: none)
ERROR: No matching distribution found for re




In [4]:
# ============================================================================
# SECTION 1: IMPORT LIBRARIES
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Machine learning models
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Evaluation metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, confusion_matrix, classification_report,
                            roc_auc_score, roc_curve)

# Visualization
from wordcloud import WordCloud

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

print("All libraries imported successfully!")


All libraries imported successfully!


In [5]:
#load dataset
real = pd.read_csv("news/True.csv")
fake = pd.read_csv("news/Fake.csv")

#check for missing values
print("Missing Values real:")
print(real.isnull().sum())
print("\nMissing Values fake:")
print(fake.isnull().sum())

real['label'] = 1
fake['label'] = 0

df = pd.concat([real, fake]).sample(frac=1).reset_index(drop=True)
df.head(100)


Missing Values real:
title      0
text       0
subject    0
date       0
dtype: int64

Missing Values fake:
title      0
text       0
subject    0
date       0
dtype: int64


Unnamed: 0,title,text,subject,date,label
0,Disrespectful Prick Ted Cruz Will Snub Presid...,Instead of respectfully attending President Ob...,News,"January 11, 2016",0
1,Obama meets with national security team on Syr...,WASHINGTON (Reuters) - President Barack Obama ...,politicsNews,"October 15, 2016",1
2,Iraq increases oil exports from south to make ...,BAGHDAD (Reuters) - Iraq said it was increasin...,worldnews,"October 21, 2017",1
3,SHOCKING POLL RESULTS In Primary Victories Ton...,There are no surprises with the results on the...,politics,"Mar 8, 2016",0
4,Pope urges EU to rediscover unity if it wants ...,"VATICAN CITY (Reuters) - Pope Francis, in a ma...",worldnews,"October 28, 2017",1
...,...,...,...,...,...
95,“FLUSH TARGET” Truck Will Travel To Every Stor...,This highly visible campaign has nothing to do...,left-news,"May 18, 2016",0
96,Tax bill's 'pass-through' rule will aid wealth...,WASHINGTON (Reuters) - Wealthy business owners...,politicsNews,"December 20, 2017",1
97,Evan McMullin Issues DIRE Warning For ALL Ame...,Independent conservative presidential candidat...,News,"December 11, 2016",0
98,DISTURBING TRUTH ABOUT How The UN Decides Whic...,Every American should know the corrupt UN has ...,politics,"Feb 16, 2016",0


In [8]:
plt.figure(figsize=(10, 6))

# Count values
counts = df[label].value_counts()

# Create bar plot
ax = sns.barplot(x=counts.index, y=counts.values, palette=['#2ecc71', '#e74c3c'])

# Customize
plt.title('Distribution of Fake vs Real News Articles', fontsize=16, fontweight='bold')
plt.xlabel('Article Type', fontsize=12)
plt.ylabel('Number of Articles', fontsize=12)

# Add value labels on bars
for i, v in enumerate(counts.values):
    ax.text(i, v + 50, str(v), ha='center', fontsize=12, fontweight='bold')

# Add percentage
total = counts.sum()
percentages = (counts / total * 100).round(1)
plt.xticks([0, 1], [f'Real\n({percentages[0]}%)', f'Fake\n({percentages[1]}%)'])

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Total articles: {total}")
print(f"Real news: {counts[0]} ({percentages[0]}%)")
print(f"Fake news: {counts[1]} ({percentages[1]}%)")
print("-" * 50)

NameError: name 'label' is not defined

<Figure size 1000x600 with 0 Axes>

In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,title,text,subject,date,label,clean_text
0,Now Trump Wants to ‘Drain The Sewer’ So The I...,"Donald Trump is losing it. On Monday, he kicke...",News,"July 24, 2017",0,donald trump losing monday kicked morning perh...
1,"Trump, conservatives try to put aside bitterne...",WASHINGTON (Reuters) - Raw feelings and mistru...,politicsNews,"March 29, 2017",1,washington reuters raw feeling mistrust could ...
2,U.S. Senate intelligence panel wants to speak ...,WASHINGTON (Reuters) - The U.S. Senate Intelli...,politicsNews,"July 10, 2017",1,washington reuters u senate intelligence commi...
3,Venezuela's Maduro thanks Putin for support in...,MOSCOW (Reuters) - Venezuelan President Nicola...,worldnews,"October 4, 2017",1,moscow reuters venezuelan president nicolas ma...
4,(VIDEO) UN CLIMATE CHANGE FREAKS: “We should m...,What an evil bunch of freaks! The agenda is so...,left-news,"Apr 6, 2015",0,evil bunch freak agenda important see forest t...


In [7]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_text'])
y = df['label']
print(X)
print(y)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5711712 stored elements and shape (44898, 5000)>
  Coords	Values
  (0, 1355)	0.05291536049402882
  (0, 4644)	0.09572053418179054
  (0, 2685)	0.04376182583305136
  (0, 2895)	0.027106142912220493
  (0, 2492)	0.05344941072686894
  (0, 2908)	0.0356709940265244
  (0, 3263)	0.04087564801983766
  (0, 4985)	0.030293208229366934
  (0, 4412)	0.4060310434393694
  (0, 1674)	0.28122080343639333
  (0, 2999)	0.13259870980594776
  (0, 3093)	0.04535563981269693
  (0, 731)	0.08540927700564435
  (0, 63)	0.06213730073833651
  (0, 2930)	0.10271624593172016
  (0, 4959)	0.2028900390163508
  (0, 228)	0.06477143968061233
  (0, 1580)	0.09437202474507965
  (0, 4519)	0.10233734478956967
  (0, 455)	0.1163518082208645
  (0, 3613)	0.08250714664635285
  (0, 2459)	0.5827775521812608
  (0, 4515)	0.04376182583305136
  (0, 1702)	0.05781184028416345
  (0, 3102)	0.03364216675339107
  :	:
  (44897, 4908)	0.06402708645391846
  (44897, 3264)	0.047443442539181736
  

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=300)
model.fit(X_train, y_train)

pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4703
           1       0.98      0.99      0.99      4277

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4703
           1       0.94      0.92      0.93      4277

    accuracy                           0.93      8980
   macro avg       0.94      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_model = LinearSVC(max_iter=5000)

svm_model.fit(X_train, y_train)

pred = svm_model.predict(X_test)
print(classification_report(y_test, pred))