# Fake News Detector

### Importing dependencies

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\colby\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing stopwords in english that will be removed from dataset
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Loading the dataset

In [4]:
# Path to your dataset
file_path = r'C:\Users\colby\fake_real_news_dataset\fake_and_real_news_dataset.csv'

# Load the dataset into a pandas DataFrame
newsData = pd.read_csv(file_path)

# Display rows and columns of dataset
newsData.shape

(4594, 4)

In [5]:
# Print first 5 rows of dataset
newsData.head()

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


### Cleaning dataset to be worked with

In [6]:
# Replace 'REAL' with 1 and 'FAKE' with 0 in the 'label' column
newsData['label'] = newsData['label'].replace({'REAL': 0, 'FAKE': 1})

# Show the updated DataFrame
newsData.head()

  newsData['label'] = newsData['label'].replace({'REAL': 0, 'FAKE': 1})


Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,0
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,0
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",0
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,0
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,0


In [7]:
# Checking for null values
newsData.isnull().sum()

idd      0
title    1
text     0
label    0
dtype: int64

In [8]:
# Replace null values with empty string
newsData = newsData.fillna(' ')

In [9]:
# Checking for null values
newsData.isnull().sum()

idd      0
title    0
text     0
label    0
dtype: int64

In [10]:
# Separate the article content with the label
X = newsData.drop(columns='label', axis = 1)
Y = newsData['label']

In [11]:
print(X)

             idd                                              title  \
0     Fq+C96tcx+  ‘A target on Roe v. Wade ’: Oklahoma bill maki...   
1     bHUqK!pgmv  Study: women had to drive 4 times farther afte...   
2     4Y4Ubf%aTi        Trump, Clinton clash in dueling DC speeches   
3     _CoY89SJ@K  Grand jury in Texas indicts activists behind P...   
4     +rJHoRQVLe  As Reproductive Rights Hang In The Balance, De...   
...          ...                                                ...   
4589  ukZm6JTO#x                 Russia Calls the War Party's Bluff   
4590  yu0xKEiapJ  Bernie Sanders: The Democratic primary gave me...   
4591  c4Y370E_9c  Pipeline Police Strip Search Native Girl, Then...   
4592  bBbeuCUeMH  Currency Crisis: Alasdair MacLeod On The Vexed...   
4593  vE44sWBnd9                   Paper Tiger ISIS Digs Into Mosul   

                                                   text  
0     UPDATE: Gov. Fallin vetoed the bill on Friday....  
1     Ever since Texas laws clo

In [12]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
4589    1
4590    1
4591    1
4592    1
4593    1
Name: label, Length: 4594, dtype: int64


### Stemming Procedure

Process of reducing a word to its root word

In [13]:
port_stem = PorterStemmer()

In [14]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

Apply and test stemming function

In [15]:
newsData['title'] = newsData['title'].apply(stemming)

In [16]:
newsData.head()

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,target roe v wade oklahoma bill make feloni pe...,UPDATE: Gov. Fallin vetoed the bill on Friday....,0
1,bHUqK!pgmv,studi women drive time farther texa law close ...,Ever since Texas laws closed about half of the...,0
2,4Y4Ubf%aTi,trump clinton clash duel dc speech,"Donald Trump and Hillary Clinton, now at the s...",0
3,_CoY89SJ@K,grand juri texa indict activist behind plan pa...,A Houston grand jury investigating criminal al...,0
4,+rJHoRQVLe,reproduct right hang balanc debat moder drop ball,WASHINGTON -- Forty-three years after the Supr...,0


In [17]:
# Separate the data and label
X = newsData['title'].values
Y = newsData['label'].values

In [18]:
print(X)
print(Y)

['target roe v wade oklahoma bill make feloni perform abort wait governor decis'
 'studi women drive time farther texa law close abort clinic'
 'trump clinton clash duel dc speech' ...
 'pipelin polic strip search nativ girl leav nake jail overnight'
 'currenc crisi alasdair macleod vex question dollar'
 'paper tiger isi dig mosul']
[0 0 0 ... 1 1 1]


### Convert text data to numerical data

In [19]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [20]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 32663 stored elements and shape (4594, 5755)>
  Coords	Values
  (0, 6)	0.27388374911654273
  (0, 494)	0.22742103551531123
  (0, 1278)	0.2852242090323138
  (0, 1853)	0.3445151105577041
  (0, 2165)	0.2852242090323138
  (0, 3040)	0.19959492122240022
  (0, 3519)	0.31719117010539327
  (0, 3724)	0.2773137474635281
  (0, 4331)	0.3445151105577041
  (0, 5063)	0.2678071118389003
  (0, 5536)	0.3445151105577041
  (0, 5538)	0.25576893258694955
  (1, 6)	0.3142663586544497
  (1, 940)	0.3953119148019772
  (1, 947)	0.2759410546741804
  (1, 1518)	0.3272789052355892
  (1, 1817)	0.3953119148019772
  (1, 2863)	0.282913657744894
  (1, 4934)	0.3272789052355892
  (1, 5115)	0.3041730433810631
  (1, 5169)	0.23322080250692212
  (1, 5672)	0.2671589148613326
  (2, 922)	0.4422693608878315
  (2, 941)	0.18437185382816187
  (2, 1253)	0.5481191157644255
  :	:
  (4590, 3933)	0.25549490665139213
  (4590, 4417)	0.2125958982397718
  (4590, 5441)	0.29154145916466

### Splitting dataset to training and test data

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

### Training the model with logistic regression

In [22]:
model = LogisticRegression()

In [23]:
model.fit(X_train, Y_train)

### Evaluation
Accuracy Score

In [24]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [25]:
print('Accuracy score of the training data: ', training_data_accuracy)

Accuracy score of the training data:  0.9151020408163265


In [26]:
X_train_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_train_prediction, Y_test)

In [27]:
print('Accuracy score of the test data: ', test_data_accuracy)

Accuracy score of the test data:  0.8541893362350381


In [28]:
# ✅ Test title sets
real_test_titles = [
    "nasa announces launch of artemis ii moon mission",
    "fed raises interest rate to combat inflation",
    "supreme court upholds key environmental regulation",
    "pfizer receives fda approval for covid booster shot",
    "california faces worst wildfire season experts warn",
    "olympics 2024 hosted in paris with sustainability push",
    "biden signs bipartisan infrastructure bill into law",
    "tesla reports record revenue this quarter",
    "fda approves new pill for postpartum depression",
    "japan launches lunar probe on historic space mission"
]

fake_test_titles = [
    "earth is flat confirms nasa secret meeting",
    "aliens caught voting in u s presidential election",
    "covid vaccine turns people into 5g antennas",
    "time traveler from 2077 arrested for robot warning",
    "bigfoot sighted stealing groceries from walmart",
    "reptilian overlords control global governments",
    "scientists discover cancer cure in egyptian pyramid",
    "chemtrails used to control minds and weather",
    "queen elizabeth replaced by clone in secret lab",
    "vaccines contain chip that reads your thoughts"
]

# ✅ Combine titles and assign labels
test_titles = [("Real", t) for t in real_test_titles] + [("Fake", t) for t in fake_test_titles]

# ✅ Run predictions
print("\n🧪 Logistic Regression Predictions on Test Titles:\n")

for expected_label, title in test_titles:
    processed_title = stemming(title)
    vectorized = vectorizer.transform([processed_title])
    prediction = model.predict(vectorized)[0]
    predicted_label = "Real" if prediction == 0 else "Fake"
    print(f"TITLE: {title}")
    print(f"→ Predicted: {predicted_label} | Expected: {expected_label}\n")



🧪 Logistic Regression Predictions on Test Titles:

TITLE: nasa announces launch of artemis ii moon mission
→ Predicted: Fake | Expected: Real

TITLE: fed raises interest rate to combat inflation
→ Predicted: Fake | Expected: Real

TITLE: supreme court upholds key environmental regulation
→ Predicted: Real | Expected: Real

TITLE: pfizer receives fda approval for covid booster shot
→ Predicted: Fake | Expected: Real

TITLE: california faces worst wildfire season experts warn
→ Predicted: Fake | Expected: Real

TITLE: olympics 2024 hosted in paris with sustainability push
→ Predicted: Fake | Expected: Real

TITLE: biden signs bipartisan infrastructure bill into law
→ Predicted: Real | Expected: Real

TITLE: tesla reports record revenue this quarter
→ Predicted: Fake | Expected: Real

TITLE: fda approves new pill for postpartum depression
→ Predicted: Fake | Expected: Real

TITLE: japan launches lunar probe on historic space mission
→ Predicted: Fake | Expected: Real

TITLE: earth is fla