# A Fake News Detector Using TF-IDF in Python

# Webscraping

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import random

In [2]:
# Define a function to list all urls
def list_urls(file_name):
    # Open the CSV file
    with open(file_name, 'r') as csvfile:
        # Create a CSV reader object
        csvreader = csv.reader(csvfile)
        # Convert CSV data into a list
        csv_data_list = list(csvreader)
        text_list = [' '.join(row) for row in csv_data_list]
    return text_list

# Apply the list_urls function to csv files 
fake_news_urls = list_urls('list_of_fake_news.csv')
real_news_urls = list_urls('list_of_real_news.csv')

In [3]:
# Check the result => the first element is not a url
fake_news_urls

['\ufeffurls',
 'https://edition.cnn.com/2024/04/19/africa/nigeria-chibok-girl-rescued-intl/index.html',
 'https://edition.cnn.com/2024/04/19/middleeast/iraq-explosion-military-base-intl/index.html',
 'https://edition.cnn.com/2024/04/19/india/india-russia-ukraine-one-mans-death-intl-hnk-ml/index.html',
 'https://edition.cnn.com/2024/04/16/europe/ukraine-missiles-air-defense-zelensky-israel-intl/index.html',
 'https://edition.cnn.com/2024/03/16/europe/russia-ukraine-vacuum-bomb-intl/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/04/21/china/china-spy-agency-public-profile-intl-hnk/index.html',
 'https://edition.cnn.com/2024/04/16/politics/us-israel-free-speech-what-matters/index.html',
 'https://edition.cnn.com/2024/04/10/politics/lara-trump-rnc-2020-election-fraud-claims/index.html',
 'https://edition.cnn.com/2024/04/21/us/oklahoma-double-murder-plot/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/01/22/us/joliet-chic

In [4]:
# Pop out the first element
fake_news_urls.pop(0)
real_news_urls.pop(0)

'\ufeffurls'

In [5]:
# Check the result again => good!
fake_news_urls

['https://edition.cnn.com/2024/04/19/africa/nigeria-chibok-girl-rescued-intl/index.html',
 'https://edition.cnn.com/2024/04/19/middleeast/iraq-explosion-military-base-intl/index.html',
 'https://edition.cnn.com/2024/04/19/india/india-russia-ukraine-one-mans-death-intl-hnk-ml/index.html',
 'https://edition.cnn.com/2024/04/16/europe/ukraine-missiles-air-defense-zelensky-israel-intl/index.html',
 'https://edition.cnn.com/2024/03/16/europe/russia-ukraine-vacuum-bomb-intl/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/04/21/china/china-spy-agency-public-profile-intl-hnk/index.html',
 'https://edition.cnn.com/2024/04/16/politics/us-israel-free-speech-what-matters/index.html',
 'https://edition.cnn.com/2024/04/10/politics/lara-trump-rnc-2020-election-fraud-claims/index.html',
 'https://edition.cnn.com/2024/04/21/us/oklahoma-double-murder-plot/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/01/22/us/joliet-chicago-house-killi

In [6]:
# Define a function to get the article bodies from the urls (it’ll be used below)
def get_article_body(doc):
    
    selection_class="article__content-container"
    articles=doc.find_all('div',{'class':selection_class})
    art = None
    
    for article in articles:
        art = article.text.strip()

    return art

In [7]:
# Define a function for web-scraping
def all_pages(urls):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Let's create a dictionary to store data of all raw texts  b
    articles_dict={
        'raw_text':[]
    }
    
    for url in urls:
        try:
            response = requests.get(url, headers=header)
            response.raise_for_status()  # Raise an exception for bad response codes
        except requests.exceptions.RequestException as e:
            print("Error fetching URL:", e)
            continue  # Move to the next URL
        if response.status_code != 200:
            break
           
    # Parse using BeautifulSoup
        doc = BeautifulSoup(response.text, 'html.parser')
        article_body = get_article_body(doc)
        articles_dict['raw_text'].append(article_body)   
        
    return pd.DataFrame(articles_dict)

In [8]:
#apply the web-scraping function to collected urls
fake_news = all_pages(fake_news_urls)
real_news = all_pages(real_news_urls)

#label fake news as '0' and real news as '1'
fake_news['class'] = '0'
real_news['class'] = '1'

In [9]:
# Define a function to delete the rows where 'raw_text' is empty
def del_empty(df):
    df = df[df['raw_text'].notna()]
    return df

# Apply the del_empty function to current dataframes
fake_news = del_empty(fake_news)
real_news = del_empty(real_news)

In [10]:
# Check the results
fake_news

Unnamed: 0,raw_text,class
0,"Lagos, Nigeria\nCNN\n — \n \n\n\n ...",0
1,CNN\n — \n \n\n\n The Ir...,0
2,New Delhi\nCNN\n — \n \n\n\n ...,0
3,CNN\n — \n \n\n\n A lack...,0
4,CNN\n — \n \n\n\n Russia...,0
...,...,...
75,CNN\n — \n \n\n\n At lea...,0
76,CNN\n — \n \n\n\n As pro...,0
77,CNN\n — \n \n\n\n Warren...,0
78,CNN\n — \n \n\n\n When H...,0


In [11]:
real_news

Unnamed: 0,raw_text,class
0,CNN\n — \n \n\n\n Violen...,1
1,CNN\n — \n \n\n\n Ukrain...,1
2,CNN\n — \n \n\n\n Ukrain...,1
3,"Kyiv, Ukraine\nCNN\n — \n \n\n\n ...",1
4,CNN\n — \n \n\n\n The Za...,1
...,...,...
75,CNN\n — \n \n\n\n Maryla...,1
76,CNN\n — \n \n\n\n A fede...,1
77,CNN\n — \n \n\n\n Police...,1
78,CNN\n — \n \n\n\n At lea...,1


In [12]:
# Concatenate the dataframes of fake_news and real_news
all_news = pd.concat([fake_news, real_news])
all_news.shape

(160, 2)

In [13]:
# Shuffle the rows of the DataFrame
all_news_shuffled = all_news.sample(frac=1.0)

# Reset the index after shuffling
all_news_shuffled.reset_index(drop=True, inplace=True)

# Print the shuffled DataFrame
all_news_shuffled

Unnamed: 0,raw_text,class
0,CNN\n — \n \n\n\n A show...,0
1,CNN\n — \n \n\n\n Firefi...,1
2,CNN\n — \n \n\n\n Police...,1
3,CNN\n — \n \n\n\n A 10-y...,1
4,CNN\n — \n \n\n\n Russia...,0
...,...,...
155,CNN\n — \n \n\n\n One of...,1
156,CNN\n — \n \n\n\n An Ari...,0
157,CNN\n — \n \n\n\n A youn...,0
158,CNN\n — \n \n\n\n Violen...,1


In [14]:
# Set train set and test set
train_set, test_set = all_news_shuffled[40:], all_news_shuffled[:40]
# Reset the index 
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

# Preprocessing data

In [15]:
# Show the head of train set
train_set.head()

Unnamed: 0,raw_text,class
0,CNN\n — \n \n\n\n The Se...,1
1,Rafah\nCNN\n — \n \n\n\n ...,1
2,CNN\n — \n \n\n\n Christ...,0
3,CNN\n — \n \n\n\n Three ...,0
4,CNN\n — \n \n\n\n A mass...,1


In [16]:
# Assign raw texts and classes to variables
train_X_non = train_set['raw_text']
train_y = train_set['class']
test_X_non = test_set['raw_text']
test_y = test_set['class']
train_X=[]
test_X=[]

In [17]:
# Check the texts
train_X_non[0]

'CNN\n        \xa0—\xa0\n    \n\n\n            The Senate passed a foreign aid package Tuesday\xa0that includes money\xa0for Ukraine, Israel and the Indo-Pacific region.\n    \n\n            The House passed the aid package, in the form of four separate bills, on Saturday – months after the Senate first passed legislation with aid for Ukraine and Israel.\n    \n\n            The new legislation, which will now go to President Joe Biden’s desk for his signature, also includes a number\xa0of House GOP priorities, including sanctions on Iran, the seizure of frozen Russian sovereign assets and a measure that could lead to a nationwide ban of TikTok.\n    \n\n            The aid for Ukraine, Israel and the Indo-Pacific region\xa0adds\xa0up to about $95 billion – the same amount\xa0that the earlier\xa0Senate bill passed in February\xa0would have provided –\xa0with an adjustment that $10 billion in Ukraine economic assistance is in the form of a repayable loan\n    \n\n            Here’s what

In [18]:
#Importing libraries
!pip install nltk
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m775.1/775.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m6.7 MB/s[0m eta

In [19]:
# Stopword removal and lemmatization
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/ucloud/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ucloud/nltk_data...


In [20]:
# Define a function for pre-processing
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]',' ', text)
    text = re.sub(r'CNN', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\xa0', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in set(stopwords)]
    text = ' '.join(text)
    return text

# Text pre-processing for train set and test set
train_X = [preprocess_text(text) for text in train_X_non]
test_X = [preprocess_text(text) for text in test_X_non]

In [21]:
# Check the texts again
train_X[0]

'senate passed foreign aid package tuesday includes money ukraine israel indo pacific region house passed aid package form four separate bill saturday month senate first passed legislation aid ukraine israel new legislation go president joe biden desk signature also includes number house gop priority including sanction iran seizure frozen russian sovereign asset measure could lead nationwide ban tiktok aid ukraine israel indo pacific region add billion amount earlier senate bill passed february would provided adjustment billion ukraine economic assistance form repayable loan included foreign aid package according summary provided house republican nearly billion ukraine legislation includes total nearly billion assist ukraine others region fight russia included earlier senate bill total billion would used replenish u weapon stockpile facility billion would fund current u military operation region nearly billion included bill would help ukraine buy advanced weapon system defense equipmen

# Finding features using TF-IDF

In [22]:
# import libraries
!pip install scikit-learn
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25

In [23]:
# TF-IDF
tf_idf = TfidfVectorizer()
# Apply TF-IDF to training data
X_train_tf = tf_idf.fit_transform(train_X)
# Apply TF-IDF to training data
X_train_tf = tf_idf.transform(train_X)

In [24]:
# Check the dimensions of data
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 120, n_features: 8037


In [25]:
# show the features of the first article
# Get the feature names
feature_names = tf_idf.get_feature_names_out()

# Get the first document vector
first_document_vector = X_train_tf[0]

# Create a DataFrame for the first document's TF-IDF values
df_tfidf = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

# Sort the DataFrame by TF-IDF values in descending order
df_tfidf_sorted = df_tfidf.sort_values(by=["tfidf"], ascending=False)

print(df_tfidf_sorted)

                tfidf
billion      0.494049
package      0.210755
aid          0.194330
tiktok       0.184410
ukraine      0.181203
...               ...
finavia      0.000000
financial    0.000000
finance      0.000000
finally      0.000000
firecracker  0.000000

[8037 rows x 1 columns]


In [26]:
# Transforme test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)

# Check the dimensions of data
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 40, n_features: 8037


# Naive Bayes Classifier

In [27]:
#import libraries
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

# Predicted y
y_pred = naive_bayes_classifier.predict(X_test_tf)
y_pred

array(['0', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1',
       '0', '1', '1', '0', '0', '1', '1', '0', '0', '1', '1', '1', '0',
       '0', '0', '0', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0',
       '1'], dtype='<U1')

# Results

In [29]:
#import library
from sklearn import metrics

In [30]:
# Report the confusion matrix
print("Confusion matrix:")
metrics.confusion_matrix(test_y, y_pred)
pd.crosstab(test_y, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)

Confusion matrix:


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11,9,20
1,3,17,20
All,14,26,40


In [31]:
# Report the results
print(metrics.classification_report(test_y, y_pred, target_names=['Fake_news', 'Real_news']))

              precision    recall  f1-score   support

   Fake_news       0.79      0.55      0.65        20
   Real_news       0.65      0.85      0.74        20

    accuracy                           0.70        40
   macro avg       0.72      0.70      0.69        40
weighted avg       0.72      0.70      0.69        40

