# Collecting data - webscraping

In [4]:
#Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import random

In [5]:
def list_urls(file_name):
    # Open the CSV file
    with open(file_name, 'r') as csvfile:
        # Create a CSV reader object
        csvreader = csv.reader(csvfile)
        # Convert CSV data into a list
        csv_data_list = list(csvreader)
        text_list = [' '.join(row) for row in csv_data_list]
    return text_list

In [6]:
fake_news_urls = list_urls('list_of_fake_news.csv')
real_news_urls = list_urls('list_of_real_news.csv')

In [7]:
fake_news_urls

['\ufeffurls',
 'https://edition.cnn.com/2024/04/19/africa/nigeria-chibok-girl-rescued-intl/index.html',
 'https://edition.cnn.com/2024/04/19/middleeast/iraq-explosion-military-base-intl/index.html',
 'https://edition.cnn.com/2024/04/19/india/india-russia-ukraine-one-mans-death-intl-hnk-ml/index.html',
 'https://edition.cnn.com/2024/04/16/europe/ukraine-missiles-air-defense-zelensky-israel-intl/index.html',
 'https://edition.cnn.com/2024/03/16/europe/russia-ukraine-vacuum-bomb-intl/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/04/21/china/china-spy-agency-public-profile-intl-hnk/index.html',
 'https://edition.cnn.com/2024/04/16/politics/us-israel-free-speech-what-matters/index.html',
 'https://edition.cnn.com/2024/04/10/politics/lara-trump-rnc-2020-election-fraud-claims/index.html',
 'https://edition.cnn.com/2024/04/21/us/oklahoma-double-murder-plot/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/01/22/us/joliet-chic

In [8]:
fake_news_urls.pop(0)
real_news_urls.pop(0)

'\ufeffurls'

In [9]:
fake_news_urls

['https://edition.cnn.com/2024/04/19/africa/nigeria-chibok-girl-rescued-intl/index.html',
 'https://edition.cnn.com/2024/04/19/middleeast/iraq-explosion-military-base-intl/index.html',
 'https://edition.cnn.com/2024/04/19/india/india-russia-ukraine-one-mans-death-intl-hnk-ml/index.html',
 'https://edition.cnn.com/2024/04/16/europe/ukraine-missiles-air-defense-zelensky-israel-intl/index.html',
 'https://edition.cnn.com/2024/03/16/europe/russia-ukraine-vacuum-bomb-intl/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/04/21/china/china-spy-agency-public-profile-intl-hnk/index.html',
 'https://edition.cnn.com/2024/04/16/politics/us-israel-free-speech-what-matters/index.html',
 'https://edition.cnn.com/2024/04/10/politics/lara-trump-rnc-2020-election-fraud-claims/index.html',
 'https://edition.cnn.com/2024/04/21/us/oklahoma-double-murder-plot/index.html?iid=cnn_buildContentRecirc_end_recirc',
 'https://edition.cnn.com/2024/01/22/us/joliet-chicago-house-killi

In [10]:
def get_article_body(doc):
    
    selection_class="article__content-container"
    articles=doc.find_all('div',{'class':selection_class})
    art = None
    
    for article in articles:
        art = article.text.strip()

    return art

In [11]:
def all_pages(urls):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Let's create a dictionary to store data of all movies
    articles_dict={
        'raw_text':[]
    }
    
    for url in urls:
        try:
            response = requests.get(url, headers=header)
            response.raise_for_status()  # Raise an exception for bad response codes
        except requests.exceptions.RequestException as e:
            print("Error fetching URL:", e)
            continue  # Move to the next URL
        if response.status_code != 200:
            break
           
    # Parse using BeautifulSoup
        doc = BeautifulSoup(response.text, 'html.parser')
        article_body = get_article_body(doc)
        articles_dict['raw_text'].append(article_body)   
        
    return pd.DataFrame(articles_dict)

In [12]:
fake_news = all_pages(fake_news_urls)
real_news = all_pages(real_news_urls)

fake_news['class'] = '0'
real_news['class'] = '1'

In [13]:
def del_empty(df):
    df = df[df['raw_text'].notna()]
    return df

fake_news = del_empty(fake_news)
real_news = del_empty(real_news)

In [14]:
fake_news

Unnamed: 0,raw_text,class
0,"Lagos, Nigeria\nCNN\n — \n \n\n\n ...",0
1,CNN\n — \n \n\n\n The Ir...,0
2,New Delhi\nCNN\n — \n \n\n\n ...,0
3,CNN\n — \n \n\n\n A lack...,0
4,CNN\n — \n \n\n\n Russia...,0
5,Editor’s Note: Sign up for CNN’s Meanwhile in ...,0
6,CNN\n — \n \n\n\n Pro-Pa...,0
7,CNN\n — \n \n\n\n The Re...,0
8,CNN\n — \n \n\n\n Tifany...,0
9,CNN\n — \n \n\n\n A man ...,0


In [15]:
real_news

Unnamed: 0,raw_text,class
0,CNN\n — \n \n\n\n Violen...,1
1,CNN\n — \n \n\n\n Ukrain...,1
2,CNN\n — \n \n\n\n Ukrain...,1
3,"Kyiv, Ukraine\nCNN\n — \n \n\n\n ...",1
4,CNN\n — \n \n\n\n The Za...,1
5,CNN\n — \n \n\n\nRussia destroyed t...,1
6,Kyiv\nCNN\n — \n \n\n\n ...,1
7,CNN\n — \n \n\n\n Kenya’...,1
8,"Kuala Lumpur, Malaysia — Two Malaysian navy he...",1
9,CNN\n — \n \n\n\n At lea...,1


In [16]:
all_news = pd.concat([fake_news, real_news])
all_news.shape

(79, 2)

In [17]:
# Shuffle the rows of the DataFrame
all_news_shuffled = all_news.sample(frac=1.0)

# Reset the index after shuffling
all_news_shuffled.reset_index(drop=True, inplace=True)

# Print the shuffled DataFrame
all_news_shuffled

Unnamed: 0,raw_text,class
0,CNN\n — \n \n\n\n The fi...,0
1,CNN\n — \n \n\n\n A lack...,0
2,CNN\n — \n \n\n\n Ukrain...,1
3,CNN\n — \n \n\n\n A high...,0
4,CNN\n — \n \n\n\n A Poli...,0
...,...,...
74,CNN\n — \n \n\n\n Firefi...,1
75,CNN\n — \n \n\n\n Six pe...,1
76,"Brisbane, Australia\nCNN\n — \n \n\...",0
77,"Kyiv, Ukraine\nCNN\n — \n \n\n\n ...",1


In [37]:
train_set, test_set = all_news_shuffled[20:], all_news_shuffled[:20]
train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

# Clean data

In [27]:
#importing libraries
!pip install scikit-learn
!pip install nltk
import numpy as np 
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.1/774.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2024.5.10


In [47]:
#stopword removal and lemmatization
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/ucloud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ucloud/nltk_data...


In [48]:
train_set.head()

Unnamed: 0,raw_text,class
0,Reuters\n — \n \n\n\n Mo...,1
1,CNN\n — \n \n\n\nSierra Leone’s Pre...,1
2,CNN\n — \n \n\n\nIndia’s Prime Mini...,0
3,"Lagos, Nigeria\nCNN\n — \n \n\n\n ...",0
4,Istanbul/London\nCNN\n — \n \n\n\n ...,0


In [49]:
train_X_non = train_set['raw_text']
train_y = train_set['class']
test_X_non = test_set['raw_text']
test_y = test_set['class']
train_X=[]
test_X=[]

In [50]:
train_X_non[0]

'Reuters\n        \xa0—\xa0\n    \n\n\n            More than 100 people died after a ferry boat sank off the northern coast of Mozambique, President Filipe Nyusi said on Monday, and almost 20 others were still missing.\n    \n\n            An official from the country’s Maritime Transport Institute (INTRASMAR) said the vessel carrying 130 passengers was an overloaded fishing boat and was not licensed to transport people.\n    \n\n            It was ferrying people from Lunga in Nampula province to Mozambique Island on Sunday, Lourenco Machado, an administrator of INTRASMAR, said on state television, adding that initial reports indicated that it was hit by a tidal wave.\n    \n\n            The passengers were reportedly fleeing a cholera outbreak, the Office of the Secretary of State for Nampula province said in a statement, adding that 10 people had been rescued and nearly 20 others were still missing.\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAlfredo Zuniga/UNICEF Mozambique/Reuters\n\n\n

In [51]:
#text pre processing
for i in range(0, len(train_X_non)):
    text = re.sub('[^a-zA-Z]', ' ', train_X_non[i])
    text = re.sub(r'CNN', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\xa0', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in set(stopwords)]
    text = ' '.join(text)
    train_X.append(text)

#text pre processing
for i in range(0, len(test_X_non)):
    text = re.sub('[^a-zA-Z]', ' ', test_X_non[i])
    text = re.sub(r'CNN', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\xa0', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in set(stopwords)]
    text = ' '.join(text)
    test_X.append(text)

In [52]:
train_X[10]

'kuala lumpur malaysia two malaysian navy helicopter collided mid air rehearsal naval parade tuesday killing crew member aboard navy said statement incident occurred lumut naval base western state perak local time navy said victim confirmed dead scene sent lumut naval base military hospital identification navy said video circulating local medium showed several helicopter flying formation one chopper rotor clipped another aircraft crashed ground local police confirmed footage genuine first responder inspect helicopter crash site lumut perak state malaysia april terence tan ministry communication information ap navy said would investigate cause collision defense minister mohamed khaled nordin said aircraft maritime operation helicopter fennec military chopper rehearsing parade celebrating th anniversary royal malaysian navy due held saturday effort underway verify identity crew member killed age told reporter story updated additional information'

# Creating a Fake News Classifier using TF-IDF

In [53]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_X)
#applying tf idf to training data
X_train_tf = tf_idf.transform(train_X)

In [54]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 59, n_features: 5531


In [55]:
#transforming test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)

print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 20, n_features: 5531


# Naive Bayes Classifier

In [56]:
#naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

#predicted y
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [62]:
print(metrics.classification_report(test_y, y_pred, target_names=['Fake_news', 'Real_news']))

              precision    recall  f1-score   support

   Fake_news       1.00      0.18      0.31        11
   Real_news       0.50      1.00      0.67         9

    accuracy                           0.55        20
   macro avg       0.75      0.59      0.49        20
weighted avg       0.78      0.55      0.47        20



In [60]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[2 9]
 [0 9]]


# Doing a Test Prediction on Reviews Classifier Using TF-IDF