INSTALLING KAGGLE AND DOWNLOADING DATASET

In [1]:
! pip install kaggle



In [2]:
# Mount Google Drive so its files are reachable under /content/drive
# (the Kaggle credentials file is copied from there further down).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
# Copy kaggle.json from the mounted Drive folder into ~/.kaggle/.
# NOTE(review): presumably this file holds the Kaggle API credentials - keep it out of shared outputs.
! cp /content/drive/MyDrive/Kaggle_API/kaggle.json ~/.kaggle/

In [5]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the abri08/real-time-news dataset archive with the Kaggle CLI
# (saved as real-time-news.zip in the current working directory).
! kaggle datasets download abri08/real-time-news

Dataset URL: https://www.kaggle.com/datasets/abri08/real-time-news
License(s): CC0-1.0
Downloading real-time-news.zip to /content
  0% 0.00/1.08k [00:00<?, ?B/s]
100% 1.08k/1.08k [00:00<00:00, 1.49MB/s]


In [7]:
! unzip real-time-news.zip

Archive:  real-time-news.zip
  inflating: real_time_news_dataset.csv  


IMPORTING DEPENDENCIES

In [8]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [10]:
import nltk
# Fetch the NLTK 'stopwords' corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Print NLTK's English stopword list - these are the words that stemming()
# filters out of the text later in the notebook.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

DATA PREPROCESSING

About the Dataset:

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: a label that marks whether the news article is real or fake:
           1: Fake news
           0: Real news





In [12]:
import pandas as pd

In [13]:
# Load the extracted CSV into a pandas DataFrame.
# NOTE(review): absolute Colab path - assumes the archive was unzipped in /content.
news_dataset = pd.read_csv('/content/real_time_news_dataset.csv')

Checking the number of rows and columns in the dataset

In [14]:
# (number of rows, number of columns) of the loaded DataFrame
news_dataset.shape

(10, 5)

In [15]:
# Display the first 5 rows of the DataFrame to eyeball the columns and values
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,1,COVID-19 Vaccine Causes Autism,John Doe,A controversial study claims a link between CO...,1
1,2,NASA Confirms Water on the Moon,Jane Smith,"In a groundbreaking discovery, NASA scientists...",0
2,3,Politician Found Stealing Votes,Anonymous,Social media is buzzing with rumors of vote ta...,1
3,4,Breakthrough in Cancer Research,Dr. Emily Roe,Scientists have announced a promising new trea...,0
4,5,Alien Sighting in Small Town,Alex Brown,Residents of a small town claim to have witnes...,1


In [16]:
# Count missing values per column (the author/title merge below assumes there are none)
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [17]:
# Build the model's input text: "<author> <title>".
# Missing values are replaced with '' first. String concatenation with NaN
# yields NaN, and a NaN 'content' value would make the regex-based cleaning
# in stemming() raise a TypeError.
news_dataset['content'] = (
    news_dataset['author'].fillna('') + ' ' + news_dataset['title'].fillna('')
)

In [18]:
# Inspect the merged "author + title" strings before any cleaning is applied
print(news_dataset['content'])

0              John Doe COVID-19 Vaccine Causes Autism
1           Jane Smith NASA Confirms Water on the Moon
2            Anonymous Politician Found Stealing Votes
3        Dr. Emily Roe Breakthrough in Cancer Research
4              Alex Brown Alien Sighting in Small Town
5                Chris Johnson 5G Causes Health Issues
6           Michael Lee Economy Rebounds Post-Pandemic
7    Dr. Sarah Green Scientists Achieve Nuclear Fus...
8       Tabloid Weekly Celebrity Marries Alien Partner
9    Conspiracy Times Wildfires Blamed on Secret Go...
Name: content, dtype: object


In [19]:
# Target: the 'label' column. Features: every remaining column.
# (X and Y are reassigned after stemming, once 'content' has been cleaned.)
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [20]:
# Show the feature frame (all columns except 'label') followed by the label Series
print(X)
print(Y)

   id                                           title            author  \
0   1                  COVID-19 Vaccine Causes Autism          John Doe   
1   2                 NASA Confirms Water on the Moon        Jane Smith   
2   3                 Politician Found Stealing Votes         Anonymous   
3   4                 Breakthrough in Cancer Research     Dr. Emily Roe   
4   5                    Alien Sighting in Small Town        Alex Brown   
5   6                         5G Causes Health Issues     Chris Johnson   
6   7                  Economy Rebounds Post-Pandemic       Michael Lee   
7   8  Scientists Achieve Nuclear Fusion Breakthrough   Dr. Sarah Green   
8   9                 Celebrity Marries Alien Partner    Tabloid Weekly   
9  10   Wildfires Blamed on Secret Government Project  Conspiracy Times   

                                                text  \
0  A controversial study claims a link between CO...   
1  In a groundbreaking discovery, NASA scientists...   
2  Soc

STEMMING

In [21]:
from nltk.stem.porter import PorterStemmer

In [22]:
import re

In [23]:
from nltk.corpus import stopwords

In [24]:
# NOTE(review): repeats the stopwords download done in the dependencies
# section; harmless, since nltk reports the package as already up-to-date.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Single PorterStemmer instance, reused by stemming() for every word of every row
port_stem = PorterStemmer()

In [26]:
def stemming(content):
    """Normalise a text string so it can be vectorised.

    The text is cleaned in this order: every character that is not an ASCII
    letter is replaced by a space, the result is lowercased and split on
    whitespace, English stopwords are dropped, and each remaining word is
    reduced to its Porter stem.

    Note that digits are removed as well, so a token such as '5G' is
    reduced to 'g'.

    Args:
        content: Raw text to clean (in this notebook: author name + title).

    Returns:
        The stemmed words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # Build the stopword collection once per call, as a set, so the word list
    # is not re-read and linearly scanned for every single word.
    english_stopwords = set(stopwords.words('english'))
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in english_stopwords
    )

APPLY STEMMING TO THE CONTENT COLUMN

In [27]:
# Replace every raw 'content' string with its cleaned, stemmed form.
# NOTE(review): this overwrites the column in place, so re-running the cell
# feeds already-stemmed text through stemming() again.
news_dataset['content'] = news_dataset['content'].apply(stemming)

PRINT THE CONTENT COLUMN TO CONFIRM IT IS NOW LOWERCASE AND STEMMED

In [28]:
# Inspect the result: lowercase, letters only, stopwords removed, words stemmed
print(news_dataset['content'])

0                    john doe covid vaccin caus autism
1                   jane smith nasa confirm water moon
2                   anonym politician found steal vote
3            dr emili roe breakthrough cancer research
4                    alex brown alien sight small town
5                      chri johnson g caus health issu
6              michael lee economi rebound post pandem
7    dr sarah green scientist achiev nuclear fusion...
8            tabloid weekli celebr marri alien partner
9    conspiraci time wildfir blame secret govern pr...
Name: content, dtype: object


In [29]:
# Separate model input and target as arrays:
#   X - the stemmed 'content' strings, Y - the 'label' values (1 = fake, 0 = real).
# This replaces the earlier DataFrame/Series versions of X and Y.
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [30]:
# The stemmed text of each article, one string per row
print(X)

['john doe covid vaccin caus autism' 'jane smith nasa confirm water moon'
 'anonym politician found steal vote'
 'dr emili roe breakthrough cancer research'
 'alex brown alien sight small town' 'chri johnson g caus health issu'
 'michael lee economi rebound post pandem'
 'dr sarah green scientist achiev nuclear fusion breakthrough'
 'tabloid weekli celebr marri alien partner'
 'conspiraci time wildfir blame secret govern project']


In [31]:
# The labels in row order (1 = fake, 0 = real)
print(Y)

[1 0 1 0 1 1 0 0 1 1]


In [32]:
# One label per article
Y.shape

(10,)

CONVERTING CONTENT INTO NUMERICAL FEATURES, AS THE MODEL ONLY UNDERSTANDS NUMBERS

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Turn the stemmed text into a sparse TF-IDF matrix (one row per article,
# one column per vocabulary term). fit_transform learns the vocabulary/IDF
# weights and encodes X in a single step.
# NOTE(review): the vectorizer is fitted on ALL rows before the train/test
# split, so vocabulary and IDF statistics from the test rows leak into the
# features. For an unbiased test score, split first and fit on the training
# rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [35]:
# Sparse matrix printout: (row, vocabulary index) followed by the TF-IDF weight
print(X)

  (0, 4)	0.41802398937415175
  (0, 9)	0.35535858163071754
  (0, 14)	0.41802398937415175
  (0, 15)	0.41802398937415175
  (0, 26)	0.41802398937415175
  (0, 52)	0.41802398937415175
  (1, 12)	0.408248290463863
  (1, 25)	0.408248290463863
  (1, 31)	0.408248290463863
  (1, 32)	0.408248290463863
  (1, 47)	0.408248290463863
  (1, 54)	0.408248290463863
  (2, 3)	0.4472135954999579
  (2, 19)	0.4472135954999579
  (2, 36)	0.4472135954999579
  (2, 48)	0.4472135954999579
  (2, 53)	0.4472135954999579
  (3, 6)	0.3642958904763434
  (3, 8)	0.42853734036956914
  (3, 16)	0.3642958904763434
  (3, 18)	0.42853734036956914
  (3, 40)	0.42853734036956914
  (3, 41)	0.42853734036956914
  (4, 1)	0.41802398937415175
  (4, 2)	0.35535858163071754
  :	:
  (6, 30)	0.408248290463863
  (6, 34)	0.408248290463863
  (6, 37)	0.408248290463863
  (6, 39)	0.408248290463863
  (7, 0)	0.3664870124440271
  (7, 6)	0.31154744282300645
  (7, 16)	0.31154744282300645
  (7, 20)	0.3664870124440271
  (7, 22)	0.3664870124440271
  (7, 33)	0.3

SPLITTING THE DATASET INTO TRAINING AND TESTING DATA

In [36]:
from sklearn.model_selection import train_test_split

In [51]:
# Hold out half of the rows for testing. stratify=Y keeps the fake/real
# proportions the same in both halves; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    stratify=Y,
    random_state=2,
)

TRAINING THE MODEL : LOGISTIC REGRESSION

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
# Logistic-regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [54]:
# Train on the TF-IDF features and labels of the training half only
model.fit(X_train, Y_train)

EVALUATION

ACCURACY SCORE

In [55]:
from sklearn.metrics import accuracy_score

In [56]:
# Accuracy on the training data (the rows the model was fitted on).
X_train_prediction = model.predict(X_train)
# accuracy_score's documented signature is (y_true, y_pred). Accuracy itself
# is symmetric, but keeping the documented order avoids wrong results if this
# cell is later adapted to an asymmetric metric such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [57]:
# Fraction of training rows classified correctly
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [58]:
# Accuracy on the held-out test data (rows the model was not fitted on).
X_test_prediction = model.predict(X_test)
# accuracy_score's documented signature is (y_true, y_pred). Accuracy itself
# is symmetric, but keeping the documented order avoids wrong results if this
# cell is later adapted to an asymmetric metric such as precision or recall.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [59]:
# Fraction of held-out test rows classified correctly
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.6


MAKING A PREDICTIVE SYSTEM

In [65]:
# Classify one held-out article: row 2 of the test matrix.
X_new = X_test[2]
prediction = model.predict(X_new)
print(prediction)

# Label convention: 0 = real news, anything else (1) = fake news.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[1]
The news is Fake


In [None]:
# True label of the article classified above. The prediction was made for
# X_test[2], so the matching ground truth is Y_test[2], not Y_test[0].
print(Y_test[2])

1
