<a href="https://colab.research.google.com/github/Farwakhan971/A.I-Lab/blob/main/AI_Fake_News_Prediction_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/drive/MyDrive/Fake news dataset/FakeNewsNet.csv', encoding='unicode_escape')

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-processing

In [6]:
news_dataset.shape

(23196, 5)

In [7]:
# print the first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,label
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [8]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
label              0
dtype: int64

In [9]:
news_dataset = news_dataset.fillna('')

In [10]:
news_dataset['content'] = news_dataset['title']+' '+news_dataset['source_domain']

In [11]:
print(news_dataset['content'])

0        Kandi Burruss Explodes Over Rape Accusation on...
1        People's Choice Awards 2018: The best red carp...
2        Sophia Bush Sends Sweet Birthday Message to 'O...
3        Colombian singer Maluma sparks rumours of inap...
4        Gossip Girl 10 Years Later: How Upper East Sid...
                               ...                        
23191    Pippa Middleton wedding: In case you missed it...
23192    Zayn Malik & Gigi Hadidâs Shocking Split: Wh...
23193    Jessica Chastain Recalls the Moment Her Mother...
23194    Tristan Thompson Feels "Dumped" After KhloÃ© K...
23195    Kelly Clarkson Performs a Medley of Kendrick L...
Name: content, Length: 23196, dtype: object


In [12]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset[['label']]

In [13]:
print(X)
print(Y)

                                                   title  \
0      Kandi Burruss Explodes Over Rape Accusation on...   
1      People's Choice Awards 2018: The best red carp...   
2      Sophia Bush Sends Sweet Birthday Message to 'O...   
3      Colombian singer Maluma sparks rumours of inap...   
4      Gossip Girl 10 Years Later: How Upper East Sid...   
...                                                  ...   
23191  Pippa Middleton wedding: In case you missed it...   
23192  Zayn Malik & Gigi Hadidâs Shocking Split: Wh...   
23193  Jessica Chastain Recalls the Moment Her Mother...   
23194  Tristan Thompson Feels "Dumped" After KhloÃ© K...   
23195  Kelly Clarkson Performs a Medley of Kendrick L...   

                                                news_url  \
0      http://toofab.com/2017/05/08/real-housewives-a...   
1      https://www.today.com/style/see-people-s-choic...   
2      https://www.etonline.com/news/220806_sophia_bu...   
3      https://www.dailymail.co.uk/news

Stemming:

Stemming is the process of reducing a word to its Root word

example:
reform, deform --> form

In [14]:
port_stem = PorterStemmer()

In [15]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [16]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [17]:
print(news_dataset['content'])

0        kandi burruss explod rape accus real housew at...
1        peopl choic award best red carpet look www tod...
2        sophia bush send sweet birthday messag one tre...
3        colombian singer maluma spark rumour inappropr...
4        gossip girl year later upper east sider shock ...
                               ...                        
23191    pippa middleton wed case miss pippa marri lace...
23192    zayn malik gigi hadid shock split chanc reunit...
23193    jessica chastain recal moment mother boyfriend...
23194    tristan thompson feel dump khlo kardashian ref...
23195    kelli clarkson perform medley kendrick lamar h...
Name: content, Length: 23196, dtype: object


In [18]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [19]:
print(X)

['kandi burruss explod rape accus real housew atlanta reunion video toofab com'
 'peopl choic award best red carpet look www today com'
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva www etonlin com'
 ...
 'jessica chastain recal moment mother boyfriend slap kick genit www justjar com'
 'tristan thompson feel dump khlo kardashian refus let move la home exclus www intouchweekli com'
 'kelli clarkson perform medley kendrick lamar humbl hit billboard music award www billboard com']


In [20]:
print(Y)

[1 1 1 ... 1 0 1]


In [21]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [22]:
print(X)

  (0, 13681)	0.21108934405813048
  (0, 12980)	0.29467349125048065
  (0, 10569)	0.2543303205934995
  (0, 10276)	0.23128301802698972
  (0, 10223)	0.30215839846863407
  (0, 6647)	0.3874939993343693
  (0, 5891)	0.2612462026749161
  (0, 4193)	0.3550616497873391
  (0, 2460)	0.05082186658865081
  (0, 1707)	0.3874939993343693
  (0, 651)	0.3073756565758506
  (0, 59)	0.2597097788538541
  (1, 14274)	0.10189043319535993
  (1, 12946)	0.36817044163275126
  (1, 10347)	0.37702288594702094
  (1, 9476)	0.23697414575567807
  (1, 7432)	0.33284454859410584
  (1, 2460)	0.08159478582438673
  (1, 2219)	0.42251489448694257
  (1, 1916)	0.3875096512688879
  (1, 1101)	0.33844294892026705
  (1, 729)	0.3083771474680274
  (2, 14274)	0.05440697837598742
  (2, 13118)	0.3043944712032482
  (2, 12381)	0.2330583052370701
  :	:
  (23194, 10393)	0.31860580642240854
  (23194, 8384)	0.28221547915154177
  (23194, 7230)	0.3087048153381826
  (23194, 6994)	0.26381957644897924
  (23194, 6797)	0.2878505190105094
  (23194, 6661)	0.1

Splitting the dataset to training & test data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=2)

Training the Model: Logistic Regression

In [24]:
model = LogisticRegression()

In [25]:
model.fit(X_train, Y_train)

In [26]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

Evaluation

accuracy score

In [27]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.884026729898685


In [28]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [29]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.847198275862069


Making a Predictive System

In [30]:
X_new = X_test[3]
prediction = model.predict(X_new)
print(prediction)
if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [31]:
print(Y_test[3])

0
