<a href="https://colab.research.google.com/github/DEBASMITA-DASH/Fake-News-Prediction/blob/main/Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import re   #regular expression - needed for searching text in a document
from nltk.corpus import stopwords   #stopwords (a,the,I, who, what - not much imp words in a para) we need to remove such words in this context
from nltk.stem.porter import PorterStemmer    #stemming - remove prefix and suffix and returns the rootword
from sklearn.feature_extraction.text import TfidfVectorizer    # TfidfVectorizer is used to convert text to numerical values
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## 0 : Fake News
## 1 : Real News

In [6]:
# Download NLTK's stopword corpus, which stopwords.words('english') reads below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
print(stopwords.words('english'))   # show the English stopwords that stemming() filters out

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Pre-processing


In [8]:
# Load the dataset from a hardcoded Colab path; adjust the path when running elsewhere.
# NOTE(review): 'unicode_escape' is an unusual encoding for a CSV file -- presumably a
# workaround for decode errors; confirm it does not mangle backslashes in the text.
df = pd.read_csv('/content/Fake Dataset.csv',encoding= 'unicode_escape')
df.head()

Unnamed: 0,id,title,text,source,label
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,0


In [9]:
# Count the missing values in each column
df.isnull().sum()

id        0
title     0
text      0
source    0
label     0
dtype: int64

In [10]:
# Merge the title and source columns into a new 'content' column, joined by ' : '
df['content'] = df['title']+ ' : '+df['source']

In [11]:
# Preview the frame to confirm the new 'content' column
df.head()

Unnamed: 0,id,title,text,source,label,content
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,0,Syria attack symptoms consistent with nerve ag...
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,0,Homs governor says U.S. attack caused deaths b...
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,0,Death toll from Aleppo bomb attack at least 11...
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,0,Aleppo bomb blast kills six Syrian state TV : nna
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,0,29 Syria Rebels Dead in Fighting for Key Alepp...


In [12]:
#TODO: drop the title and source columns (no drop is performed in this cell; the later cells only read 'content' and 'label')

# Stemming
###### e.g.: acting, acted --> act

In [13]:
# Single stemmer instance, reused by stemming() for every word
port_stem = PorterStemmer()

In [14]:
def stemming(content):
    """Clean a text string: keep letters only, lowercase, drop stopwords, stem.

    Parameters
    ----------
    content : str
        Raw text to normalize.

    Returns
    -------
    str
        The Porter stems of the remaining (non-stopword) tokens, joined by
        single spaces.
    """
    # Build the stopword set once per call. Calling stopwords.words('english')
    # inside the comprehension would rebuild the list for every token and make
    # each membership check a linear scan.
    english_stopwords = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)   # anything that is not a-z / A-Z becomes a space
    tokens = letters_only.lower().split()               # lowercase, then split on whitespace

    # Stem every token that is not a stopword
    stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in english_stopwords]
    return ' '.join(stemmed_tokens)


In [15]:
# Clean and stem every row. NOTE: this overwrites 'content' in place, so
# re-running the cell applies stemming() to already-stemmed text.
df['content'] = df['content'].apply(stemming)

In [35]:
# Separating the text features and the labels into two arrays

x = df['content'].values     # df['content'] alone is a Series (index + values);
                             # df['content'].values gives only the values as an array.
y = df['label'].values

In [39]:
#print(x)
#print(y)


In [36]:
# Convert the textual data into numerical TF-IDF feature vectors.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so the test documents influence the vocabulary and IDF
# weights. Fitting on the training split only would avoid this leakage.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary/IDF and vectorizes in a single pass,
# replacing a separate fit() followed by transform() on the same data.
x = vectorizer.fit_transform(x)

In [37]:
# Print the sparse matrix: each line is (row, feature index) followed by the TF-IDF weight
print(x)

  (0, 737)	0.33268695746858934
  (0, 675)	0.1123974588039333
  (0, 674)	0.4398385037739128
  (0, 471)	0.31752877227058074
  (0, 465)	0.4143530883600743
  (0, 156)	0.4398385037739128
  (0, 71)	0.154270763028172
  (0, 21)	0.4398385037739128
  (1, 612)	0.34738477945645097
  (1, 603)	0.22200068141769194
  (1, 471)	0.2507844619783276
  (1, 407)	0.30189759411838357
  (1, 334)	0.34738477945645097
  (1, 324)	0.20673629748416938
  (1, 301)	0.30189759411838357
  (1, 211)	0.34738477945645097
  (1, 188)	0.24114602484679362
  (1, 122)	0.3272563792892771
  (1, 95)	0.34738477945645097
  (1, 71)	0.12184316409612725
  (2, 702)	0.47437284668556384
  (2, 471)	0.4933331959701538
  (2, 394)	0.3389939588680877
  (2, 188)	0.47437284668556384
  (2, 99)	0.2929163001914418
  :	:
  (800, 224)	0.5271482201641406
  (800, 36)	0.18014237842437827
  (801, 676)	0.2680381438230018
  (801, 599)	0.39474553415539787
  (801, 413)	0.3291797399815204
  (801, 381)	0.1517457067444092
  (801, 270)	0.5492975843885451
  (801, 130

## Splitting training and testing data

In [60]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=50,
)

In [53]:
# Create the classifier with scikit-learn's default settings
model = LogisticRegression()

In [54]:
# Fit the classifier on the training split (run the train/test split cell first,
# and re-run this cell whenever the split changes)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluation
**Accuracy Scores of training and testing data**

In [61]:
# Accuracy on the training data
x_train_pred = model.predict(x_train)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second
training_accuracy = accuracy_score(y_train, x_train_pred)
training_accuracy

0.708185053380783

In [62]:
# Accuracy on the testing data
x_test_predict = model.predict(x_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second
testing_accuracy = accuracy_score(y_test, x_test_predict)
testing_accuracy

0.7024793388429752

## Making a Predictive System

In [70]:
# Take one vectorized test sample (row 51) and classify it
x_new = x_test[51]

prediction = model.predict(x_new)
print(prediction)

# Label 1 means real news; any other label is reported as fake
verdict = 'The news is Real' if prediction[0] == 1 else 'The news is Fake'
print(verdict)

[0]
The news is Fake


In [71]:
# Check the prediction against the true label of the same test row (index 51)
print(y_test[51])

0
