IMPORTING IMPORTANT LIBRARIES

In [106]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

DATA COLLECTION

In [107]:
# Load the news dataset from a path relative to the notebook's working directory.
# The CSV's first column has no header, so pandas names it 'Unnamed: 0'
# (visible in the outputs of the following cells).
news_data = pd.read_csv('data/fake_or_real_news.csv')

In [108]:
# Preview the first rows to see which columns are available.
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [109]:
# (rows, columns) of the loaded frame.
news_data.shape

(6335, 4)

DATA PREPROCESSING

In [110]:
# Count missing values per column; the output shows zero for every column,
# so no imputation or row dropping is needed.
news_data.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [111]:
# Inspect column dtypes; 'label' is stored as object (strings) at this point.
news_data.dtypes

Unnamed: 0     int64
title         object
text          object
label         object
dtype: object

In [112]:
#converting label dtypes from object to int (FAKE -> 1, REAL -> 0)
# Use an explicit mapping and validate first: a bare `1 if z=='FAKE' else 0`
# silently turns every other value (typos, different casing, NaN, or labels
# already converted by an earlier run of this cell) into 0, i.e. "real".
label_codes = {'FAKE': 1, 'REAL': 0}
unexpected_labels = set(news_data['label'].unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values {sorted(unexpected_labels, key=str)}; "
        "expected only 'FAKE'/'REAL' (has this cell already been run?)"
    )
news_data['label'] = news_data['label'].map(label_codes)

In [113]:
# Preview the frame again to check the label column after the conversion.
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [114]:
#building model based on title and labels
# .copy() makes this an independent frame, so the later assignment to
# news_data['title'] does not trigger pandas' SettingWithCopyWarning.
news_data = news_data[['title','label']].copy()

In [115]:
# Check that only the title and label columns remain.
news_data.head()

Unnamed: 0,title,label
0,You Can Smell Hillary’s Fear,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,1
2,Kerry to go to Paris in gesture of sympathy,0
3,Bernie supporters on Twitter erupt in anger ag...,1
4,The Battle of New York: Why This Primary Matters,0


STEMMING

In [116]:
# Single Porter stemmer instance, reused by the stem() helper defined below.
port_stemmer = PorterStemmer()

In [117]:
#downloading stopwords
# nltk itself is imported with the other libraries in the first cell, so a
# fresh "Restart & Run All" shows every dependency in one place.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\deepanshu
[nltk_data]     pathak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [118]:
# Show NLTK's English stop-word list (the words that stem() filters out).
# NOTE(review): this prints the whole list; a slice such as [:10] would keep
# the notebook output short.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [119]:
#helper function for stemming
def stem(content):
    """Normalize a piece of text into a string of stemmed keywords.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, Porter-stem the
    remaining words, and join them back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text (in this notebook: a news title).

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string if no token survives.
    """
    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the comprehension rebuilt the list
    # and scanned it linearly for every single word.
    english_stopwords = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stemmer.stem(token)
        for token in tokens
        if token not in english_stopwords
    )

In [120]:
# Replace each raw title with its stemmed, stop-word-free form.
# The column is overwritten in place, so re-running this cell would stem
# text that has already been stemmed.
news_data['title'] = news_data['title'].map(stem)

In [121]:
# Preview the titles after stemming.
news_data.head()

Unnamed: 0,title,label
0,smell hillari fear,1
1,watch exact moment paul ryan commit polit suic...,1
2,kerri go pari gestur sympathi,0
3,berni support twitter erupt anger dnc tri warn,1
4,battl new york primari matter,0


DATA SELECTION

In [122]:
# Features are the stemmed titles, targets are the 0/1 labels (both as arrays).
x, y = (news_data[column].values for column in ('title', 'label'))

In [123]:
# Display the feature array (stemmed title strings).
x

array(['smell hillari fear',
       'watch exact moment paul ryan commit polit suicid trump ralli video',
       'kerri go pari gestur sympathi', ...,
       'anti trump protest tool oligarchi inform',
       'ethiopia obama seek progress peac secur east africa',
       'jeb bush suddenli attack trump matter'],
      shape=(6335,), dtype=object)

In [124]:
# Display the target array (1 = FAKE, 0 = REAL, per the label conversion above).
y

array([1, 1, 0, ..., 1, 0, 0], shape=(6335,))

VECTORIZATION

In [125]:
# Learn the TF-IDF vocabulary and IDF weights from the stemmed titles.
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split further down, so test-set titles influence the vocabulary
# and IDF weights (data leakage). Consider splitting first and fitting on the
# training portion only.
vectorizer = TfidfVectorizer()
vectorizer.fit(x)

In [126]:
# Turn the titles into a sparse TF-IDF matrix.
# NOTE(review): x is rebound from the array of strings to the matrix, so
# re-running this cell on its own would pass the matrix back into transform().
x = vectorizer.transform(x)

In [127]:
# Print the sparse matrix: (row, column) coordinates with their TF-IDF values.
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 45882 stored elements and shape (6335, 6884)>
  Coords	Values
  (0, 2230)	0.5606827254461214
  (0, 2817)	0.31703295849951846
  (0, 5631)	0.7649346276717717
  (1, 1212)	0.3538286538491304
  (1, 2101)	0.44595444447238664
  (1, 3939)	0.3279355637571237
  (1, 4462)	0.26757273402295384
  (1, 4630)	0.24403179029804972
  (1, 4905)	0.29382045650093463
  (1, 5263)	0.29163264316148546
  (1, 5933)	0.3501812302002013
  (1, 6315)	0.13561903129801023
  (1, 6571)	0.21736822482437437
  (1, 6671)	0.27855534550881683
  (2, 2536)	0.5345737509664292
  (2, 2576)	0.3357697230253524
  (2, 3335)	0.41740393299177314
  (2, 4428)	0.37615749278439464
  (2, 6015)	0.5345737509664292
  (3, 215)	0.41916879208304547
  (3, 561)	0.2969312816565521
  (3, 1764)	0.3507454498930073
  (3, 2048)	0.4448394256850793
  (3, 5956)	0.27803614912134916
  (3, 6288)	0.3148734995784654
  :	:
  (6330, 5809)	0.3024865273895357
  (6331, 4474)	0.5790025183139886
  (6331, 4493)	0

MODEL TRAINING

In [128]:
#separating dataset into training and test data
# 80/20 split, stratified on the labels, with a fixed seed so the split is
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
    stratify=y,
)

In [129]:
# Train a logistic-regression classifier (default hyperparameters) on the
# training split of the TF-IDF features.
model = LogisticRegression()
model.fit(x_train, y_train)

MODEL EVALUATION

In [130]:
#accuracy of training data
x_train_prediction = model.predict(x_train)
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first. Accuracy itself is symmetric, so the value is
# unchanged, but the correct order matters as soon as another metric is used.
train_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [131]:
# Report the accuracy measured on the training split.
print('Accuracy score of training data: ', train_data_accuracy)

Accuracy score of training data:  0.9072612470402526


In [132]:
#accuracy of test data
x_test_prediction = model.predict(x_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first. Accuracy itself is symmetric, so the value is
# unchanged, but the correct order matters as soon as another metric is used.
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [133]:
# Report the accuracy measured on the held-out test split.
print('Accuracy score of testing data: ', test_data_accuracy)

Accuracy score of testing data:  0.8113654301499605


PREDICTIVE SYSTEM

In [134]:
def prediction(x_new):
    """Print whether the trained model classifies ``x_new`` as real or fake news.

    ``x_new`` is handed directly to ``model.predict``, so it must already be
    in the feature representation the model was trained on. Only the first
    prediction is inspected: 0 prints "Real News", anything else "Fake News".
    """
    first_label = model.predict(x_new)[0]
    verdict = "Real News" if first_label == 0 else "Fake News"
    print(verdict)

In [139]:
# Spot-check the predictive system on one test row (index 496 is an arbitrary example).
prediction(x_test[496])

Fake News


In [140]:
# True label of the same test row, for comparison with the prediction above (1 = FAKE).
print(y_test[496])

1
