In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# Path of the WELFake CSV file (a relative path, resolved against the working directory).
DATA_PATH = "WELFake_Dataset.csv"
# Read the whole CSV into a DataFrame.
dataset_primary = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first five rows of the freshly loaded frame.
dataset_primary.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# Data Preprocessing

In [6]:
# Give the CSV's 'Unnamed: 0' column the name 'id'.
column_renames = {'Unnamed: 0': 'id'}
dataset_primary = dataset_primary.rename(columns=column_renames)

In [7]:
# Count the missing values in each column.
dataset_primary.isna().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [8]:
# Preview the first five rows again, now that the column has been renamed.
dataset_primary.head(n=5)

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [9]:
# (rows, columns) of the frame before rows with missing values are dropped.
dataset_primary.shape

(72134, 4)

In [10]:
 # Drop every row that has a missing value in any column (dropna's defaults,
 # spelled out). The row index is not reset, so the surviving rows keep their
 # original labels.
 dataset_primary = dataset_primary.dropna(axis=0, how='any')

In [11]:
# Re-count the missing values per column to confirm that none remain after dropna.
dataset_primary.isna().sum()

id       0
title    0
text     0
label    0
dtype: int64

In [12]:
# Number of rows for each value of the label column (class balance).
dataset_primary['label'].value_counts()

1    36509
0    35028
Name: label, dtype: int64

In [13]:
# Display the frame after dropna; pandas abbreviates the middle rows of a long frame.
dataset_primary

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [14]:
#Stemming
ps = PorterStemmer()
# Build the English stop-word lookup once, as a set. Calling
# stopwords.words('english') for every single token and scanning the returned
# list each time made this transform needlessly slow on tens of thousands of rows.
# Requires the NLTK 'stopwords' corpus to be available locally.
english_stopwords = frozenset(stopwords.words('english'))


def stemming(text):
    """Normalise a piece of text into a string of space-separated Porter stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    removed, and each remaining token is stemmed with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text to clean (for example a news title).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = re.sub('[^a-zA-Z]', " ", text).lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in english_stopwords)


In [33]:
# Replace every title with its cleaned, stemmed form.
# NOTE(review): this overwrites the 'title' column with a function of itself,
# so running the cell a second time applies stemming to already-stemmed text.
dataset_primary['title'] = dataset_primary['title'].apply(stemming)

# Completion marker printed once the apply has finished.
print("FINALLY DONE")

FINALLY DONE


In [34]:
# Inspect the title column after stemming.
dataset_primary['title']

0        law enforc high alert follow threat cop white ...
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rai hindu use stori christian con...
4        satan russia unv imag terrifi new supernuk wes...
5        time christian group sue amazon splc design ha...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131           migrant refu leav train refug camp hungari
72132    trump tussl give unpopular mexican leader much...
72133            goldman sach endor hillari clinton presid
Name: title, Length: 71537, dtype: object

In [15]:
# Pull the model inputs (titles) and targets (labels) out of the frame as arrays.
X, y = dataset_primary['title'].values, dataset_primary['label'].values

In [16]:
# Print the array of titles that will be vectorized.
# NOTE(review): the execution counts in this notebook are out of order (cells
# numbered 33/34 appear before this one, numbered 16) and the saved output below
# shows unstemmed titles, so it was produced before the stemming cell ran.
# Restart the kernel and run all cells top to bottom to refresh it.
print(X)

['LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]'
 'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'
 'Bobby Jindal, raised Hindu, uses story of Christian conversion to woo evangelicals for potential 2016 bid'
 ... 'Migrants Refuse To Leave Train At Refugee Camp In Hungary'
 'Trump tussle gives unpopular Mexican leader much-needed shot in arm'
 'Goldman Sachs Endorses Hillary Clinton For President']


In [17]:
#vectorization
# Fit a TF-IDF vectorizer on the titles and turn them into a sparse feature
# matrix. The titles are read from the DataFrame column (the same values X was
# built from) so that this cell can be re-run: previously X was overwritten by
# its own transform, and a second run tried to fit the vectorizer on a sparse
# matrix instead of text. fit_transform is equivalent to fit followed by
# transform on the same data.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so vocabulary and IDF statistics from the test rows leak into
# training. Consider splitting first and fitting on the training titles only.
vector = TfidfVectorizer()
X = vector.fit_transform(dataset_primary['title'].values)

In [38]:
# Print the TF-IDF matrix; the output lists (row, column) positions with their stored weights.
print(X) 

  (0, 18778)	0.19119199199155718
  (0, 18329)	0.12953081995001134
  (0, 17059)	0.25417143325832803
  (0, 16956)	0.24861338779858738
  (0, 9516)	0.2281779531036376
  (0, 7726)	0.2673841297708583
  (0, 6585)	0.4856722738685227
  (0, 6283)	0.2892696681391208
  (0, 5394)	0.31817689197461024
  (0, 3602)	0.24869119486730346
  (0, 1761)	0.334723411455833
  (0, 398)	0.3194027206998875
  (1, 18329)	0.13424895962371275
  (1, 17722)	0.35973057550211296
  (1, 16151)	0.1998676543814105
  (1, 14816)	0.1608430986300803
  (1, 14318)	0.35810711329768474
  (1, 13342)	0.2268087560849422
  (1, 12513)	0.2790489798405972
  (1, 11796)	0.1686662338838085
  (1, 11653)	0.22306828908813514
  (1, 7858)	0.26921648198053316
  (1, 6733)	0.2652110968000934
  (1, 2866)	0.36407357799494927
  (1, 2623)	0.3081355901867282
  :	:
  (71533, 1727)	0.493209837089499
  (71533, 746)	0.39820088303705015
  (71534, 17311)	0.3977811428926476
  (71534, 13923)	0.32106436746604305
  (71534, 13921)	0.353708681267506
  (71534, 10735)	0.

# Train-test split and model training

In [39]:
# Hold out 25% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1234,
)

In [40]:
# (rows, features) of the training matrix.
X_train.shape

(53652, 19306)

In [41]:
# (rows, features) of the test matrix.
X_test.shape

(17885, 19306)

In [42]:
# Logistic-regression classifier. C is scikit-learn's inverse regularisation
# strength and max_iter caps the number of solver iterations.
model = LogisticRegression(
    C=5,
    max_iter=1000,
)
# Train on the training split only.
model.fit(X_train, y_train)

In [43]:
# Accuracy on the data the model was trained on.
train_predictions = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
# The printed accuracy is unchanged, but the call now matches the API contract.
print("train accuracy:" ,accuracy_score(y_train, train_predictions))

train accuracy: 0.9434876612241855


In [44]:
# Accuracy on the held-out test split.
test_predictions = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
# The printed accuracy is unchanged, but the call now matches the API contract.
print("test accuracy:" ,accuracy_score(y_test, test_predictions))

test accuracy: 0.8988537880905787


In [50]:
# Classify one held-out sample: row 20 of the test matrix.
input_data = X_test[20]
prediction = model.predict(input_data)
# A predicted label of 1 is reported as fake; any other label as authentic.
print('Fake News' if prediction[0] == 1 else 'Authentic News')

Fake News


In [51]:
 # Show the stemmed title stored under index label 3. The frame's index was not
 # reset after dropna, so this is the original row label rather than a position.
 dataset_primary['title'][3]

'bobbi jindal rai hindu use stori christian conver woo evangel potenti bid'

In [18]:
# Save the trained model and the fitted vectorizer as pickle files

In [52]:
import pickle

In [53]:
# Output file for the trained classifier.
pickled_final_model = 'pickled_final_model.pkl'
# Serialise the fitted model; the file is opened in binary write mode and
# closed automatically when the with-block exits.
with open(pickled_final_model, mode='wb') as file:
    pickle.dump(model, file)

In [54]:
# Output file for the fitted TF-IDF vectorizer.
pickled_vectorizer = 'pickled_vectorizer.pkl'
# Serialise the vectorizer; the file is opened in binary write mode and
# closed automatically when the with-block exits.
with open(pickled_vectorizer, mode='wb') as file:
    pickle.dump(vector, file)