In [1]:
import numpy as np #provides arrays
import pandas as pd #for data cleaning
import re  #for making patterns
from nltk.corpus import stopwords # the in for of in with these are stop words in English literature.
from nltk.stem.porter import PorterStemmer  # finding the stem word e.g. Loving Loved == Love
from sklearn.feature_extraction.text import TfidfVectorizer # converting the word into vector e.g. Love == [0,0]
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
news_df = pd.read_csv('fake_news_train_clean.csv')

In [4]:
news_df.head(10)

Unnamed: 0.1,Unnamed: 0,title_txt,label
0,0,hous dem aid didnt even see comey letter jason...,1
1,1,flynn hillari clinton big woman campu breitba...,0
2,2,truth might get fire truth might get fire octo...,1
3,3,15 civilian kill singl us airstrik identifi vi...,1
4,4,iranian woman jail fiction unpublish stori wom...,1
5,5,jacki mason hollywood would love trump bomb no...,0
6,7,benoît hamon win french socialist parti presid...,0
7,9,backchannel plan ukrain russia courtesi trump ...,0
8,10,obama organ action partner soroslink ‘indivis ...,0
9,11,bbc comedi sketch real housew isi caus outrag ...,0


In [5]:
news_df.shape

(18285, 3)

In [6]:
news_df.isnull()

Unnamed: 0.1,Unnamed: 0,title_txt,label
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
18280,False,False,False
18281,False,False,False
18282,False,False,False
18283,False,False,False


In [7]:
news_df.isna().sum()

Unnamed: 0    0
title_txt     0
label         0
dtype: int64

In [12]:
#Therefore, here we can see that there exists no null value.
news_df['title_txt'][4000] #checking a random news article

'pat caddel trump white hous look like fort apach attack media polit class veteran polit strategist pat caddel talk american health care act ahca possibl effect 2018 midterm elect siriusxm host matt boyl monday breitbart news daili smart peopl washington understood polit time peopl said ‘well wouldnt hurt democrat countri love caddel said convent media wisdom obamacar time passag  listen sever elect say ‘oh cours republican doom continu refer predict mani 20 hous republican could face tougher   campaign 2018 ahca theyr depend caddel argu far ill tell fairli compet person idea what bill that talk point headlin thrown trump administr congression peopl clue what dont know answer problem rais  time failur republican white hous entir oper set narr made import done  —   cede ground oppon basi ‘well know precondit go hurt go cut go destroy medicaid whatev conced ground conced lot continu  im sure next elect go health care one way caddel ad depend whether get someth fix whether peopl perceiv d

In [24]:
#stemming the articles:
ps = PorterStemmer()

def stemming(title_txt):
    stemmed_content = re.sub('[^a-zA-Z]',' ', title_txt)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

In [26]:
news_df['title_txt']

0        hous dem aid didnt even see comey letter jason...
1        flynn hillari clinton big woman campu  breitba...
2        truth might get fire truth might get fire octo...
3        15 civilian kill singl us airstrik identifi vi...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
18280    rapper ti trump poster child white supremaci r...
18281    nfl playoff schedul matchup odd  new york time...
18282    maci said receiv takeov approach hudson bay  n...
18283    nato russia hold parallel exercis balkan nato ...
18284    keep f35 aliv   david swanson author activist ...
Name: title_txt, Length: 18285, dtype: object

In [27]:
#Vector conversion:
X = news_df['title_txt'].values
Y = news_df['label'].values

In [30]:
print(X)

['hous dem aid didnt even see comey letter jason chaffetz tweet hous dem aid didnt even see comey letter jason chaffetz tweet darrel lucu octob 30 2016 subscrib jason chaffetz stump american fork utah  imag courtesi michael jolley avail creativ commonsbi licens apolog keith olbermann doubt worst person world week–fbi director jame comey accord hous democrat aid look like also know secondworst person well turn comey sent nowinfam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe didnt hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen — jason chaffetz jasoninthehous octob 28 2016 cours know case  comey actua

In [31]:
print(Y)

[1 0 1 ... 0 1 1]


In [32]:
#Vectorization:
vector = TfidfVectorizer()
vector.fit(X)
X = vector.transform(X)

In [33]:
print(X) #after vector conversion:

  (0, 142630)	0.048666288409831546
  (0, 142512)	0.009465190817547516
  (0, 141409)	0.043850614347817174
  (0, 141396)	0.07080431182576076
  (0, 141259)	0.03725813716202547
  (0, 141167)	0.01107568436312838
  (0, 140538)	0.050835754620152745
  (0, 139767)	0.038650186162916884
  (0, 139596)	0.02086921616054415
  (0, 139406)	0.01682285157545243
  (0, 139186)	0.01220509917841046
  (0, 139077)	0.029514728748615753
  (0, 138976)	0.012548398604460978
  (0, 138677)	0.01151606281854624
  (0, 137626)	0.025346411431608595
  (0, 137507)	0.031200821304044016
  (0, 136402)	0.021451953136138933
  (0, 134986)	0.06502161949531679
  (0, 133996)	0.03296838013137909
  (0, 133704)	0.016058735438742987
  (0, 132826)	0.037728884561656295
  (0, 132053)	0.010896891865925214
  (0, 132025)	0.042799752773695535
  (0, 131932)	0.13502534089832152
  (0, 131775)	0.07002977192214592
  :	:
  (18284, 9495)	0.08085898329563401
  (18284, 9170)	0.02077850583590922
  (18284, 9168)	0.02642082930036477
  (18284, 9111)	0.0301

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify = Y, random_state=1)

In [37]:
X_train.shape

(14628, 157938)

In [38]:
X_test.shape

(3657, 157938)

In [40]:
#Training of the model:
model = LogisticRegression()
model.fit(X_train, y_train)

In [42]:
y_preds_train = model.predict(X_train)
print("Train Accuracy: ", accuracy_score(y_preds_train, y_train))

Train Accuracy:  0.9793546622914958


In [43]:
y_preds_test = model.predict(X_test)
print("Train Accuracy: ", accuracy_score(y_preds_test, y_test))

Train Accuracy:  0.9554279464041564


In [67]:
#prediction system:
input_data = X_test[1000]
prediction = model.predict(input_data)

if prediction[0] == 1:
    print("Fake News")
else:
    print("Real News")

Fake News


In [68]:
news_df['title_txt'][1000]

'william sister could step semifin showdown us open  new york time william sister face arthur ash stadium semifin long stori career serena venu william play unit state open fourth round quarterfin twice final twice never tournament penultim round   serena william   unit state open champion hope pass steffi graf open era record 22 grand slam singl titl dealt challeng path friday draw unit state open set begin monday face 2014 semifinalist ekaterina makarova first round may meet former 1 ana ivanov third round current 5 simona halep quarterfin venu william 6 highest rank five year land quarter draw 4 agnieszka radwanska william 36 reach semifin wimbledon last month silver medal mix doubl rio olymp surpris singl gold medalist mónica puig puerto rico gain   seed 32 sloan stephen withdrew friday morn third round could meet 3 garbiñ muguruza thrash     olymp 8 madison key 28 coco vandewegh also muguruza quarter could face third round roberta vinci last year surpris finalist bottom quarter 2 

In [60]:
news_df["title_txt"][4]

'iranian woman jail fiction unpublish stori woman stone death adulteri print iranian woman sentenc six year prison iran revolutionari guard search home found notebook contain fiction stori shed written woman stone death accord eurasia review  golrokh ebrahimi irae 35 wife polit prison arash sadeghi 36 serv 19year prison sentenc human right activist public report intellig unit revolutionari guard came arrest husband raid apart – without warrant – found draft stori ebrahimi irae written articl state one confisc draft stori stone women death adulteri – never publish never present anyon articl state narr follow stori protagonist watch movi stone women islam law adulteri'