In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
news_df = pd.read_csv('WELFake_Dataset.csv')

In [None]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [7]:
news_df.shape

(72134, 4)

In [8]:
news_df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [9]:
news_df = news_df.fillna(' ')

In [10]:
news_df.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [11]:
news_df['content'] = news_df['title']+" "+news_df['text']

In [12]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...


In [13]:
news_df['content'][72130]

' WATCH: Giuliani Demands That Democrats Apologize For Trump’s Racist Birtherism You know, because in fantasyland Republicans never questioned the citizenship of America s first black president.But that s exactly what they did for years and Donald Trump led the charge by going on or calling into Fox News every chance he could in order to demand President Obama s birth certificate.It was Trump who constantly claimed President Obama was born in Kenya even though Hawaii newspapers from the time announce Obama s birth.Trump and his birther fans were finally utterly humiliated when President Obama released his birth certificate to the public.And now that birtherism is coming back to bite Trump on the ass as he tries to convince black voters that he isn t a racist.Part of the reason why black voters reject Trump is because he questioned the citizenship of President Obama simply because he is black. But Trump wants everyone to pretend he didn t say any nasty things about President Obama and T

In [14]:
df = news_df.sample(20000)

In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\MOHD
[nltk_data]     SAAD\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
df['label'].value_counts()

label
1    10349
0     9651
Name: count, dtype: int64

In [17]:
# Stemming
ps = PorterStemmer()

def stemming(content):
  stemmend_content = re.sub('[^a-zA-Z]',' ',content)
  stemmend_content = stemmend_content.lower()
  stemmend_content = stemmend_content.split()
  stemmend_content = [ps.stem(word) for word in stemmend_content]
  stemmend_content = ' '.join(stemmend_content)
  return stemmend_content

In [18]:
news_df['content'] = news_df['content'].apply(stemming)

In [19]:
news_df['content'][72130]

'watch giuliani demand that democrat apolog for trump s racist birther you know becaus in fantasyland republican never question the citizenship of america s first black presid but that s exactli what they did for year and donald trump led the charg by go on or call into fox news everi chanc he could in order to demand presid obama s birth certif it wa trump who constantli claim presid obama wa born in kenya even though hawaii newspap from the time announc obama s birth trump and hi birther fan were final utterli humili when presid obama releas hi birth certif to the public and now that birther is come back to bite trump on the ass as he tri to convinc black voter that he isn t a racist part of the reason whi black voter reject trump is becaus he question the citizenship of presid obama simpli becaus he is black but trump want everyon to pretend he didn t say ani nasti thing about presid obama and trump pawn rudi giuliani is demand that democrat apolog for trump s birther giuliani who h

In [20]:
X = news_df['content'].values
y = news_df['label'].values

In [21]:
print(X)

['law enforc on high alert follow threat against cop and white on by blacklivesmatt and fyf terrorist video no comment is expect from barack obama member of the fyf or fukyoflag and blacklivesmatt movement call for the lynch and hang of white peopl and cop they encourag other on a radio show tuesday night to turn the tide and kill white peopl and cop to send a messag about the kill of black peopl in america one of the f yoflag organ is call sunshin she ha a radio blog show host from texa call sunshin s f ing opinion radio show a snapshot of her fyf lolatwhitefear twitter page at p m show that she wa urg support to call now fyf tonight we continu to dismantl the illus of white below is a snapshot twitter radio call invit fyf the radio show air at p m eastern standard time dure the show caller clearli call for lynch and kill of white peopl a minut clip from the radio show can be heard here it wa provid to breitbart texa by someon who would like to be refer to as hannib he ha alreadi rece

In [22]:
vector = TfidfVectorizer()
vector.fit(X)
X = vector.transform(X)

In [23]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 17105460 stored elements and shape (72134, 162252)>
  Coords	Values
  (0, 624)	0.008104853079307873
  (0, 940)	0.01691096126800681
  (0, 1284)	0.015370009700767984
  (0, 2133)	0.04643439802613155
  (0, 2221)	0.014990305076025775
  (0, 2223)	0.010982077028512754
  (0, 2765)	0.06676176206621502
  (0, 2786)	0.01790827461989413
  (0, 3617)	0.026470740558609027
  (0, 3956)	0.008914371512914033
  (0, 4002)	0.024318548154627
  (0, 4267)	0.02112557644629626
  (0, 4338)	0.044751401601712586
  (0, 4611)	0.017662275118483905
  (0, 4850)	0.013400979479553066
  (0, 4866)	0.022008514346794607
  (0, 5225)	0.007145538020182149
  (0, 5392)	0.12121295758995135
  (0, 6017)	0.012920862858965154
  (0, 6511)	0.05072354187596246
  (0, 6849)	0.014066338460660776
  (0, 7387)	0.0808445855682592
  (0, 8063)	0.0331926851236094
  (0, 8441)	0.11204214049213164
  (0, 8699)	0.04171543846248927
  :	:
  (72133, 141128)	0.01732341252445897
  (72133, 141174)	0

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=1)

In [25]:
X_train.shape

(57707, 162252)

In [26]:
X_test.shape

(14427, 162252)

In [27]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [28]:
lr_pred_train = model.predict(X_train)
print('Train Accuracy :-'),accuracy_score(lr_pred_train,y_train)*100

Train Accuracy :-


(None, 95.92597085275617)

In [29]:
lr_pred_test = model.predict(X_test)
print('Test Accuracy :-'),accuracy_score(lr_pred_test,y_test)*100

Test Accuracy :-


(None, 94.67664795175712)

In [34]:
# Prediction System

input_data = X_test[1234]
prediction = model.predict(input_data)
if prediction[0] == 1:
  print('The News Is Fake')
else:
  print('The News Is Real')

The News Is Real
