In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('WELFake_Dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
df.shape

(72134, 4)

In [4]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.shape

(71537, 4)

In [7]:
df.reset_index(inplace=True)

In [8]:
df.head()

Unnamed: 0.1,index,Unnamed: 0,title,text,label
0,0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

corpus=[]
stopwords_set = stopwords.words('english')
ps = PorterStemmer()
lemmatize = WordNetLemmatizer()

### by using stemming

In [10]:

for i in range(len(df)):
    text = re.sub('[^a-zA-Z]',' ',df['title'][i])
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords_set]
    text = ' '.join(text)
    corpus.append(text)

### Now we will use bag of words

In [14]:
cv = CountVectorizer(max_features=10000,ngram_range=(1,4))
# here max_feature we will take all those 10000 words most occuring words as a feature
# ngram_range take combination of 1 word 2 word 3 word and 4 words

X = cv.fit_transform(corpus).toarray()

In [13]:
X.shape

(71537, 10000)

In [15]:
y =df['label']
y

0        1
1        1
2        0
3        1
4        1
        ..
71532    0
71533    1
71534    0
71535    0
71536    1
Name: label, Length: 71537, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [21]:
## just seeing the top 50 feature that we created
cv.get_feature_names()[:50]

['abadi',
 'abandon',
 'abba',
 'abc',
 'abc news',
 'abduct',
 'abdullah',
 'abe',
 'abedin',
 'abil',
 'abl',
 'aboard',
 'abort',
 'abort law',
 'abort video',
 'abroad',
 'abruptli',
 'absolut',
 'absolut destroy',
 'absurd',
 'abu',
 'abus',
 'academ',
 'academi',
 'acceler',
 'accent',
 'accept',
 'accept elect',
 'accept elect result',
 'accept trump',
 'access',
 'access pipelin',
 'access pipelin protest',
 'accid',
 'accident',
 'accomplish',
 'accord',
 'account',
 'accur',
 'accus',
 'accus obama',
 'accus sexual',
 'accus trump',
 'achiev',
 'acid',
 'acknowledg',
 'aclu',
 'acosta',
 'acquit',
 'acquitt']

In [22]:
# here we can understand what are the parameters use here
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 10000,
 'min_df': 1,
 'ngram_range': (1, 4),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [23]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [25]:
from sklearn.metrics import confusion_matrix,accuracy_score
classifier.fit(X_train,Y_train)
pred = classifier.predict(X_test)
accuracy = accuracy_score(Y_test,pred)
print(accuracy)
cn  = confusion_matrix(Y_test,pred)

0.8777187587363713


## apply hyper parameter tunning

In [26]:
import numpy as np

In [27]:
new=0
classifier = MultinomialNB(alpha=0.1)
for alpha in np.arange(0,1,0.1):
    parameter = MultinomialNB(alpha=alpha)
    parameter.fit(X_train,Y_train)
    y_pred = parameter.predict(X_test)
    accuracy  = accuracy_score(Y_test,y_pred)
    if accuracy> new:
        classifier = parameter
    print("ALPHA : {}, Score {}".format(alpha,accuracy))



ALPHA : 0.0, Score 0.8748672071568353
ALPHA : 0.1, Score 0.8771037181996086
ALPHA : 0.2, Score 0.8772714565278166
ALPHA : 0.30000000000000004, Score 0.8775510204081632
ALPHA : 0.4, Score 0.8780542353927873
ALPHA : 0.5, Score 0.8784456248252726
ALPHA : 0.6000000000000001, Score 0.8782778864970646
ALPHA : 0.7000000000000001, Score 0.8782778864970646
ALPHA : 0.8, Score 0.8781101481688566
ALPHA : 0.9, Score 0.8778864970645792


- here we can understand that the alpha value is around 0.5 gave us the best result

In [31]:
classifier.coef_[0]
# here the most negative word represent the most fake word

array([-12.74047156,  -9.34298475, -11.99325715, ..., -10.34257628,
        -9.80779742, -12.74047156])