In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('datasets/fakeNewsTrainData.csv')

In [3]:
dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
features = dataset.drop('label',axis = 1)
label = dataset['label']

In [5]:
features.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [7]:
dataset = dataset.dropna()

In [8]:
messages = dataset.copy()

In [9]:
messages.reset_index(inplace = True)

In [10]:
messages['title'][6]

'Benoît Hamon Wins French Socialist Party’s Presidential Nomination - The New York Times'

In [11]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]', ' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, ngram_range = (1,3))
X = cv.fit_transform(corpus).toarray()
X.shape

(18285, 5000)

In [13]:
y = messages['label']

In [14]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(X,y,test_size = 0.33)

In [15]:
cv.get_feature_names()[:20]

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut',
 'abstain',
 'absurd',
 'abus',
 'abus new',
 'abus new york',
 'academi',
 'accept',
 'access',
 'access pipelin',
 'access pipelin protest']

In [16]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [17]:
count_df = pd.DataFrame(xTrain, columns = cv.get_feature_names())
count_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
from sklearn import metrics
import numpy as np
import itertools

In [19]:
classifier.fit(xTrain, yTrain)
prediction = classifier.predict(xTest)
score = metrics.accuracy_score(yTest, prediction)
print('accuracy: ',score)
cm = metrics.confusion_matrix(yTest, prediction)
cm

accuracy:  0.8941176470588236


array([[3071,  380],
       [ 259, 2325]])

# Passive Aggressive Classifier Algorithm
## Works better with text data

In [20]:
from sklearn.linear_model import PassiveAggressiveClassifier
linearclassifier = PassiveAggressiveClassifier()
linearclassifier.fit(xTrain, yTrain)
prediction = linearclassifier.predict(xTest)
score = metrics.accuracy_score(yTest, prediction)
print('accuracy: ',score)
cm = metrics.confusion_matrix(yTest, prediction)
cm

accuracy:  0.9140016570008285


array([[3181,  270],
       [ 249, 2335]])

In [21]:
classifier = MultinomialNB(alpha = 0.1)

In [23]:
previos_score = 0
for alpha in np.arange(0,1,0.1):
    sub_classifier = MultinomialNB(alpha = alpha)
    sub_classifier.fit(xTrain, yTrain)
    prediction = sub_classifier.predict(xTest)
    score = metrics.accuracy_score(yTest, prediction)
    if score > previos_score:
        classifier = sub_classifier
    print('Alpha:{}, Score: {}'.format(alpha,score))



Alpha:0.0, Score: 0.8773819386909694
Alpha:0.1, Score: 0.891963545981773
Alpha:0.2, Score: 0.891963545981773
Alpha:0.30000000000000004, Score: 0.8924606462303231
Alpha:0.4, Score: 0.8934548467274234
Alpha:0.5, Score: 0.8936205468102734
Alpha:0.6000000000000001, Score: 0.8932891466445734
Alpha:0.7000000000000001, Score: 0.8929577464788733
Alpha:0.8, Score: 0.8942833471416736
Alpha:0.9, Score: 0.8942833471416736


In [24]:
feature_names = cv.get_feature_names()
classifier.coef_[0]

array([ -9.33552614,  -8.51045142,  -9.33552614, ..., -10.80186321,
        -9.10726749,  -9.33552614])

In [28]:
#Most real values will be more near to zero
sorted(zip(classifier.coef_[0],feature_names), reverse = True)[:20]

[(-3.9703892011513107, 'trump'),
 (-4.278300901752564, 'hillari'),
 (-4.3795185272051915, 'clinton'),
 (-4.85615177480603, 'elect'),
 (-5.212120782623424, 'new'),
 (-5.2545179758838, 'video'),
 (-5.263217465285729, 'comment'),
 (-5.317066274411421, 'us'),
 (-5.359445497380283, 'war'),
 (-5.439528289230596, 'fbi'),
 (-5.444752961512548, 'hillari clinton'),
 (-5.46059278582483, 'vote'),
 (-5.5041015512877625, 'email'),
 (-5.585117495047851, 'world'),
 (-5.597246206494466, 'obama'),
 (-5.706750609064418, 'donald'),
 (-5.755574810572814, 'donald trump'),
 (-5.822069019328396, 'russia'),
 (-5.837465782299108, 'say'),
 (-5.845253983659453, 'presid')]

In [29]:
tfidf = TfidfVectorizer(max_features = 5000, ngram_range = (1,3))
X = tfidf.fit_transform(corpus).toarray()
X.shape

(18285, 5000)

In [30]:
y = messages['label']

In [31]:
tfidf.get_feature_names()[:20]

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut',
 'abstain',
 'absurd',
 'abus',
 'abus new',
 'abus new york',
 'academi',
 'accept',
 'access',
 'access pipelin',
 'access pipelin protest']

In [32]:
tfidf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [33]:
count_df = pd.DataFrame(xTrain,columns = tfidf.get_feature_names())
count_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
