# Generate features from text and use Multinomial Naive Bayes to predict fake news

Reference
* https://github.com/justmarkham/pycon-2016-tutorial/blob/master/tutorial_with_output.ipynb
* https://www.youtube.com/watch?v=hXNbFNCgPfY

## Generate data

In [2]:
import pandas as pd
import numpy as np
import os
import newspaper 

### read data from files

In [3]:
path = os.path.join('data','merged_data.csv')

news_data = pd.read_csv(path,usecols=[1,2,3,4,5,6]) 
news_data['label_num'] = news_data.authenticity.map({1:0,0:1})

news_data.tail()

Unnamed: 0,url,source,title,author,text,authenticity,label_num
2558,https://www.ice.gov/news/releases/operation-ma...,politico,Operation Matador nets 39 MS-13 arrests in las...,[],NEW YORK – U.S. Immigration and Customs Enforc...,1,0
2559,http://www.politico.com/story/2017/07/27/obama...,politico,Senate Republicans prepare to pass Obamacare r...,"['John Bresnahan', 'Burgess Everett', 'Jennife...",Senate Republicans are closing in on passage o...,1,0
2560,https://www.nytimes.com/2017/07/26/technology/...,nytimes,Google’s New Parental Control App Has a Flaw: ...,"['Brian X. Chen', 'Tech Fix']",“The fact that the kid can graduate themselves...,1,0
2561,http://www.foxnews.com/entertainment/2017/07/2...,foxnews,Hulu resurrects TGIF lineup with acquisition o...,['Tyler Mccarthy'],Hulu is hoping to make itself the go-to stream...,1,0
2562,http://www.npr.org/2017/07/27/539559582/5-unan...,npr,5 Unanswered Questions About Trump's 'Ban' On ...,['Philip Ewing'],5 Unanswered Questions About Trump's 'Ban' On ...,1,0


In [None]:
# drop a column
#news_data.drop(['authenticity'], axis = 1, inplace = True)
#news_data.head()

In [6]:
news_data.label_num.value_counts()

0    1566
1     997
Name: label_num, dtype: int64

### balance real and fakenews

#### sample real news
number of real news ~= number of fake news

In [117]:
data_balanced_realnews = news_data[news_data.label_num==0]
data_balanced_realnews = data_balanced_realnews.sample(frac=0.64)
data_balanced_realnews.shape

(1002, 7)

### get all fake news

In [118]:
data_balanced_fakenews = news_data[news_data.label_num==1]
data_balanced_fakenews.shape

(997, 7)

#### combine real and fake news

In [91]:
data_balanced = pd.concat([data_balanced_realnews, data_balanced_fakenews], ignore_index=True)
data_balanced.tail()

Unnamed: 0,url,source,title,author,text,authenticity,label_num
1994,http://www.bighairynews.com/2017/07/military-t...,Bighairynews,Military Trannies Trumped,[],WASHINGTON (World News Bureau) - President Tru...,0,1
1995,http://bipartisanreport.com/2017/07/27/scaramu...,bipartisanreport,Scaramucci Responds To Article About His Profa...,['Holly Lee'],Anthony Scaramucci is facing serious scorn aft...,0,1
1996,https://amgreatness.com/2017/07/19/muellers-in...,wordpress,Mueller’s Investigation Must Be Limited and Ac...,"['Andrew C. Mccarthy', 'Sam Mcgowan', 'Bill S'...",How much goalpost moving should be tolerable i...,0,1
1997,http://www.politico.com/story/2017/07/26/scara...,wordpress,Scaramucci still stands to profit from SkyBrid...,"['Lorraine Woellert', 'Cristiano Lima', 'Tara ...",The incoming White House communications direct...,0,1
1998,http://beforeitsnews.com/health/2017/07/how-po...,beforeitsnews,How potatoes can increase the risk of cancer c...,['Natural Health'],(Before It's News)\n\n(NaturalHealth365) A rec...,0,1


In [119]:
data_balanced.label_num.value_counts()

0    1002
1     997
Name: label_num, dtype: int64

## Convert text to numbers (features)

### Define x and y for modeling later, and split data into training and testing sets

In [8]:
from sklearn.cross_validation import train_test_split

In [124]:
x = data_balanced.text
y = data_balanced.label_num
print(x.shape)
print(y.shape)

(1999,)
(1999,)


In [125]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1499,)
(500,)
(1499,)
(500,)


In [126]:
y_train.value_counts()

1    752
0    747
Name: label_num, dtype: int64

### customized tokenize function for stemming and removing punctuation
codes are taken from https://stackoverflow.com/questions/26126442/combining-text-stemming-and-removal-of-punctuation-in-nltk-and-scikit-learn

In [127]:
import nltk
import string
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize_stemmer(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

### Method 1: Use CountVectorizer to generate features

In [12]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
# instantiate the vectorizer
vect = CountVectorizer(tokenizer=tokenize_stemmer,stop_words='english',ngram_range=(2, 2))

In [98]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_stemmer at 0x30b9ded08>,
        vocabulary=None)

In [100]:
x_train_dtm = vect.transform(x_train)
# examine the document-term matrix
x_train_dtm

<1499x422215 sparse matrix of type '<class 'numpy.int64'>'
	with 551598 stored elements in Compressed Sparse Row format>

In [17]:
x_train_dtm.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [101]:
# transform testing data (using fitted vocabulary) into a document-term matrix
x_test_dtm = vect.transform(x_test)
x_test_dtm

<500x422215 sparse matrix of type '<class 'numpy.int64'>'
	with 62009 stored elements in Compressed Sparse Row format>

### Method 2: use TF-IDF to generate features

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(tokenizer=tokenize_stemmer,stop_words='english',ngram_range=(2, 2))
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_stemmer at 0x3388e4e18>, use_idf=True,
        vocabulary=None)

In [129]:
x_train_dtm = tfidf.fit_transform(x_train)

# examine the document-term matrix
x_train_dtm

<1499x422215 sparse matrix of type '<class 'numpy.float64'>'
	with 551598 stored elements in Compressed Sparse Row format>

In [130]:
x_test_dtm = tfidf.transform(x_test)
x_test_dtm

<500x422215 sparse matrix of type '<class 'numpy.float64'>'
	with 62009 stored elements in Compressed Sparse Row format>

## Predicting fake news using Multinomial Naive Bayes 

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [112]:
nb = MultinomialNB()

# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(x_train_dtm, y_train)

CPU times: user 22.5 ms, sys: 10.2 ms, total: 32.7 ms
Wall time: 31.5 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [113]:
# make class predictions for x_test_dtm
y_pred_class = nb.predict(x_test_dtm)

# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred_class) # or nb.score(x_test_dtm,y_test)

0.91800000000000004

In [114]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[229,  26],
       [ 15, 230]])

In [None]:
# print the false positives (real news incorrectly classified as fake)
x_test[y_test < y_pred_class]

In [None]:
# example false positives
#x_test[1747]

In [None]:
# print the false negatives (fake news incorrectly classified as real)
x_test[y_test > y_pred_class]

In [None]:
# example false negative
# x_test[2096]

In [None]:
# calculate predicted probabilities for x_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

## Predicting fake news using logistic regression

In [79]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [131]:
# train the model using X_train_dtm
%time logreg.fit(x_train_dtm, y_train)

CPU times: user 109 ms, sys: 9.42 ms, total: 118 ms
Wall time: 117 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [132]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(x_test_dtm)

# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.91000000000000003