# Generate features from text and use Multinomial Naive Bayes to predict fake news

References
* https://github.com/justmarkham/pycon-2016-tutorial/blob/master/tutorial_with_output.ipynb
* https://www.youtube.com/watch?v=hXNbFNCgPfY

## Generate data

In [1]:
import pandas as pd
import numpy as np
import os
import newspaper 

### Read data from files and merge them

In [2]:
# Load the fake-news articles, keeping only the CSV columns at positions
# 1, 2, 3 and 5, and tag every row with label_num = 1 (fake).
path = os.path.join('data', 'fakenews_jz.csv')
data_fakenews = pd.read_csv(path, usecols=[1, 2, 3, 5]).assign(label_num=1)

# peek at the last rows to sanity-check the load
data_fakenews.tail()

Unnamed: 0,url,source,title,text,label_num
435,http://now8news.com/fidget-spinner-bursts-flam...,now8news,Fidget Spinner Bursts Into Flames Killing Todd...,The parents of a 3 year old girl woke up to tr...,1
436,http://now8news.com/18-year-old-girl-marries-f...,now8news,18 Year Old Girl Marries Her Father In Arkansa...,18 Year Old Girl Marries Her Father In Arkansa...,1
437,http://now8news.com/trump-raising-age-limit-to...,now8news,Trump Raising Age Limit For Tobacco Consumptio...,There is more bad news for cigarette smokers –...,1
438,http://now8news.com/caitlyn-jenner-discusses-d...,now8news,Caitlyn Jenner Discusses Her Desire To Transit...,Caitlyn Jenner or “CJ” as he refers to herself...,1
439,http://now8news.com/3-year-old-dies-tickled-de...,now8news,3 Year Old Girl Dies After Accidentally Being ...,"Charlotte, NC – It’s a warning being sent out ...",1


In [5]:
# number of fake-news articles per source
data_fakenews['source'].value_counts()

In [127]:
# Load the real-news articles, keeping only the CSV columns at positions
# 0, 1, 2 and 4, and tag every row with label_num = 0 (real).
path = os.path.join('data', 'realnews_jz.csv')
data_realnews = pd.read_csv(path, usecols=[0, 1, 2, 4]).assign(label_num=0)

# peek at the last rows to sanity-check the load
data_realnews.tail()

Unnamed: 0,url,source,title,text,label_num
831,http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",,0
832,http://www.newyorker.com/cartoon/dernavich-201...,newyorker,A Cartoon from The New Yorker,,0
833,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,Yusef Komunyakaa reads a poem by Marilyn Hacke...,0
834,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,"The Startup to End All Startups\n\nMeet uBox, ...",0
835,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,WASHINGTON ( The Borowitz Report )—Acknowledgi...,0


In [149]:
# Load the third data set (CSV columns at positions 1, 2, 3, 5 and 6).
# Its `authenticity` column is flipped to build label_num: 1 -> 0 and 0 -> 1.
# Any other value maps to NaN.
path = os.path.join('data', 'hyesoo_df.csv')
data_hy = pd.read_csv(path, usecols=[1, 2, 3, 5, 6])
data_hy['label_num'] = data_hy['authenticity'].map({1: 0, 0: 1})

# peek at the first rows to check the flipped labels
data_hy.head()

Unnamed: 0,url,source,title,text,authenticity,label_num
0,http://www.npr.org/2017/07/27/539825446/june-f...,npr,"June Foray, Voice Of Rocky From 'The Bullwinkl...","June Foray, Voice Of Rocky From 'The Bullwinkl...",1,0
1,http://beforeitsnews.com/science-and-technolog...,beforeitsnews,Hacker Proves Anyone Can Fire a Locked Smart G...,Hacker Proves Anyone Can Fire a Locked Smart G...,0,1
2,http://www.bbc.com/news/uk-40731164,bbc,Diesel and petrol car ban: Clean air strategy ...,The government's £3bn clean air strategy does ...,1,0
3,http://beforeitsnews.com/alternative/2017/07/n...,beforeitsnews,NASA Caught Hiding Something At North Pole!,NASA Caught Hiding Something At North Pole!\n\...,0,1
4,https://conservativedailypost.com/planned-pare...,Conservativedailypost,Planned Parenthood Doctors Caught On Tape Laug...,Advertisement\n\nWARNING: The video and conten...,0,1


In [150]:
# (rows, columns) of the third data set
data_hy.shape

(1741, 6)

In [151]:
# `authenticity` has already been converted into label_num, so remove it
# before the frames are concatenated.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError for the already-removed column.
data_hy = data_hy.drop(['authenticity'], axis=1, errors='ignore')
data_hy.head()

Unnamed: 0,url,source,title,text,label_num
0,http://www.npr.org/2017/07/27/539825446/june-f...,npr,"June Foray, Voice Of Rocky From 'The Bullwinkl...","June Foray, Voice Of Rocky From 'The Bullwinkl...",0
1,http://beforeitsnews.com/science-and-technolog...,beforeitsnews,Hacker Proves Anyone Can Fire a Locked Smart G...,Hacker Proves Anyone Can Fire a Locked Smart G...,1
2,http://www.bbc.com/news/uk-40731164,bbc,Diesel and petrol car ban: Clean air strategy ...,The government's £3bn clean air strategy does ...,0
3,http://beforeitsnews.com/alternative/2017/07/n...,beforeitsnews,NASA Caught Hiding Something At North Pole!,NASA Caught Hiding Something At North Pole!\n\...,1
4,https://conservativedailypost.com/planned-pare...,Conservativedailypost,Planned Parenthood Doctors Caught On Tape Laug...,Advertisement\n\nWARNING: The video and conten...,1


In [152]:
# Stack the three data sets into one frame; ignore_index=True discards the
# per-file row labels and renumbers the rows from 0.
news_data = pd.concat(
    [data_fakenews, data_realnews, data_hy],
    ignore_index=True,
)
news_data.shape

(3017, 5)

## Convert text to numbers (features)

In [153]:
# shape of the merged data as (rows, columns)
news_data.shape

(3017, 5)

In [154]:
# Discard every row that has a missing value in any column; dropna()'s
# defaults (axis=0, how='any') give exactly this behaviour.
news_data = news_data.dropna()

In [155]:
# keep only the articles whose text is longer than 200 characters
news_data = news_data.loc[news_data['text'].map(len) > 200]
news_data.shape

(2796, 5)

In [156]:
# class distribution of the cleaned data (0 = real news, 1 = fake news)
news_data['label_num'].value_counts()

0    1740
1    1056
Name: label_num, dtype: int64

### Define x and y for modeling later, and split data into training and testing sets

In [10]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed, so it is
# kept only as a fallback for very old scikit-learn installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [157]:
# features: the raw article text; target: label_num (0 = real, 1 = fake)
x, y = news_data['text'], news_data['label_num']
print(x.shape, y.shape, sep='\n')

(2796,)
(2796,)


In [158]:
# Split with train_test_split's default proportions; random_state=1 makes
# the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# one shape per line: x_train, x_test, y_train, y_test
print(*(part.shape for part in (x_train, x_test, y_train, y_test)), sep='\n')

(2097,)
(699,)
(2097,)
(699,)


### Method 1: Use CountVectorizer to generate features

In [202]:
# import and instantiate CountVectorizer (with the default parameters)
# CountVectorizer turns a collection of text documents into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [203]:
# instantiate the vectorizer
# Count-based vectorizer restricted to bigrams (ngram_range=(2, 2)), using
# scikit-learn's built-in English stop-word list.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 2),
)

In [205]:
# learn training data vocabulary, then use it to create a document-term matrix
# learn the vocabulary from the training text only (no document-term matrix is built here)
vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [206]:
# build the training document-term matrix using the fitted vocabulary
x_train_dtm = vect.transform(x_train)
# examine the document-term matrix
x_train_dtm

<2097x561304 sparse matrix of type '<class 'numpy.int64'>'
	with 689438 stored elements in Compressed Sparse Row format>

In [207]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# transform testing data into a document-term matrix using the vocabulary
# fitted on the training data (vect is not re-fitted here)
x_test_dtm = vect.transform(x_test)
x_test_dtm

<699x561304 sparse matrix of type '<class 'numpy.int64'>'
	with 63230 stored elements in Compressed Sparse Row format>

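### Method 2: Use TF-IDF to generate features

Method 2 is an alternative to Method 1: it writes to the same `x_train_dtm` / `x_test_dtm` variables, so the models below are trained on the features of whichever method was run last.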

In [33]:
# TF-IDF counterpart of the count vectorizer, with the same bigram-only and
# English stop-word settings.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 2),
)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
# Fit the TF-IDF vectorizer on the training text and build its document-term matrix in one step.
# NOTE(review): this rebinds x_train_dtm, the same name the CountVectorizer cells assign,
# so the models fitted on x_train_dtm use whichever method's cells were executed last.
x_train_dtm = tfidf.fit_transform(x_train)

# examine the document-term matrix
x_train_dtm

<791x185241 sparse matrix of type '<class 'numpy.float64'>'
	with 209882 stored elements in Compressed Sparse Row format>

In [36]:
# Transform the testing text with the TF-IDF vectorizer fitted on the training data.
# NOTE(review): this rebinds x_test_dtm, the same name the CountVectorizer cells assign;
# make sure x_train_dtm and x_test_dtm come from the same vectorizer before predicting.
x_test_dtm = tfidf.transform(x_test)
x_test_dtm

<264x185241 sparse matrix of type '<class 'numpy.float64'>'
	with 14664 stored elements in Compressed Sparse Row format>

## Predicting fake news using Multinomial Naive Bayes 

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [213]:
# Multinomial Naive Bayes classifier created with its default parameters
nb = MultinomialNB()

# train the model using x_train_dtm (timing it with an IPython "magic command")
%time nb.fit(x_train_dtm, y_train)

CPU times: user 29.8 ms, sys: 17.4 ms, total: 47.2 ms
Wall time: 45.3 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [214]:
# make class predictions for x_test_dtm
y_pred_class = nb.predict(x_test_dtm)

# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred_class) # or nb.score(x_test_dtm,y_test)

0.87553648068669532

In [215]:
# print the confusion matrix
# confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order, i.e. [[TN, FP], [FN, TP]] with 1 (fake) as the positive class
metrics.confusion_matrix(y_test, y_pred_class)

array([[419,  21],
       [ 66, 193]])

In [166]:
# print the false positives (real news incorrectly classified as fake)
# false positives: true label 0 (real) but predicted label 1 (fake)
x_test[(y_test == 0) & (y_pred_class == 1)]

1747    By Samuel Chamberlain | Fox News\n\nIn an excl...
1127    I think you're right, Wu. They've got to grips...
2268    When President Donald Trump went onstage in Yo...
1335    FACT CHECK: Newt Gingrich's Scathing Critique ...
2678    (CNN) House intelligence committee members on ...
797     Simone Biles isn't afraid of making fun of her...
1406    An Obama official made “hundreds of unmasking ...
1200    WASHINGTON ( The Borowitz Report )—In a dramat...
938     Marvel is hoping fans are ready to get "Hooked...
1365    (CNN) Just because it's on the internet doesn'...
766     A day after Speaker Paul Ryan announced that h...
2222    Fox News\n\nAn Obama official made “hundreds o...
1344    Smoke billows from the ruined Grand al-Nuri Mo...
1098    Sieben zu eins.\n\n\n\nSeven one.\n\n\n\nNo ma...
988     Marco Rubio Talks Super Tuesday, Lists States ...
1555    Washington (CNN) As President Donald Trump con...
2168    Members of the Iranian revolutionary guard mar...
2260    Attorn

In [98]:
# example false positives
#x_test[1747]

In [191]:
# print the false negatives (fake news incorrectly classified as real)
# false negatives: true label 1 (fake) but predicted label 0 (real)
x_test[(y_test == 1) & (y_pred_class == 0)]

2470    Robert Mueller selects veteran Justice Departm...
31      The Jewish media was very pissed off over Dona...
123     Time and time again, Muslims continue to prove...
324     You have to LOVE dogs!\n\nI love dogs and this...
1879    • DEA forced to pay Daniel Chong $4.1M after l...
1431    “ Muller has behaved in an extremely unprofess...
1310    (Before It's News)\n\nIn a first-of-its-kind a...
1968    Florida Attorney Proves Cell Phones Cause Brai...
18      Teacher Breaks Down in Tears When She Sees Wha...
1603    Anthony Scaramucci, incoming White House commu...
2845    Once again the press is wrong.\n\nOnce again t...
1946    Advertisement\n\nIn January of 1920, Babe Ruth...
4       Nine-speed auto rated 31/47 mpg\n\nThe 2017 Ch...
1767    The Most Precious Metals Bullish I’ve Ever Bee...
43      In Poland, groups of pro-EU Communists have hi...
2734    ( Breitbart ) The White House’s new communicat...
174     Music publishing is one of the highest income ...
275     The fa

In [100]:
# example false negative
# x_test[2096]

In [192]:
# calculate predicted probabilities for x_test_dtm (poorly calibrated)
# Predicted probability of label 1 (fake) for each test document, taken from
# column 1 of predict_proba. These probabilities are poorly calibrated.
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]

# area under the ROC curve
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.91735696735696759

## Predicting fake news using logistic regression

In [208]:
# import and instantiate a logistic regression model
# import logistic regression and create a model with its default parameters
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [209]:
# train the model using X_train_dtm
# train the model using x_train_dtm (timed with the %time magic)
%time logreg.fit(x_train_dtm, y_train)

CPU times: user 1.08 s, sys: 15.1 ms, total: 1.1 s
Wall time: 1.1 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [212]:
# make class predictions for X_test_dtm
# Predict a class label for every test document with logistic regression.
# NOTE(review): this rebinds y_pred_class, which the Naive Bayes cells also
# assign; cells that read y_pred_class reflect whichever model ran last.
y_pred_class = logreg.predict(x_test_dtm)

# fraction of test documents labelled correctly
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.83118741058655221