In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [50]:
df = pd.read_csv("C:\\Users\\Eli\\Desktop\\news_dataset_processed.csv")
df

Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,muslim bust they steal million in gov t benefit,print they should pay all the back all the mon...,100percentfedup,fake
1,1,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,100percentfedup,fake
2,2,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,100percentfedup,fake
3,3,pin drop speech by father of daughter kidnap a...,email kayla mueller be a prisoner and torture ...,100percentfedup,fake
4,4,fantastic trump s point plan to reform healthc...,email healthcare reform to make america great ...,100percentfedup,fake
...,...,...,...,...,...
27980,28706,An eavesdropping Uber driver saved his 16-year...,Uber driver Keith Avila picked up a p...,Washington Post,real
27981,28707,Plane carrying six people returning from a Cav...,Crews on Friday continued to search L...,Washington Post,real
27982,28708,After helping a fraction of homeowners expecte...,When the Obama administration announced a...,Washington Post,real
27983,28709,"Yes, this is real: Michigan just banned bannin...",This story has been updated. A new law in...,Washington Post,real


In [93]:
df.head()

Unnamed: 0,title,content,publication,label
0,muslim bust they steal million in gov t benefit,print they should pay all the back all the mon...,100percentfedup,1
1,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,100percentfedup,1
2,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,100percentfedup,1
3,pin drop speech by father of daughter kidnap a...,email kayla mueller be a prisoner and torture ...,100percentfedup,1
4,fantastic trump s point plan to reform healthc...,email healthcare reform to make america great ...,100percentfedup,1


In [51]:
#drop NaN values
df = df.dropna()
#drop 'Unnamed' column which was pre-existing id
df = df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = False)

In [52]:
df

Unnamed: 0,title,content,publication,label
0,muslim bust they steal million in gov t benefit,print they should pay all the back all the mon...,100percentfedup,fake
1,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,100percentfedup,fake
2,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,100percentfedup,fake
3,pin drop speech by father of daughter kidnap a...,email kayla mueller be a prisoner and torture ...,100percentfedup,fake
4,fantastic trump s point plan to reform healthc...,email healthcare reform to make america great ...,100percentfedup,fake
...,...,...,...,...
27980,An eavesdropping Uber driver saved his 16-year...,Uber driver Keith Avila picked up a p...,Washington Post,real
27981,Plane carrying six people returning from a Cav...,Crews on Friday continued to search L...,Washington Post,real
27982,After helping a fraction of homeowners expecte...,When the Obama administration announced a...,Washington Post,real
27983,"Yes, this is real: Michigan just banned bannin...",This story has been updated. A new law in...,Washington Post,real


In [62]:
#change labels to numeric values fake = 1, real = 0
df['label'] = df['label'].map({'fake': 1, 'real': 0})

df

Unnamed: 0,title,content,publication,label
0,muslim bust they steal million in gov t benefit,print they should pay all the back all the mon...,100percentfedup,1
1,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,100percentfedup,1
2,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,100percentfedup,1
3,pin drop speech by father of daughter kidnap a...,email kayla mueller be a prisoner and torture ...,100percentfedup,1
4,fantastic trump s point plan to reform healthc...,email healthcare reform to make america great ...,100percentfedup,1
...,...,...,...,...
27980,An eavesdropping Uber driver saved his 16-year...,Uber driver Keith Avila picked up a p...,Washington Post,0
27981,Plane carrying six people returning from a Cav...,Crews on Friday continued to search L...,Washington Post,0
27982,After helping a fraction of homeowners expecte...,When the Obama administration announced a...,Washington Post,0
27983,"Yes, this is real: Michigan just banned bannin...",This story has been updated. A new law in...,Washington Post,0


In [67]:
X_content_text = df['content'].values
X_title_text = df['title'].values
y = df['label'].values


In [68]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df= 0.85, min_df= 0.01)

In [69]:
X_content_tfidf = tfidf.fit_transform(X_content_text)
X_title_tfidf = tfidf.fit_transform(X_title_text)

In [70]:
indices = df.index.values

In [71]:
X_content_tfidf_train, X_content_tfidf_test, y_content_train, y_content_test, indices_content_train, indices_content_test = train_test_split(X_content_tfidf, y, indices, test_size = 0.2, random_state = 51)

In [72]:
df.loc[indices_content_train].groupby('label').agg('count')

Unnamed: 0_level_0,title,content,publication
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,12588,12588,12588
1,9799,9799,9799


In [73]:
df.loc[indices_content_test].groupby('label').agg('count')

Unnamed: 0_level_0,title,content,publication
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3124,3124,3124
1,2473,2473,2473


In [74]:
from  sklearn.linear_model import LogisticRegression

In [81]:
from sklearn.naive_bayes import MultinomialNB

In [82]:
nb = MultinomialNB()

In [83]:
nb.fit(X_content_tfidf_train, y_content_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [84]:
y_content_train_pred_nb = nb.predict(X_content_tfidf_train)

In [86]:
print('Naive Bayes In Sample F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_content_train, y_content_train_pred_nb, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_content_train, y_content_train_pred)*100))

Naive Bayes In Sample F1 and Accuracy Scores:
F1 score 87.42%
Accuracy score 95.43%


In [87]:

y_content_pred_nb = nb.predict(X_content_tfidf_test)

In [90]:
# print metrics
print('Naive Bayes In Sample F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_content_test, y_content_pred_nb, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_content_test, y_content_pred_nb)*100))

Naive Bayes In Sample F1 and Accuracy Scores:
F1 score 86.02%
Accuracy score 86.35%


In [75]:
lr = LogisticRegression()

In [76]:
lr.fit(X_content_tfidf_train, y_content_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
y_content_train_pred = lr.predict(X_content_tfidf_train)

In [78]:
print('Logistic Regression In Sample F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_content_train, y_content_train_pred, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_content_train, y_content_train_pred)*100))

Logistic Regression In Sample F1 and Accuracy Scores:
F1 score 95.34%
Accuracy score 95.43%


In [91]:
y_content_pred_lr = lr.predict(X_content_tfidf_test)

In [92]:
print('Logistic Regression In Sample F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_content_test, y_content_pred_lr, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_content_test, y_content_pred_lr)*100))

Logistic Regression In Sample F1 and Accuracy Scores:
F1 score 93.62%
Accuracy score 93.73%


In [79]:
np.where(y_content_train != y_content_train_pred)

(array([   23,    51,   132, ..., 22343, 22365, 22384], dtype=int64),)

In [80]:
df.loc[[23, 51]]

Unnamed: 0,title,content,publication,label
23,lol british wife of lib actor who say there wi...,go to article political activist and hillary s...,100percentfedup,1
51,episode sunday wire hail to the deplorables wi...,november by leave a comment episode of sunday ...,21stcenturywire,1


In [None]:
#getting top words

