In [1]:
import env
import pandas as pd
import numpy as np
import utilities as utils

from wrangle import wrangle_articles
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import export_graphviz

In [2]:
# Run on first time or to renew csv files
articles_df = wrangle_articles()

articles_df.to_csv("articles.csv", index_label='index')

NaN values
Empty DataFrame
Columns: [rows, columns]
Index: []
--------------------------------
Empty values
       rows  columns
0       120        1
1       120        6
2       122        1
3       122        6
4       152        1
...     ...      ...
1258  25152        6
1259  25701        1
1260  25701        6
1261  25746        1
1262  25746        6

[1263 rows x 2 columns]
--------------------------------


In [3]:
articles_df.head(5)

Unnamed: 0,title,text,subject,date,is_fake,clean_title,clean_text
37302,FLASHBACK: KING OBAMA COMMUTES SENTENCES OF 22...,Just making room for Hillary President Obama t...,politics,2015-03-31,True,flashback king obama commute sentence 22 drug ...,making room hillary president obama today anno...
37303,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,The gay mafia has a new corporate Don. This i...,politics,2015-03-31,True,apple ceo say religious freedom law dangerous ...,gay mafia new corporate article need read shee...
37304,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,"In case you missed it Sen. Harry Reid (R-NV), ...",politics,2015-03-31,True,watch dirty harry reid lie romneys tax didnt win,case missed sen harry reid rnv announced last ...
37305,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,Nothing like political cronyism to make your s...,politics,2015-03-31,True,oh guess funded shrine ted kennedy,nothing like political cronyism make stomach c...
37306,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,Does anyone really think Hillary Clinton will ...,politics,2015-03-31,True,benghazi panel call hillary testify oath white...,anyone really think hillary clinton come clean...


In [25]:
articles_df.shape

(38470, 7)

In [4]:
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(articles_df.clean_text)
y = articles_df.is_fake

In [5]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=.2)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=.3)

In [6]:
X_train.shape, X_validate.shape, X_test.shape

((21543, 216476), (9233, 216476), (7694, 216476))

In [7]:
y_train.shape, y_validate.shape, y_test.shape

((21543,), (9233,), (7694,))

In [8]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# Establish baseline model

In [9]:
train['baseline_predicted'] = True
validate['baseline_predicted'] = True
test['baseline_predicted'] = True

In [10]:
utils.print_model_evaluation(train, 'baseline_predicted')

Accuracy: 46.10%
---
Confusion Matrix
actual              False  True 
baseline_predicted              
True                11611   9932
---
              precision    recall  f1-score   support

       False       0.00      0.00      0.00     11611
        True       0.46      1.00      0.63      9932

    accuracy                           0.46     21543
   macro avg       0.23      0.50      0.32     21543
weighted avg       0.21      0.46      0.29     21543



Baseline accuracy of 52.7% using True as default prediction for is_fake

# Logistic Regression

In [11]:
lm = LogisticRegression().fit(X_train, y_train)

train['log_predicted'] = lm.predict(X_train)
validate['log_predicted'] = lm.predict(X_validate)

In [12]:
utils.print_model_evaluation(train, 'log_predicted')

Accuracy: 98.70%
---
Confusion Matrix
actual         False  True 
log_predicted              
False          11509    177
True             102   9755
---
              precision    recall  f1-score   support

       False       0.98      0.99      0.99     11611
        True       0.99      0.98      0.99      9932

    accuracy                           0.99     21543
   macro avg       0.99      0.99      0.99     21543
weighted avg       0.99      0.99      0.99     21543



In [13]:
utils.print_model_evaluation(validate, 'log_predicted')

Accuracy: 97.80%
---
Confusion Matrix
actual         False  True 
log_predicted              
False           4901    128
True              75   4129
---
              precision    recall  f1-score   support

       False       0.97      0.98      0.98      4976
        True       0.98      0.97      0.98      4257

    accuracy                           0.98      9233
   macro avg       0.98      0.98      0.98      9233
weighted avg       0.98      0.98      0.98      9233



# Decision Tree

In [14]:
dcn_tree = DecisionTreeClassifier(max_depth=7, random_state=1414)
dcn_tree = dcn_tree.fit(X_train, y_train)

train['dcn_tree_predicted'] = dcn_tree.predict(X_train)
validate['dcn_tree_predicted'] = dcn_tree.predict(X_validate)

In [15]:
utils.print_model_evaluation(train, 'dcn_tree_predicted')

Accuracy: 92.23%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False               10965   1027
True                  646   8905
---
              precision    recall  f1-score   support

       False       0.91      0.94      0.93     11611
        True       0.93      0.90      0.91      9932

    accuracy                           0.92     21543
   macro avg       0.92      0.92      0.92     21543
weighted avg       0.92      0.92      0.92     21543



In [16]:
utils.print_model_evaluation(validate, 'dcn_tree_predicted')

Accuracy: 90.75%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False                4622    500
True                  354   3757
---
              precision    recall  f1-score   support

       False       0.90      0.93      0.92      4976
        True       0.91      0.88      0.90      4257

    accuracy                           0.91      9233
   macro avg       0.91      0.91      0.91      9233
weighted avg       0.91      0.91      0.91      9233



# Random Forest

In [17]:
rand_forest = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion='gini', min_samples_leaf=3, \
                                     n_estimators=100, max_depth=15, random_state=1414)
rand_forest = rand_forest.fit(X_train, y_train)

train['rand_forest_predicted'] = rand_forest.predict(X_train)
validate['rand_forest_predicted'] = rand_forest.predict(X_validate)

In [18]:
utils.print_model_evaluation(train, 'rand_forest_predicted')

Accuracy: 96.97%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                  11374    415
True                     237   9517
---
              precision    recall  f1-score   support

       False       0.96      0.98      0.97     11611
        True       0.98      0.96      0.97      9932

    accuracy                           0.97     21543
   macro avg       0.97      0.97      0.97     21543
weighted avg       0.97      0.97      0.97     21543



In [19]:
utils.print_model_evaluation(validate, 'rand_forest_predicted')

Accuracy: 95.52%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                   4830    268
True                     146   3989
---
              precision    recall  f1-score   support

       False       0.95      0.97      0.96      4976
        True       0.96      0.94      0.95      4257

    accuracy                           0.96      9233
   macro avg       0.96      0.95      0.95      9233
weighted avg       0.96      0.96      0.96      9233



# K Nearest Neighbors

In [20]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn = knn.fit(X_train, y_train)

train['knn_predicted'] = knn.predict(X_train)
validate['knn_predicted'] = knn.predict(X_validate)

In [21]:
utils.print_model_evaluation(train, 'knn_predicted')

Accuracy: 61.25%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False           3312     48
True            8299   9884
---
              precision    recall  f1-score   support

       False       0.99      0.29      0.44     11611
        True       0.54      1.00      0.70      9932

    accuracy                           0.61     21543
   macro avg       0.76      0.64      0.57     21543
weighted avg       0.78      0.61      0.56     21543



In [22]:
utils.print_model_evaluation(validate, 'knn_predicted')

Accuracy: 56.75%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False           1004     21
True            3972   4236
---
              precision    recall  f1-score   support

       False       0.98      0.20      0.33      4976
        True       0.52      1.00      0.68      4257

    accuracy                           0.57      9233
   macro avg       0.75      0.60      0.51      9233
weighted avg       0.77      0.57      0.49      9233



# MVP

In [23]:
test['log_predicted'] = lm.predict(X_test)

In [24]:
utils.print_model_evaluation(test, 'log_predicted')

Accuracy: 97.70%
---
Confusion Matrix
actual         False  True 
log_predicted              
False           4080    110
True              67   3437
---
              precision    recall  f1-score   support

       False       0.97      0.98      0.98      4147
        True       0.98      0.97      0.97      3547

    accuracy                           0.98      7694
   macro avg       0.98      0.98      0.98      7694
weighted avg       0.98      0.98      0.98      7694

