In [73]:
import env
import pandas as pd
import numpy as np
import utilities as utils

from wrangle import wrangle_articles
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import export_graphviz

In [2]:
# Run on first time or to renew csv files
all_articles_df, only_articles_with_text_df = wrangle_articles()

all_articles_df.to_csv("all_articles.csv", index_label='index')
only_articles_with_text_df.to_csv("only_articles_with_text.csv", index_label='index')

NaN values
Empty DataFrame
Columns: [rows, columns]
Index: []
--------------------------------
Empty values
       rows  columns
0       120        1
1       120        6
2       122        1
3       122        6
4       152        1
...     ...      ...
1258  25152        6
1259  25701        1
1260  25701        6
1261  25746        1
1262  25746        6

[1263 rows x 2 columns]
--------------------------------


In [3]:
all_articles_df.head(5)

Unnamed: 0,title,text,subject,date,is_fake,clean_title,clean_text
37302,FLASHBACK: KING OBAMA COMMUTES SENTENCES OF 22...,Just making room for Hillary President Obama t...,politics,2015-03-31,True,flashback king obama commute sentence 22 drug ...,making room hillary president obama today anno...
37303,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,The gay mafia has a new corporate Don. This i...,politics,2015-03-31,True,apple ceo say religious freedom law dangerous ...,gay mafia new corporate article need read shee...
37304,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,"In case you missed it Sen. Harry Reid (R-NV), ...",politics,2015-03-31,True,watch dirty harry reid lie romneys tax didnt win,case missed sen harry reid rnv announced last ...
37305,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,Nothing like political cronyism to make your s...,politics,2015-03-31,True,oh guess funded shrine ted kennedy,nothing like political cronyism make stomach c...
37306,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,Does anyone really think Hillary Clinton will ...,politics,2015-03-31,True,benghazi panel call hillary testify oath white...,anyone really think hillary clinton come clean...


In [8]:
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(only_articles_with_text_df.clean_text)
y = only_articles_with_text_df.is_fake

In [9]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=.2)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=.3)

In [10]:
X_train.shape, X_validate.shape, X_test.shape

((24782, 216476), (10622, 216476), (8852, 216476))

In [11]:
y_train.shape, y_validate.shape, y_test.shape

((24782,), (10622,), (8852,))

In [12]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# Establish baseline model

In [13]:
train['baseline_predicted'] = True
validate['baseline_predicted'] = True
test['baseline_predicted'] = True

In [18]:
utils.print_model_evaluation(train, 'baseline_predicted')

Accuracy: 52.66%
---
Confusion Matrix
actual              False  True 
baseline_predicted              
True                11732  13050
---
              precision    recall  f1-score   support

       False       0.00      0.00      0.00     11732
        True       0.53      1.00      0.69     13050

    accuracy                           0.53     24782
   macro avg       0.26      0.50      0.34     24782
weighted avg       0.28      0.53      0.36     24782



Baseline accuracy of 52.7% using True as default prediction for is_fake

# Logistic Regression

In [119]:
lm = LogisticRegression().fit(X_train, y_train)

train['log_predicted'] = lm.predict(X_train)
validate['log_predicted'] = lm.predict(X_validate)

In [120]:
utils.print_model_evaluation(train, 'log_predicted')

Accuracy: 98.69%
---
Confusion Matrix
actual         False  True 
log_predicted              
False          11581    173
True             151  12877
---
              precision    recall  f1-score   support

       False       0.99      0.99      0.99     11732
        True       0.99      0.99      0.99     13050

    accuracy                           0.99     24782
   macro avg       0.99      0.99      0.99     24782
weighted avg       0.99      0.99      0.99     24782



In [121]:
utils.print_model_evaluation(validate, 'log_predicted')

Accuracy: 97.89%
---
Confusion Matrix
actual         False  True 
log_predicted              
False           4930    126
True              98   5468
---
              precision    recall  f1-score   support

       False       0.98      0.98      0.98      5028
        True       0.98      0.98      0.98      5594

    accuracy                           0.98     10622
   macro avg       0.98      0.98      0.98     10622
weighted avg       0.98      0.98      0.98     10622



# Decision Tree

In [45]:
dcn_tree = DecisionTreeClassifier(max_depth=7, random_state=1414)
dcn_tree = dcn_tree.fit(X_train, y_train)

train['dcn_tree_predicted'] = dcn_tree.predict(X_train)
validate['dcn_tree_predicted'] = dcn_tree.predict(X_validate)

In [46]:
utils.print_model_evaluation(train, 'dcn_tree_predicted')

Accuracy: 90.96%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False               10755   1263
True                  977  11787
---
              precision    recall  f1-score   support

       False       0.89      0.92      0.91     11732
        True       0.92      0.90      0.91     13050

    accuracy                           0.91     24782
   macro avg       0.91      0.91      0.91     24782
weighted avg       0.91      0.91      0.91     24782



In [47]:
utils.print_model_evaluation(validate, 'dcn_tree_predicted')

Accuracy: 89.85%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False                4525    575
True                  503   5019
---
              precision    recall  f1-score   support

       False       0.89      0.90      0.89      5028
        True       0.91      0.90      0.90      5594

    accuracy                           0.90     10622
   macro avg       0.90      0.90      0.90     10622
weighted avg       0.90      0.90      0.90     10622



# Random Forest

In [95]:
rand_forest = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion='gini', min_samples_leaf=3, \
                                     n_estimators=100, max_depth=15, random_state=1414)
rand_forest = rand_forest.fit(X_train, y_train)

train['rand_forest_predicted'] = rand_forest.predict(X_train)
validate['rand_forest_predicted'] = rand_forest.predict(X_validate)

In [96]:
utils.print_model_evaluation(train, 'rand_forest_predicted')

Accuracy: 96.92%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                  11429    460
True                     303  12590
---
              precision    recall  f1-score   support

       False       0.96      0.97      0.97     11732
        True       0.98      0.96      0.97     13050

    accuracy                           0.97     24782
   macro avg       0.97      0.97      0.97     24782
weighted avg       0.97      0.97      0.97     24782



In [97]:
utils.print_model_evaluation(validate, 'rand_forest_predicted')

Accuracy: 95.59%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                   4859    299
True                     169   5295
---
              precision    recall  f1-score   support

       False       0.94      0.97      0.95      5028
        True       0.97      0.95      0.96      5594

    accuracy                           0.96     10622
   macro avg       0.96      0.96      0.96     10622
weighted avg       0.96      0.96      0.96     10622



# K Nearest Neighbors

In [113]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn = knn.fit(X_train, y_train)

train['knn_predicted'] = knn.predict(X_train)
validate['knn_predicted'] = knn.predict(X_validate)

In [114]:
utils.print_model_evaluation(train, 'knn_predicted')

Accuracy: 66.06%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False           3386     64
True            8346  12986
---
              precision    recall  f1-score   support

       False       0.98      0.29      0.45     11732
        True       0.61      1.00      0.76     13050

    accuracy                           0.66     24782
   macro avg       0.80      0.64      0.60     24782
weighted avg       0.79      0.66      0.61     24782



In [115]:
utils.print_model_evaluation(validate, 'knn_predicted')

Accuracy: 62.43%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False           1067     30
True            3961   5564
---
              precision    recall  f1-score   support

       False       0.97      0.21      0.35      5028
        True       0.58      0.99      0.74      5594

    accuracy                           0.62     10622
   macro avg       0.78      0.60      0.54     10622
weighted avg       0.77      0.62      0.55     10622

