In [1]:
import env
import pandas as pd
import numpy as np
import utilities as utils

from wrangle import wrangle_articles
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import export_graphviz

In [None]:
# Run on first time or to renew csv files
articles_df = wrangle_articles()

In [3]:
articles_df.to_csv("articles.csv")

In [4]:
articles_df.head(5)

Unnamed: 0,title,text,subject,date,is_fake,clean_title,clean_text,title_polarity,title_subjectivity,text_polarity,text_subjectivity
37302,FLASHBACK: KING OBAMA COMMUTES SENTENCES OF 22...,Just making room for Hillary President Obama t...,politics,2015-03-31,True,flashback king obama commute sentence 22 drug ...,making room hillary president obama today anno...,0.0,0.0,-0.201587,0.493452
37303,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,The gay mafia has a new corporate Don. This i...,politics,2015-03-31,True,apple ceo say religious freedom law dangerous ...,gay mafia new corporate article need read shee...,0.058333,0.579167,-0.040032,0.582057
37304,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,"In case you missed it Sen. Harry Reid (R-NV), ...",politics,2015-03-31,True,watch dirty harry reid lie romneys tax didnt win,case missed sen harry reid rnv announced last ...,0.1,0.6,0.15,0.511111
37305,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,Nothing like political cronyism to make your s...,politics,2015-03-31,True,oh guess funded shrine ted kennedy,nothing like political cronyism make stomach c...,0.0,0.0,0.122865,0.441116
37306,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,Does anyone really think Hillary Clinton will ...,politics,2015-03-31,True,benghazi panel call hillary testify oath white...,anyone really think hillary clinton come clean...,0.0,0.0,0.110586,0.433784


In [5]:
articles_df.shape

(38651, 11)

In [6]:
utils.nan_null_empty_check(articles_df)

NaN values
Empty DataFrame
Columns: [rows, columns]
Index: []
--------------------------------
Empty values
Empty DataFrame
Columns: [rows, columns]
Index: []
--------------------------------


{'nan_positions': (array([], dtype=int64), array([], dtype=int64)),
 'empty_positions': (array([], dtype=int64), array([], dtype=int64))}

In [7]:
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(articles_df.clean_text)
y = articles_df.is_fake

In [8]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=.2)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=.3)

In [9]:
X_train.shape, X_validate.shape, X_test.shape

((21644, 218347), (9276, 218347), (7731, 218347))

In [10]:
y_train.shape, y_validate.shape, y_test.shape

((21644,), (9276,), (7731,))

In [11]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# Establish baseline model

In [12]:
articles_df.is_fake.value_counts()

False    21196
True     17455
Name: is_fake, dtype: int64

In [13]:
train['baseline_predicted'] = False
validate['baseline_predicted'] = False
test['baseline_predicted'] = False

In [14]:
utils.print_model_evaluation(train, 'baseline_predicted')

Accuracy: 54.84%
---
Confusion Matrix
actual              False  True 
baseline_predicted              
False               11869   9775
---
              precision    recall  f1-score   support

       False       0.55      1.00      0.71     11869
        True       0.00      0.00      0.00      9775

    accuracy                           0.55     21644
   macro avg       0.27      0.50      0.35     21644
weighted avg       0.30      0.55      0.39     21644



Baseline accuracy of 55% using True as default prediction for is_fake

# Logistic Regression

In [15]:
lm = LogisticRegression().fit(X_train, y_train)

train['log_predicted'] = lm.predict(X_train)
validate['log_predicted'] = lm.predict(X_validate)

In [16]:
utils.print_model_evaluation(train, 'log_predicted')

Accuracy: 98.62%
---
Confusion Matrix
actual         False  True 
log_predicted              
False          11774    203
True              95   9572
---
              precision    recall  f1-score   support

       False       0.98      0.99      0.99     11869
        True       0.99      0.98      0.98      9775

    accuracy                           0.99     21644
   macro avg       0.99      0.99      0.99     21644
weighted avg       0.99      0.99      0.99     21644



In [17]:
utils.print_model_evaluation(validate, 'log_predicted')

Accuracy: 98.02%
---
Confusion Matrix
actual         False  True 
log_predicted              
False           5020    117
True              67   4072
---
              precision    recall  f1-score   support

       False       0.98      0.99      0.98      5087
        True       0.98      0.97      0.98      4189

    accuracy                           0.98      9276
   macro avg       0.98      0.98      0.98      9276
weighted avg       0.98      0.98      0.98      9276



# Decision Tree

In [18]:
dcn_tree = DecisionTreeClassifier(max_depth=7, random_state=1414)
dcn_tree = dcn_tree.fit(X_train, y_train)

train['dcn_tree_predicted'] = dcn_tree.predict(X_train)
validate['dcn_tree_predicted'] = dcn_tree.predict(X_validate)

In [19]:
utils.print_model_evaluation(train, 'dcn_tree_predicted')

Accuracy: 91.49%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False               11208   1180
True                  661   8595
---
              precision    recall  f1-score   support

       False       0.90      0.94      0.92     11869
        True       0.93      0.88      0.90      9775

    accuracy                           0.91     21644
   macro avg       0.92      0.91      0.91     21644
weighted avg       0.92      0.91      0.91     21644



In [20]:
utils.print_model_evaluation(validate, 'dcn_tree_predicted')

Accuracy: 90.14%
---
Confusion Matrix
actual              False  True 
dcn_tree_predicted              
False                4744    572
True                  343   3617
---
              precision    recall  f1-score   support

       False       0.89      0.93      0.91      5087
        True       0.91      0.86      0.89      4189

    accuracy                           0.90      9276
   macro avg       0.90      0.90      0.90      9276
weighted avg       0.90      0.90      0.90      9276



# Random Forest

In [21]:
rand_forest = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion='gini', min_samples_leaf=3, \
                                     n_estimators=100, max_depth=15, random_state=1414)
rand_forest = rand_forest.fit(X_train, y_train)

train['rand_forest_predicted'] = rand_forest.predict(X_train)
validate['rand_forest_predicted'] = rand_forest.predict(X_validate)

In [22]:
utils.print_model_evaluation(train, 'rand_forest_predicted')

Accuracy: 96.09%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                  11667    645
True                     202   9130
---
              precision    recall  f1-score   support

       False       0.95      0.98      0.96     11869
        True       0.98      0.93      0.96      9775

    accuracy                           0.96     21644
   macro avg       0.96      0.96      0.96     21644
weighted avg       0.96      0.96      0.96     21644



In [23]:
utils.print_model_evaluation(validate, 'rand_forest_predicted')

Accuracy: 94.91%
---
Confusion Matrix
actual                 False  True 
rand_forest_predicted              
False                   4951    336
True                     136   3853
---
              precision    recall  f1-score   support

       False       0.94      0.97      0.95      5087
        True       0.97      0.92      0.94      4189

    accuracy                           0.95      9276
   macro avg       0.95      0.95      0.95      9276
weighted avg       0.95      0.95      0.95      9276



# K Nearest Neighbors

In [24]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn = knn.fit(X_train, y_train)

train['knn_predicted'] = knn.predict(X_train)
validate['knn_predicted'] = knn.predict(X_validate)

In [25]:
utils.print_model_evaluation(train, 'knn_predicted')

Accuracy: 88.66%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False          11551   2136
True             318   7639
---
              precision    recall  f1-score   support

       False       0.84      0.97      0.90     11869
        True       0.96      0.78      0.86      9775

    accuracy                           0.89     21644
   macro avg       0.90      0.88      0.88     21644
weighted avg       0.90      0.89      0.88     21644



In [26]:
utils.print_model_evaluation(validate, 'knn_predicted')

Accuracy: 84.22%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False           4881   1258
True             206   2931
---
              precision    recall  f1-score   support

       False       0.80      0.96      0.87      5087
        True       0.93      0.70      0.80      4189

    accuracy                           0.84      9276
   macro avg       0.86      0.83      0.83      9276
weighted avg       0.86      0.84      0.84      9276



# MVP

In [27]:
test['log_predicted'] = lm.predict(X_test)

In [28]:
utils.print_model_evaluation(test, 'log_predicted')

Accuracy: 97.68%
---
Confusion Matrix
actual         False  True 
log_predicted              
False           4180    119
True              60   3372
---
              precision    recall  f1-score   support

       False       0.97      0.99      0.98      4240
        True       0.98      0.97      0.97      3491

    accuracy                           0.98      7731
   macro avg       0.98      0.98      0.98      7731
weighted avg       0.98      0.98      0.98      7731

