In [None]:
from datetime import datetime as dt
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline as skl_Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imb_Pipeline

from pipeline_objects import PrepData, MyTargetEncoder, MultipurposeEncoder, make_evaluation_plots

In [None]:
# Read the clock once so day/month/year always describe the same instant
# (three separate dt.today() calls can straddle midnight and build a path
# for a date that never existed).
today = dt.today()
day, month, year = today.day, today.month, today.year

# The snapshot is looked up by today's date (no zero padding), so this cell
# only succeeds when a file named reddit_data_<month>-<day>-<year> exists in
# the working directory for the day the notebook is run.
dt_str = os.path.join('.', 'reddit_data_{}-{}-{}'.format(month, day, year))

# NOTE(review): unpickling executes arbitrary code embedded in the file;
# only load snapshots that were produced by a trusted process.
df_orig = pd.read_pickle(dt_str)

In [None]:
# Columns handed to MultipurposeEncoder as `ohe_feats`.
ohe_enc_feats = [
    'contest_mode',
    'edited',
    'adult_content',
    'oc',
    'reddit_media',
    'selfpost',
    'video',
    'distinguished',
]

# Columns handed to MultipurposeEncoder as `target_feats`.
target_enc_feats = [
    'content_categories',
    'subreddit',
    'weekday',
    'post_hour',
]

# Columns handed to PrepData as `drop_feats`.
drop_feats = [
    'final_upvotes',
    'final_num_comments',
    'title',
    'how_sorted',
]

In [None]:
# Run the project-specific preparation step on the raw snapshot.
data_prep = PrepData(drop_feats=drop_feats)
df_prepped = data_prep.fit_transform(df_orig)

# 'gilded' is the prediction target; every other prepared column is a feature.
y = df_prepped['gilded']
X = df_prepped.drop(columns='gilded')

# Hold out 25% of the rows, stratified on the target so both splits keep the
# same class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Baseline: a classifier that ignores the features and draws labels at random
# according to the class frequencies seen in y_train. It is seeded with the
# same value as the train/test split so the baseline precision/recall printed
# below is identical on every Restart & Run All.
dum = DummyClassifier(strategy='stratified', random_state=42)
dum.fit(X_train, y_train)

dum_predictions = dum.predict(X_test)

# Precision/recall of the random baseline on the held-out split; any real
# model should beat these numbers.
dum_prec = precision_score(y_test, dum_predictions)
dum_reca = recall_score(y_test, dum_predictions)

print('Precision is: {:.1f}%'.format(dum_prec*100))
print('Recall is: {:.1f}%'.format(dum_reca*100))

In [None]:
# Ratio of positive (gilded) to negative rows in the training split.
# Negatives are counted as len - positives: for a boolean target this equals
# (~y_train).sum(), and it stays correct if the target is stored as 0/1
# integers, where ~ would be a bitwise NOT and produce a meaningless ratio.
n_gilded = y_train.sum()
current_ratio = n_gilded / (len(y_train) - n_gilded)

# Every stochastic component gets the same seed as the train/test split so the
# search results below are reproducible from a fresh kernel.
xgb = XGBClassifier(eval_metric='aucpr', use_label_encoder=False, verbosity=0,
                    random_state=42)

# Alternative model kept for reference:
#rmf = RandomForestClassifier(n_jobs = -1)
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)

# imblearn's Pipeline is required here: the samplers are applied while fitting
# only, never when predicting.
resample_pipe = imb_Pipeline([('over', over),
                              ('under', under),
                              ('model', xgb)
                             ])

# Full pipeline: categorical encoding -> scaling -> (resample + classify).
pipe = skl_Pipeline([('categ_enc', MultipurposeEncoder(ohe_feats=ohe_enc_feats,
                                                       target_feats=target_enc_feats,
                                                       target_how='additive_smoothing',
                                                      )),
                     ('scaler', StandardScaler()),
                     #('decomp', PCA()),
                     ('resample_classify', resample_pipe)
                    ])


# XGBoost parameter reference:
# https://xgboost.readthedocs.io/en/latest/parameter.html
# Each list currently holds a single value, so the "grid" is one configuration
# evaluated with cross-validation; add values to a list to actually search.
parameters = {'resample_classify__model__eta': [0.25],
              'resample_classify__model__gamma': [2],
              'resample_classify__model__min_child_weight': [5],
              'resample_classify__model__max_depth': [3],
              'resample_classify__model__max_delta_step': [10],
              'resample_classify__model__subsample': [1],
              'resample_classify__model__sampling_method': ['uniform'],
              'resample_classify__model__lambda': [10],
              'resample_classify__model__alpha': [0],
              'resample_classify__model__scale_pos_weight': [2],
              # Both samplers are asked for a class ratio 1% above the one
              # already present in the training data.
              'resample_classify__over__sampling_strategy': [current_ratio*1.01],
              'resample_classify__under__sampling_strategy': [current_ratio*1.01],
              # NOTE(review): presumably the smoothing weight used by
              # target_how='additive_smoothing' -- confirm in pipeline_objects.
              'categ_enc__target_weight': [0],
             # 'decomp__n_components': [0.99]
             }


clf = GridSearchCV(pipe,
                   parameters,
                   n_jobs=-1,
                   #scoring = pr_auc_scorer,
                   scoring='average_precision',
                   verbose=1,
                  )
_ = clf.fit(X_train, y_train)

# Probability cut-off used by the evaluation plots (and by later cells that
# turn predicted probabilities into hard predictions).
thresh = 0.10
make_evaluation_plots(clf, X_train, y_train, X_test, y_test, thresh=thresh)

# Show the last eight columns of the cross-validation results, best rank first.
cv_fits = pd.DataFrame(clf.cv_results_)
cv_fits.iloc[:, -8:].sort_values('rank_test_score', ascending=True)

In [None]:
# Parameter combination selected by the grid search (ranked by the
# 'average_precision' scoring passed to GridSearchCV).
clf.best_params_

In [None]:
# Pull the fitted XGBoost model out of the refit best pipeline and read its
# per-feature importances.
feat_imp = (clf.best_estimator_
            .named_steps['resample_classify']
            .named_steps['model']
            .feature_importances_)

# NOTE(review): the importances are labelled with the raw X_train columns; this
# assumes the 'categ_enc' step keeps the number and order of columns unchanged
# (one-hot encoding may not) -- confirm against MultipurposeEncoder.
(pd.Series(feat_imp, index=X_train.columns)
   .sort_values(ascending=False))

In [None]:
# Class probabilities for every row of X. X contains the rows of X_train that
# the model was fitted on, so these values are not an out-of-sample estimate.
clf.predict_proba(X)

In [None]:
# Scratch value; not referenced by the modelling cells visible in this notebook.
a = pd.Series([1, 4, 5, 7, 8])

In [None]:
# Scratch experiment: sets the Series' name attribute in place.
a.name = 'poop'

In [None]:
# Display the scratch Series (its name is shown in the repr footer).
a

In [None]:
def _bin_by_probability(frame, n_bins=10):
    """Summarise a slice of `outcomes` in equal-width bins of predicted probability.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows of `outcomes`; must contain a 'probas' and a 'gilded' column.
    n_bins : int, default 10
        Number of equal-width bins passed to ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns 'probas' (the bin interval), 'count'
        (rows in the bin), 'gilded' (sum of 'gilded' in the bin), 'center'
        and 'width' (midpoint and length of the interval). The frame has
        these columns but no rows when `frame` is empty.
    """
    columns = ['probas', 'count', 'gilded', 'center', 'width']
    if frame.empty:
        # pd.cut raises on empty input; return an empty summary instead.
        return pd.DataFrame(columns=columns)

    prob_bins = pd.cut(frame['probas'], bins=n_bins)
    binned = (frame
              # observed=False keeps bins that received no rows, independent
              # of the pandas version's default.
              .groupby(prob_bins, observed=False)['gilded']
              .agg(['count', 'sum'])
              .rename(columns={'sum': 'gilded'})
              .reset_index()
             )
    binned['center'] = [interval.mid for interval in binned['probas']]
    binned['width'] = [interval.length for interval in binned['probas']]
    return binned[columns]


def _plot_stacked_outcomes(ax, binned, correct_col, wrong_col,
                           correct_label, wrong_label, title, bar_fill=0.7):
    """Draw correct (blue) and incorrect (red) counts per probability bin, stacked.

    Bar width is `bar_fill` times the actual bin width, so the bars neither
    overlap nor shrink to slivers when the threshold or the spread of the
    predicted probabilities changes.
    """
    if binned.empty:
        ax.text(0.5, 0.5, 'No posts in this group',
                ha='center', va='center', transform=ax.transAxes)
    else:
        bar_width = bar_fill * binned['width']
        ax.bar(binned['center'],
               binned[correct_col],
               width=bar_width,
               color='blue',
               alpha=0.7,
               label=correct_label
              )
        ax.bar(binned['center'],
               binned[wrong_col],
               bottom=binned[correct_col],
               width=bar_width,
               color='red',
               alpha=0.7,
               label=wrong_label
              )
        ax.set_xticks(binned['center'])
        ax.set_xticklabels([round(x, 2) for x in binned['center']])
        ax.legend(loc='upper right')
    ax.set_title(title)
    ax.set_xlabel('Predicted Probability of Gilding')
    ax.set_ylabel('Number of posts')


# Second column of predict_proba, thresholded with the cut-off `thresh`
# defined in the model-fitting cell.
probas = clf.predict_proba(X_test)[:, 1]
predictions = probas > thresh

outcomes = pd.DataFrame({'gilded': y_test, 'predicted': predictions, 'probas': probas})
pos_outcomes = outcomes[outcomes['predicted']]
neg_outcomes = outcomes[~outcomes['predicted']]

# Predicted positive: rows with gilded set are true positives, the rest of the
# bin are false positives. Every row in this slice has predicted == True, so
# the per-bin sum of 'predicted' equals the per-bin row count.
predicted_positive = _bin_by_probability(pos_outcomes)
predicted_positive['predicted'] = predicted_positive['count']
predicted_positive['false pos'] = predicted_positive['count'] - predicted_positive['gilded']

# Predicted negative: rows with gilded set are false negatives, the rest of the
# bin are true negatives.
predicted_negative = _bin_by_probability(neg_outcomes)
predicted_negative['true neg'] = predicted_negative['count'] - predicted_negative['gilded']
predicted_negative['false neg'] = predicted_negative['gilded']


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

_plot_stacked_outcomes(ax1, predicted_positive,
                       correct_col='gilded', wrong_col='false pos',
                       correct_label='True positives', wrong_label='False positives',
                       title='Predict: Gilded')

_plot_stacked_outcomes(ax2, predicted_negative,
                       correct_col='true neg', wrong_col='false neg',
                       correct_label='True negatives', wrong_label='False negatives',
                       title='Predict: Not Gilded')
# Log scale on the negative panel only, as in the original layout.
ax2.set_yscale('log')


plt.show()