In [1]:
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import uniform, loguniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from xgboost import XGBClassifier

In [2]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def metrics(tn, fp, fn, tp, metric = ['accuracy']):
    """Compute binary-classification scores from confusion-matrix counts.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True-negative, false-positive, false-negative and true-positive counts.
    metric : str or iterable of str
        Name(s) of the scores to compute: 'accuracy', 'sensitivity',
        'specificity', 'f1', or 'all' for every score. A bare string is
        treated as a single name. Unknown names are ignored.

    Returns
    -------
    dict
        Maps each requested score name to its value, always in the order
        accuracy, sensitivity, specificity, f1. A score whose denominator
        is zero (e.g. sensitivity with no positive samples) is NaN.
    """
    # Wrap a bare string so names are matched exactly; substring matching made
    # e.g. 'recall' behave like 'all' because it contains the letters "all".
    requested = [metric] if isinstance(metric, str) else list(metric)
    want_all = 'all' in requested

    # Lambdas defer the arithmetic so only the requested scores are evaluated.
    formulas = {
        'accuracy': lambda: _safe_ratio(tp + tn, tn + fn + fp + tp),
        'sensitivity': lambda: _safe_ratio(tp, tp + fn),
        'specificity': lambda: _safe_ratio(tn, tn + fp),
        'f1': lambda: _safe_ratio(tp, tp + .5*(fp + fn)),
    }

    return {
        name: compute()
        for name, compute in formulas.items()
        if want_all or name in requested
    }

In [3]:
# Read the CSV stored at data/explored_cmv&unpop_data (path relative to the
# notebook's working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/explored_cmv&unpop_data')

In [4]:
# Remove the 'Unnamed: 0' column (presumably the index written out by an
# earlier to_csv call — TODO confirm).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run once the column is already gone; the in-place
# version raised KeyError on a second execution.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Partition the rows by the pre-assigned 'training_set' flag:
# rows flagged 1 go to `train`, rows flagged 0 go to `test`.
train = df.loc[df['training_set'].eq(1)]
test = df.loc[df['training_set'].eq(0)]

In [6]:
# Feature columns: every non-object column except the target 'post_subreddit'.
# NOTE(review): 'training_set' is not removed here, so if it is numeric it is
# fed to the models as a feature — confirm that is intended.
x_list = df.select_dtypes(exclude=np.dtype('O')).columns.to_list()
x_list.remove('post_subreddit')

# Target vector and float64 feature matrix.
y = df['post_subreddit']
X = df[x_list].astype('float64')

# Seeded, class-stratified hold-out split using train_test_split's default test size.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

- Dummy (baseline: majority-class share)

In [7]:
# Relative frequency of each target label; the largest share is the accuracy
# a predict-the-majority baseline would achieve.
df.loc[:, 'post_subreddit'].value_counts(normalize=True)

0    0.563149
1    0.436851
Name: post_subreddit, dtype: float64

- XGBoost

In [8]:
# Prior probability of the positive class (label 1), taken from the training labels.
# XGBoost's binary logistic objective reads base_score as P(y = 1); the previous
# hard-coded 0.563149 was the share of label 0 in the full data set, i.e. the
# prior of the wrong class, and it was derived from rows that include the
# held-out split.
# Assumes y_train holds 0/1 labels, as the value_counts output above suggests.
xgb_base_score = float(y_train.mean())

# Single-step pipeline so the search-space keys can use the 'xgb__' prefix.
xgb_pipe = Pipeline([
    ('xgb', XGBClassifier(base_score = xgb_base_score, random_state = 42, booster = 'gbtree', use_label_encoder = False, verbosity = 0))
])

# Search space explored by the Bayesian optimiser.
xgb_params = {
    'xgb__n_estimators': Integer(1, 100),
    'xgb__max_depth': Integer(1, 100),
    'xgb__max_delta_step': Real(0, .5),
    'xgb__min_child_weight': Real(0, 2, prior='uniform'),
    'xgb__learning_rate': Real(0, .2, prior='uniform')
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model;
# refit=True retrains the best configuration on all data passed to fit.
xgb_bs = BayesSearchCV(estimator = xgb_pipe,
                     search_spaces = xgb_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

- Bagger

In [None]:
# Single-step pipeline so the search-space keys can use the 'bc__' prefix.
bc_pipe = Pipeline([
    ('bc', BaggingClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# The fractional bounds start at 0.05 rather than 0: BaggingClassifier rejects
# a max_samples / max_features fraction that resolves to zero rows or columns,
# and with a lower bound of 0 a very small draw could abort the whole search.
# (Assumes each training fold has at least ~20 rows — TODO confirm.)
bc_params = {
    'bc__n_estimators': Integer(1, 100),
    'bc__max_samples': Real(0.05, 1.0, prior='uniform'),
    'bc__max_features': Real(0.05, 1.0, prior='uniform')
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model,
# evaluated on 8 parallel workers; refit=True retrains the best configuration
# on all data passed to fit.
bc_bs = BayesSearchCV(estimator = bc_pipe,
                     search_spaces = bc_params,
                     n_jobs = 8,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     verbose = 1,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

In [None]:
# Run the Bayesian hyperparameter search for the bagging pipeline on the training split.
bc_bs.fit(X=X_train, y=y_train)

In [None]:
# Display the hyperparameter combination the search selected as best.
bc_bs.best_params_

In [None]:
# Predict the held-out labels with the refitted best bagging pipeline.
bc_preds = bc_bs.predict(X=X_test)

In [None]:
# For two classes the flattened confusion matrix unpacks as tn, fp, fn, tp.
bc_tn, bc_fp, bc_fn, bc_tp = confusion_matrix(
    y_true=y_test,
    y_pred=bc_preds,
).ravel()

In [None]:
# Report every available score for the bagging model on the held-out split.
metrics(tn=bc_tn, fp=bc_fp, fn=bc_fn, tp=bc_tp, metric='all')

In [None]:
# Pull the fitted BaggingClassifier step out of the best pipeline.
bc = bc_bs.best_estimator_.named_steps['bc']

In [None]:
# Persist the tuned bagging model. The context manager guarantees the file is
# flushed and closed; the previous bare open() left the handle dangling.
with open('models/Bagger.sav', 'wb') as model_file:
    pickle.dump(bc, model_file)

- Logistic Regression

In [None]:
# Single-step pipeline so the search-space keys can use the 'logreg__' prefix.
# 'saga' is used because it supports all three penalties searched below.
logreg_pipe = Pipeline([
    ('logreg', LogisticRegression(random_state = 42, solver = 'saga'))
])

# Search space explored by the Bayesian optimiser.
# - C must be strictly positive (it is an inverse regularisation strength), so
#   the range starts at 1e-3 instead of 0.
# - max_iter starts at 1: Integer bounds are inclusive, so the old lower bound
#   of 0 could be drawn and would produce a model that never takes a solver step.
# - l1_ratio is only used when penalty == 'elasticnet'; for 'l1'/'l2' it is
#   ignored by scikit-learn (with a warning).
logreg_params = {
    'logreg__penalty': Categorical(['l1', 'l2', 'elasticnet']),
    'logreg__tol': Real(0, .1, prior='uniform'),
    'logreg__C': Real(1e-3, 100, prior='uniform'),
    'logreg__class_weight': Categorical(['balanced', None]),
    'logreg__max_iter': Integer(1, 1000),
    'logreg__l1_ratio': Real(0, 1, prior='uniform')
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model;
# refit=True retrains the best configuration on all data passed to fit.
logreg_bs = BayesSearchCV(estimator = logreg_pipe,
                     search_spaces = logreg_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

- Decision Tree

In [None]:
# Single-step pipeline so the search-space keys can use the 'dtree__' prefix.
dtree_pipe = Pipeline([
    ('dtree', DecisionTreeClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# - max_features no longer lists 'auto': for classifiers it is a deprecated
#   alias of 'sqrt' (rejected by recent scikit-learn releases), so it only
#   double-weighted 'sqrt' in the search.
# - The fractional min_samples_* ranges start at 1e-3 because scikit-learn
#   requires these fractions to be strictly greater than 0.
dtree_params = {
    'dtree__criterion': Categorical(['gini', 'entropy']),
    'dtree__splitter': Categorical(['best', 'random']),
    'dtree__max_depth': Integer(1, 100),
    'dtree__min_samples_split': Real(1e-3, .5),
    'dtree__min_samples_leaf': Real(1e-3, .5),
    'dtree__max_features': Categorical(['sqrt', 'log2']),
    'dtree__min_impurity_decrease': Real(0, .2, prior='uniform'),
    'dtree__ccp_alpha': Real(0, .2, prior='uniform')
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model;
# refit=True retrains the best configuration on all data passed to fit.
dtree_bs = BayesSearchCV(estimator = dtree_pipe,
                     search_spaces = dtree_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

- Random Forest

In [None]:
# Single-step pipeline so the search-space keys can use the 'rf__' prefix.
rf_pipe = Pipeline([
    ('rf', RandomForestClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# - max_features no longer lists 'auto': for classifiers it is a deprecated
#   alias of 'sqrt' (rejected by recent scikit-learn releases), so it only
#   double-weighted 'sqrt' in the search.
# - The fractional min_samples_* ranges start at 1e-3 because scikit-learn
#   requires these fractions to be strictly greater than 0.
# - max_samples starts at 0.05 so a draw can never round down to an empty
#   bootstrap sample (assumes each training fold has at least ~20 rows —
#   TODO confirm).
rf_params = {
    'rf__criterion': Categorical(['gini', 'entropy']),
    'rf__n_estimators': Integer(5, 200),
    'rf__min_samples_split': Real(1e-3, .5),
    'rf__min_samples_leaf': Real(1e-3, .5),
    'rf__max_depth': Integer(1, 100),
    'rf__max_features': Categorical(['sqrt', 'log2']),
    'rf__min_impurity_decrease': Real(0, .2, prior='uniform'),
    'rf__ccp_alpha': Real(0, .2, prior='uniform'),
    'rf__max_samples': Real(0.05, 1)
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model;
# refit=True retrains the best configuration on all data passed to fit.
rf_bs = BayesSearchCV(estimator = rf_pipe,
                     search_spaces = rf_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

- Extra Trees

In [None]:
# Single-step pipeline so the search-space keys can use the 'et__' prefix.
# ExtraTreesClassifier is left at its default bootstrap=False.
et_pipe = Pipeline([
    ('et', ExtraTreesClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# - max_samples is intentionally NOT searched: it only applies when
#   bootstrap=True. With the default bootstrap=False, scikit-learn either
#   ignores it (older releases) or raises ValueError (newer releases). To tune
#   it, construct the estimator with bootstrap=True and add the dimension back.
# - max_features no longer lists 'auto': for classifiers it is a deprecated
#   alias of 'sqrt' (rejected by recent scikit-learn releases), so it only
#   double-weighted 'sqrt' in the search.
# - The fractional min_samples_* ranges start at 1e-3 because scikit-learn
#   requires these fractions to be strictly greater than 0.
et_params = {
    'et__criterion': Categorical(['gini', 'entropy']),
    'et__n_estimators': Integer(5, 200),
    'et__min_samples_split': Real(1e-3, .5),
    'et__min_samples_leaf': Real(1e-3, .5),
    'et__max_depth': Integer(1, 100),
    'et__max_features': Categorical(['sqrt', 'log2']),
    'et__min_impurity_decrease': Real(0, .2, prior='uniform'),
    'et__ccp_alpha': Real(0, .2, prior='uniform')
}

# 50 optimisation steps, 5-fold CV, weighted F1, random-forest surrogate model;
# refit=True retrains the best configuration on all data passed to fit.
et_bs = BayesSearchCV(estimator = et_pipe,
                     search_spaces = et_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

- Compare models?

In [None]:
# Held-out ROC AUC for the tuned bagging model — the only search that is
# actually fitted in the cells shown here. Repeat for the other *_bs searches
# once they have been fitted to compare models on the same metric.
# The second predict_proba column is the probability of label 1.
roc_auc_score(y_test, bc_bs.predict_proba(X_test)[:, 1])