In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from skopt.space import Integer, Real, Categorical
from skopt import BayesSearchCV
from scipy.stats import uniform, loguniform

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
# Load the dataset from the project's data/ directory into `df`
# (the path has no file extension; it is parsed as CSV).
df = pd.read_csv('data/explored_cmv&unpop_data')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover CSV index column).
# Rebinding `df` instead of mutating it in place, and tolerating an already-removed
# column, keeps this cell safe to re-run without a KeyError.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [None]:
# Target: the subreddit label column.
y = df['post_subreddit']

# Features: every non-object column except the target, cast to float64.
x_list = list(df.select_dtypes(exclude = np.dtype('O')).columns)
x_list.remove('post_subreddit')
X = df[x_list].astype(np.dtype('float64'))

# Stratified hold-out split (default test size), seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42,
)

- Dummy

In [None]:
# Baseline reference: relative frequency of each target class over the full dataset.
df['post_subreddit'].value_counts(normalize = True)

- XGBoost

In [None]:
# XGBoost wrapped in a single-step pipeline, so hyper-parameters are addressed
# with the `xgb__` prefix.
# NOTE(review): base_score is hard-coded; presumably it mirrors a class share
# from the value_counts cell — confirm against the data.
xgb_pipe = Pipeline(steps = [
    (
        'xgb',
        XGBClassifier(
            booster = 'gbtree',
            base_score = .563149,
            use_label_encoder = False,
            verbosity = 0,
            random_state = 42,
        ),
    ),
])

# Search space explored by the Bayesian optimiser.
xgb_params = {
    'xgb__n_estimators': Integer(1, 100),
    'xgb__max_depth': Integer(1, 100),
    'xgb__max_delta_step': Real(0, .5),
    'xgb__min_child_weight': Real(0, 2, prior = 'uniform'),
    'xgb__learning_rate': Real(0, .2, prior = 'uniform'),
}

# Two otherwise identical searches: one scored on F1, one on ROC AUC.
xgb_bs, xgb_bs_rocauc = (
    BayesSearchCV(
        estimator = xgb_pipe,
        search_spaces = xgb_params,
        scoring = metric,
        n_iter = 50,
        n_jobs = 8,
        cv = 5,
        verbose = 1,
        refit = True,
        optimizer_kwargs = {'base_estimator': 'RF'},
        random_state = 42,
    )
    for metric in ('f1', 'roc_auc')
)

In [None]:
# Run the F1-scored Bayesian search for the XGBoost pipeline on the training split.
xgb_bs.fit(X_train, y_train)

In [None]:
# Pull the fitted XGBoost step out of the refit best pipeline.
xgb = xgb_bs.best_estimator_.named_steps['xgb']

In [None]:
# Persist the F1-tuned XGBoost model. The context manager guarantees the file
# handle is flushed and closed once the dump finishes.
with open('models/XGBoost.sav', 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [None]:
# Run the ROC-AUC-scored Bayesian search for the XGBoost pipeline on the training split.
xgb_bs_rocauc.fit(X_train, y_train)

In [None]:
# Pull the fitted XGBoost step out of the ROC-AUC search's refit best pipeline.
xgb_rocauc = xgb_bs_rocauc.best_estimator_.named_steps['xgb']

In [None]:
# Persist the ROC-AUC-tuned XGBoost model. The context manager guarantees the
# file handle is flushed and closed once the dump finishes.
with open('models/XGBoost_rocauc.sav', 'wb') as model_file:
    pickle.dump(xgb_rocauc, model_file)

- Logistic Regression

In [None]:
# Logistic regression (saga solver) wrapped in a single-step pipeline, so
# hyper-parameters are addressed with the `logreg__` prefix.
logreg_pipe = Pipeline(steps = [
    ('logreg', LogisticRegression(solver = 'saga', random_state = 42)),
])

# Search space explored by the Bayesian optimiser.
logreg_params = {
    'logreg__penalty': Categorical(['l1', 'l2', 'elasticnet']),
    'logreg__tol': Real(0, .1, prior = 'uniform'),
    'logreg__C': Real(0.0001, 100, prior = 'log-uniform'),
    'logreg__class_weight': Categorical(['balanced', None]),
    'logreg__max_iter': Integer(500, 1000),
    'logreg__l1_ratio': Real(0, 1, prior = 'uniform'),
}

# Two otherwise identical searches: one scored on F1, one on ROC AUC.
logreg_bs, logreg_bs_rocauc = (
    BayesSearchCV(
        estimator = logreg_pipe,
        search_spaces = logreg_params,
        scoring = metric,
        n_iter = 50,
        n_jobs = 8,
        cv = 5,
        refit = True,
        optimizer_kwargs = {'base_estimator': 'RF'},
        random_state = 42,
    )
    for metric in ('f1', 'roc_auc')
)

In [None]:
# Run the F1-scored Bayesian search for the logistic-regression pipeline on the training split.
logreg_bs.fit(X_train, y_train)

In [None]:
# Pull the fitted logistic-regression step out of the refit best pipeline.
logreg = logreg_bs.best_estimator_.named_steps['logreg']

In [None]:
# Persist the F1-tuned logistic regression. The context manager guarantees the
# file handle is flushed and closed once the dump finishes.
with open('models/LogisticRegression.sav', 'wb') as model_file:
    pickle.dump(logreg, model_file)

In [None]:
# Run the ROC-AUC-scored Bayesian search for the logistic-regression pipeline on the training split.
logreg_bs_rocauc.fit(X_train, y_train)

In [None]:
# Pull the fitted logistic-regression step out of the ROC-AUC search's refit best pipeline.
logreg_rocauc = logreg_bs_rocauc.best_estimator_.named_steps['logreg']

In [None]:
# Persist the ROC-AUC-tuned logistic regression. The context manager guarantees
# the file handle is flushed and closed once the dump finishes.
with open('models/LogisticRegression_rocauc.sav', 'wb') as model_file:
    pickle.dump(logreg_rocauc, model_file)

- Decision Tree

In [None]:
# Decision tree wrapped in a single-step pipeline, so hyper-parameters are
# addressed with the `dtree__` prefix.
dtree_pipe = Pipeline(steps = [
    ('dtree', DecisionTreeClassifier(random_state = 42)),
])

# Search space explored by the Bayesian optimiser.
# NOTE(review): the lower bound 0 for the fractional min_samples_* parameters lies
# outside the float range scikit-learn documents as valid; an exact 0.0 draw is
# very unlikely, but a small positive floor would be safer — confirm.
dtree_params = {
    'dtree__criterion': Categorical(['gini', 'entropy']),
    'dtree__splitter': Categorical(['best', 'random']),
    'dtree__max_depth': Integer(1, 100),
    'dtree__min_samples_split': Real(0, .5),
    'dtree__min_samples_leaf': Real(0, .5),
    'dtree__max_features': Categorical(['auto', 'sqrt', 'log2']),
    'dtree__min_impurity_decrease': Real(0, .2, prior = 'uniform'),
    'dtree__ccp_alpha': Real(0, .2, prior = 'uniform'),
}

# Two otherwise identical searches: one scored on F1, one on ROC AUC.
dtree_bs, dtree_bs_rocauc = (
    BayesSearchCV(
        estimator = dtree_pipe,
        search_spaces = dtree_params,
        scoring = metric,
        n_iter = 50,
        cv = 5,
        n_jobs = 8,
        refit = True,
        optimizer_kwargs = {'base_estimator': 'RF'},
        random_state = 42,
    )
    for metric in ('f1', 'roc_auc')
)

In [None]:
# Run the F1-scored Bayesian search for the decision-tree pipeline on the training split.
dtree_bs.fit(X_train, y_train)

In [None]:
# Pull the fitted decision-tree step out of the refit best pipeline.
dtree = dtree_bs.best_estimator_.named_steps['dtree']

In [None]:
# Persist the F1-tuned decision tree. The context manager guarantees the file
# handle is flushed and closed once the dump finishes.
with open('models/DecisionTree.sav', 'wb') as model_file:
    pickle.dump(dtree, model_file)

In [None]:
# Run the ROC-AUC-scored Bayesian search for the decision-tree pipeline on the training split.
dtree_bs_rocauc.fit(X_train, y_train)

In [None]:
# Pull the fitted decision-tree step out of the ROC-AUC search's refit best pipeline.
dtree_rocauc = dtree_bs_rocauc.best_estimator_.named_steps['dtree']

In [None]:
# Persist the ROC-AUC-tuned decision tree. The context manager guarantees the
# file handle is flushed and closed once the dump finishes.
with open('models/DecisionTree_rocauc.sav', 'wb') as model_file:
    pickle.dump(dtree_rocauc, model_file)

- Random Forest

In [None]:
# Random forest wrapped in a single-step pipeline, so hyper-parameters are
# addressed with the `rf__` prefix.
rf_pipe = Pipeline([
    ('rf', RandomForestClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# NOTE(review): the lower bound 0 for min_samples_* and max_samples lies outside
# the float range scikit-learn documents as valid; an exact 0.0 draw is very
# unlikely, but a small positive floor would be safer — confirm.
rf_params = {
    'rf__criterion': Categorical(['gini', 'entropy']),
    'rf__n_estimators': Integer(5, 200),
    'rf__min_samples_split': Real(0, .5),
    'rf__min_samples_leaf': Real(0, .5),
    'rf__max_depth': Integer(1, 100),
    'rf__max_features': Categorical(['auto', 'sqrt', 'log2']),
    'rf__min_impurity_decrease': Real(0, .2, prior='uniform'),
    'rf__ccp_alpha': Real(0, .2, prior='uniform'),
    'rf__max_samples': Real(0, 1)
}

# Two otherwise identical searches: `rf_bs` is scored on F1 (matching the other
# models' `*_bs` searches) and `rf_bs_rocauc` on ROC AUC. Building both from one
# template keeps the two configurations from drifting apart.
rf_bs, rf_bs_rocauc = (
    BayesSearchCV(estimator = rf_pipe,
                  search_spaces = rf_params,
                  scoring = metric,
                  n_iter = 50,
                  cv = 5,
                  n_jobs = 8,
                  refit = True,
                  optimizer_kwargs = {'base_estimator': 'RF'},
                  random_state = 42)
    for metric in ('f1', 'roc_auc')
)

In [None]:
# Run the first random-forest Bayesian search (`rf_bs`) on the training split.
rf_bs.fit(X_train, y_train)

In [None]:
# Pull the fitted random-forest step out of the refit best pipeline of `rf_bs`.
rf = rf_bs.best_estimator_.named_steps['rf']

In [None]:
# Persist the random forest selected by `rf_bs`. The context manager guarantees
# the file handle is flushed and closed once the dump finishes.
with open('models/RandomForest.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Run the ROC-AUC-scored Bayesian search for the random-forest pipeline on the training split.
rf_bs_rocauc.fit(X_train, y_train)

In [None]:
# Pull the fitted random-forest step out of the ROC-AUC search's refit best pipeline.
rf_rocauc = rf_bs_rocauc.best_estimator_.named_steps['rf']

In [None]:
# Persist the ROC-AUC-tuned random forest. The context manager guarantees the
# file handle is flushed and closed once the dump finishes.
with open('models/RandomForest_rocauc.sav', 'wb') as model_file:
    pickle.dump(rf_rocauc, model_file)

- Extra Trees

In [None]:
# Extra-trees ensemble wrapped in a single-step pipeline, so hyper-parameters
# are addressed with the `et__` prefix.
et_pipe = Pipeline([
    ('et', ExtraTreesClassifier(random_state = 42))
])

# Search space explored by the Bayesian optimiser.
# `max_samples` is deliberately not searched: ExtraTreesClassifier defaults to
# bootstrap=False, where max_samples has no effect on older scikit-learn releases
# and is rejected with a ValueError on newer ones.
# NOTE(review): the lower bound 0 for the fractional min_samples_* parameters lies
# outside the float range scikit-learn documents as valid; an exact 0.0 draw is
# very unlikely, but a small positive floor would be safer — confirm.
et_params = {
    'et__criterion': Categorical(['gini', 'entropy']),
    'et__n_estimators': Integer(5, 200),
    'et__min_samples_split': Real(0, .5),
    'et__min_samples_leaf': Real(0, .5),
    'et__max_depth': Integer(1, 100),
    'et__max_features': Categorical(['auto', 'sqrt', 'log2']),
    'et__min_impurity_decrease': Real(0, .2, prior='uniform'),
    'et__ccp_alpha': Real(0, .2, prior='uniform')
}

# F1-scored search.
et_bs = BayesSearchCV(estimator = et_pipe,
                     search_spaces = et_params,
                     scoring = 'f1',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     n_jobs = 8,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

# ROC-AUC-scored search over the same pipeline and space.
et_bs_rocauc = BayesSearchCV(estimator = et_pipe,
                     search_spaces = et_params,
                     scoring = 'roc_auc',
                     n_iter = 50,
                     cv = 5,
                     refit = True,
                     n_jobs = 8,
                     optimizer_kwargs = {'base_estimator': 'RF'},
                     random_state=42)

In [None]:
# Run the F1-scored Bayesian search for the extra-trees pipeline on the training split.
et_bs.fit(X_train, y_train)

In [None]:
# Pull the fitted extra-trees step out of the refit best pipeline.
et = et_bs.best_estimator_.named_steps['et']

In [None]:
# Persist the F1-tuned extra-trees model. The context manager guarantees the
# file handle is flushed and closed once the dump finishes.
with open('models/ExtraTrees.sav', 'wb') as model_file:
    pickle.dump(et, model_file)

In [None]:
# Run the ROC-AUC-scored Bayesian search for the extra-trees pipeline on the training split.
et_bs_rocauc.fit(X_train, y_train)

In [None]:
# Pull the fitted extra-trees step out of the ROC-AUC search's refit best pipeline.
et_rocauc = et_bs_rocauc.best_estimator_.named_steps['et']

In [None]:
# Persist the ROC-AUC-tuned extra-trees model. The context manager guarantees
# the file handle is flushed and closed once the dump finishes.
with open('models/ExtraTrees_rocauc.sav', 'wb') as model_file:
    pickle.dump(et_rocauc, model_file)

- To step 6 ->