In [None]:
from matplotlib import pyplot as plt
from tabulate import tabulate
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.dummy import DummyClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Notebook-wide plot styling: matplotlib's 'dark_background' style sheet and
# seaborn's "Set1" colour palette.
plt.style.use('dark_background')
sns.set_palette("Set1")

In [None]:
# Read the feature CSVs for the train / cross-validation / test splits.
tr_f_df = pd.read_csv('./data/train_fea.csv')
cv_f_df = pd.read_csv('./data/cv_fea.csv')
te_f_df = pd.read_csv('./data/test_fea.csv')

In [None]:
# The last column of the training table is taken as the target; every column
# before it is treated as a feature.
fea_cols = tr_f_df.columns.tolist()
target = fea_cols.pop(-1)
# Distinct values of the 'class' column of the CV split, in order of first
# appearance; used to order the confusion-matrix axes.
# NOTE(review): presumably `target` is also 'class' -- confirm.
labels = cv_f_df['class'].unique()

In [None]:
# For each split, pull out the feature matrix and the target vector via `.values`.
X_train, y_train = tr_f_df[fea_cols].values, tr_f_df[target].values
X_cv, y_cv = cv_f_df[fea_cols].values, cv_f_df[target].values
X_test, y_test = te_f_df[fea_cols].values, te_f_df[target].values

In [None]:
def plot_heatmap(matrix, title, labels):
    """Draw ``matrix`` as an annotated seaborn heatmap.

    Parameters
    ----------
    matrix : array-like
        2-D values to display; every cell is annotated with two decimals.
    title : str
        Title placed above the heatmap.
    labels : sequence
        Tick labels used on both the x (predicted) and y (actual) axes.
    """
    ax = sns.heatmap(matrix, annot=True, fmt='.2f', linewidths=0.1,
                     xticklabels=labels, yticklabels=labels)
    ax.set_xlabel('Predicted Class')
    ax.set_ylabel('Actual Class')
    ax.set_title(title, fontsize=10)

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Plot the confusion, precision and recall matrices side by side.

    The precision matrix is the confusion matrix with every column divided
    by its column total; the recall matrix divides every row by its row
    total. A class that is never predicted (zero column total) or never
    present in ``y_true`` (zero row total) yields zeros instead of NaN.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : sequence
        Class labels fixing the row/column order of the matrices and used
        as tick labels.
    """
    cmat = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)

    # Precision: normalise each column by the number of predictions of that
    # class. `where` skips empty columns so they stay 0 rather than 0/0 = NaN.
    col_totals = cmat.sum(axis=0, keepdims=True)
    pmat = np.divide(cmat, col_totals,
                     out=np.zeros(cmat.shape, dtype=float),
                     where=col_totals != 0)
    print("Column sum of precision matrix: {}".format(pmat.sum(axis=0)))

    # Recall: normalise each row by the number of true samples of that class.
    row_totals = cmat.sum(axis=1, keepdims=True)
    rmat = np.divide(cmat, row_totals,
                     out=np.zeros(cmat.shape, dtype=float),
                     where=row_totals != 0)
    print("Row sum of recall matrix:       {}".format(rmat.sum(axis=1)))

    plt.figure(figsize=(15, 3))
    panels = ((131, cmat, 'Confusion Matrix'),
              (132, pmat, 'Precision Matrix'),
              (133, rmat, 'Recall Matrix'))
    for position, matrix, panel_title in panels:
        plt.subplot(position)
        plot_heatmap(matrix=matrix, title=panel_title, labels=labels)
    plt.show()

In [None]:
def reporter(clf, X, y, title, labels, best=None):
    """Print and plot evaluation results of ``clf`` on one data split.

    Prints the split title, the log loss (rounded to 3 decimals), the best
    hyper-parameters when given, the confusion/precision/recall heatmaps and
    the classification report.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict_proba`` and ``predict``.
    X, y : array-like
        Features and true labels of the split being evaluated.
    title : str
        Heading printed before the metrics (e.g. ``'Train'``).
    labels : sequence
        Class labels passed on to ``plot_confusion_matrix``.
    best : dict, optional
        Best hyper-parameters to print; omitted from the output when ``None``.

    Returns
    -------
    float
        The rounded log loss.
    """
    proba = clf.predict_proba(X=X)

    # Give log_loss the classifier's own class order so the probability
    # columns line up even when `y` does not contain every class; without it
    # the labels are inferred from `y` alone.
    loss = log_loss(y_true=y, y_pred=proba,
                    labels=getattr(clf, 'classes_', None))
    loss = np.round(a=loss, decimals=3)

    cm_pred = clf.predict(X=X)

    print(title)
    print("Logloss: {}".format(loss))
    if best is not None:
        print("Best parameters: {}".format(best))

    plot_confusion_matrix(y_true=y, y_pred=cm_pred, labels=labels)

    print(classification_report(y_true=y, y_pred=cm_pred))

    return loss

In [None]:
def tuner(clf, dist, X, y):
    """Run a randomized hyper-parameter search and return the best settings.

    Parameters
    ----------
    clf : estimator
        Unfitted estimator to tune.
    dist : dict
        Parameter names mapped to the candidate values to sample from.
    X, y : array-like
        Data the search is fitted on.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``RandomizedSearchCV``.
    """
    # Fixed random_state keeps the sampled candidates the same on every run.
    searcher = RandomizedSearchCV(clf, dist, n_jobs=-1, random_state=0)
    return searcher.fit(X, y).best_params_

In [None]:
def get_model_path(model_name):
    """Return the path of ``model_name`` inside ``./model_dumps``.

    The ``./model_dumps`` directory is created if it does not exist yet.

    Parameters
    ----------
    model_name : str
        File name of the model dump (e.g. ``'model.pkl'``).

    Returns
    -------
    str
        ``os.path.join('./model_dumps', model_name)``.
    """
    dump_dir = './model_dumps'
    # exist_ok=True replaces the isdir()/mkdir() pair and removes the
    # check-then-create race between the two calls.
    os.makedirs(dump_dir, exist_ok=True)
    return os.path.join(dump_dir, model_name)

In [None]:
def dummy_classifier(X_train,
                     y_train,
                     X_cv,
                     y_cv,
                     X_test,
                     y_test,
                     model_name,
                     labels=labels):
    """Fit (or load) a uniform-random baseline classifier and report on it.

    If a dump named ``model_name`` already exists in the model-dump
    directory it is loaded; otherwise a ``DummyClassifier`` with the
    ``'uniform'`` strategy is fitted on the training data and pickled.
    The model is then evaluated on the train, cross-validation and test
    splits with ``reporter``.

    Parameters
    ----------
    X_train, y_train, X_cv, y_cv, X_test, y_test : array-like
        Features and labels of the three splits.
    model_name : str
        File name of the pickle inside the model-dump directory.
    labels : sequence, optional
        Class labels forwarded to ``reporter``; defaults to the module-level
        ``labels`` as it was when this function was defined.

    Returns
    -------
    tuple
        ``(tr_loss, cv_loss, te_loss)`` as returned by ``reporter``.
    """
    model_path = get_model_path(model_name=model_name)

    if not os.path.isfile(path=model_path):
        # Seeded so the random predictions (and therefore the confusion
        # matrix and classification report) are reproducible across runs;
        # 42 matches the seed used by the other models in this notebook.
        clf = DummyClassifier(strategy='uniform', random_state=42)
        clf.fit(X=X_train, y=y_train)

        with open(file=model_path, mode='wb') as m_pkl:
            pickle.dump(obj=clf, file=m_pkl)
        print("Model saved into the disk.\n")
    else:
        # NOTE: unpickling can execute arbitrary code -- only load dumps that
        # were written by this notebook.
        with open(file=model_path, mode='rb') as m_pkl:
            clf = pickle.load(file=m_pkl)
        print("Loaded the saved model from the disk.\n")

    tr_loss = reporter(clf=clf, X=X_train, y=y_train,
                       title='Train', labels=labels)
    cv_loss = reporter(clf=clf, X=X_cv, y=y_cv,
                       title='Cross Validation', labels=labels)
    te_loss = reporter(clf=clf, X=X_test, y=y_test,
                       title='Test', labels=labels)

    return tr_loss, cv_loss, te_loss

In [None]:
model_name = 'model_dummy_classifier.pkl'

# Baseline: losses of the uniform-random classifier on the three splits.
dummy_tr_loss, dummy_cv_loss, dummy_te_loss = dummy_classifier(
    X_train=X_train, y_train=y_train,
    X_cv=X_cv, y_cv=y_cv,
    X_test=X_test, y_test=y_test,
    model_name=model_name,
)

In [None]:
def logistic_regresson(X_train,
                       y_train,
                       X_cv,
                       y_cv,
                       X_test,
                       y_test,
                       dist,
                       model_name,
                       labels=labels):
    """Tune, fit, calibrate and report a class-balanced logistic regression.

    If a dump named ``model_name`` already exists in the model-dump
    directory, ``(clf, sig_clf, best)`` is loaded from it. Otherwise the
    hyper-parameters are searched with ``tuner``, a model with the best
    ``C`` / ``penalty`` is fitted on the training data, wrapped in a
    ``CalibratedClassifierCV`` (also fitted on the training data) and the
    triple is pickled. The calibrated model is then evaluated on all three
    splits with ``reporter``.

    Parameters
    ----------
    X_train, y_train, X_cv, y_cv, X_test, y_test : array-like
        Features and labels of the three splits.
    dist : dict
        Search space for ``tuner``; must contain the keys ``'C'`` and
        ``'penalty'``.
    model_name : str
        File name of the pickle inside the model-dump directory.
    labels : sequence, optional
        Class labels forwarded to ``reporter``; defaults to the module-level
        ``labels`` as it was when this function was defined.

    Returns
    -------
    tuple
        ``(best, tr_loss, cv_loss, te_loss)``.
    """
    model_path = get_model_path(model_name=model_name)

    if not os.path.isfile(path=model_path):
        clf = LogisticRegression(n_jobs=-1, random_state=42, max_iter=1000,
                                 class_weight='balanced')

        best = tuner(clf=clf, dist=dist, X=X_train, y=y_train)

        clf = LogisticRegression(n_jobs=-1, max_iter=1000, C=best['C'],
                                 random_state=42, penalty=best['penalty'],
                                 class_weight='balanced')
        clf.fit(X=X_train, y=y_train)

        # The estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, and
        # the positional form works with both.
        sig_clf = CalibratedClassifierCV(clf)
        sig_clf.fit(X=X_train, y=y_train)

        with open(file=model_path, mode='wb') as m_pkl:
            pickle.dump(obj=(clf, sig_clf, best), file=m_pkl)
        print("Model saved into the disk.\n")
    else:
        # NOTE: unpickling can execute arbitrary code -- only load dumps that
        # were written by this notebook.
        with open(file=model_path, mode='rb') as m_pkl:
            clf, sig_clf, best = pickle.load(file=m_pkl)
        print("Loaded the saved model from the disk.\n")

    tr_loss = reporter(clf=sig_clf, X=X_train, y=y_train,
                       title='Train', best=best, labels=labels)
    cv_loss = reporter(clf=sig_clf, X=X_cv, y=y_cv,
                       title='Cross Validation', best=best, labels=labels)
    te_loss = reporter(clf=sig_clf, X=X_test, y=y_test,
                       title='Test', best=best, labels=labels)

    return best, tr_loss, cv_loss, te_loss

In [None]:
model_name = 'model_logistic_regression.pkl'

# Search space: C on a log grid from 1e-4 to 1e2, with both penalties.
# NOTE(review): no solver is set on the LogisticRegression being tuned, so
# the library default applies -- confirm that it supports 'l1'; otherwise
# those candidates cannot be fitted during the search.
dist = {'C': [10 ** exp for exp in range(-4, 3)],
        'penalty': ['l2', 'l1']}

logreg_best, logreg_tr_loss, logreg_cv_loss, logreg_te_loss = logistic_regresson(
    X_train=X_train, y_train=y_train,
    X_cv=X_cv, y_cv=y_cv,
    X_test=X_test, y_test=y_test,
    dist=dist,
    model_name=model_name,
)

In [None]:
def support_vector_classifier(X_train,
                              y_train,
                              X_cv,
                              y_cv,
                              X_test,
                              y_test,
                              dist,
                              model_name,
                              labels=labels):
    """Tune, fit, calibrate and report a class-balanced support vector classifier.

    If a dump named ``model_name`` already exists in the model-dump
    directory, ``(clf, sig_clf, best)`` is loaded from it. Otherwise ``C``
    is searched with ``tuner``, an ``SVC`` with the best ``C`` is fitted on
    the training data, wrapped in a ``CalibratedClassifierCV`` (also fitted
    on the training data) and the triple is pickled. The calibrated model is
    then evaluated on all three splits with ``reporter``.

    Parameters
    ----------
    X_train, y_train, X_cv, y_cv, X_test, y_test : array-like
        Features and labels of the three splits.
    dist : dict
        Search space for ``tuner``; must contain the key ``'C'``.
    model_name : str
        File name of the pickle inside the model-dump directory.
    labels : sequence, optional
        Class labels forwarded to ``reporter``; defaults to the module-level
        ``labels`` as it was when this function was defined.

    Returns
    -------
    tuple
        ``(best, tr_loss, cv_loss, te_loss)``.
    """
    model_path = get_model_path(model_name=model_name)

    if not os.path.isfile(path=model_path):
        clf = SVC(random_state=42, class_weight='balanced')

        best = tuner(clf=clf, dist=dist, X=X_train, y=y_train)

        clf = SVC(C=best['C'], random_state=42, class_weight='balanced')
        clf.fit(X=X_train, y=y_train)

        # The estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, and
        # the positional form works with both.
        sig_clf = CalibratedClassifierCV(clf)
        sig_clf.fit(X=X_train, y=y_train)

        with open(file=model_path, mode='wb') as m_pkl:
            pickle.dump(obj=(clf, sig_clf, best), file=m_pkl)
        print("Model saved into the disk.\n")
    else:
        # NOTE: unpickling can execute arbitrary code -- only load dumps that
        # were written by this notebook.
        with open(file=model_path, mode='rb') as m_pkl:
            clf, sig_clf, best = pickle.load(file=m_pkl)
        print("Loaded the saved model from the disk.\n")

    tr_loss = reporter(clf=sig_clf, X=X_train, y=y_train,
                       title='Train', best=best, labels=labels)
    cv_loss = reporter(clf=sig_clf, X=X_cv, y=y_cv,
                       title='Cross Validation', best=best, labels=labels)
    te_loss = reporter(clf=sig_clf, X=X_test, y=y_test,
                       title='Test', best=best, labels=labels)

    return best, tr_loss, cv_loss, te_loss

In [None]:
model_name = 'model_support_vector_classifier.pkl'

dist = dict(C=[10 ** x for x in range(-4, 3)])

# svc is broken, so skipping it.
# The disabled call is kept as comments rather than inside a bare string
# literal: a string expression at the end of a cell is echoed by Jupyter as
# the cell's output. Because the call is skipped, the svc_* names below are
# never defined.
#
# (svc_best,
#  svc_tr_loss,
#  svc_cv_loss,
#  svc_te_loss) = support_vector_classifier(X_train=X_train,
#                                           y_train=y_train,
#                                           X_cv=X_cv,
#                                           y_cv=y_cv,
#                                           X_test=X_test,
#                                           y_test=y_test,
#                                           dist=dist,
#                                           model_name=model_name)

In [None]:
def k_neighbors_classifier(X_train,
                           y_train,
                           X_cv,
                           y_cv,
                           X_test,
                           y_test,
                           dist,
                           model_name,
                           labels=labels):
    """Tune, fit, calibrate and report a k-nearest-neighbours classifier.

    If a dump named ``model_name`` already exists in the model-dump
    directory, ``(clf, sig_clf, best)`` is loaded from it. Otherwise
    ``n_neighbors`` is searched with ``tuner``, a model with the best value
    is fitted on the training data, wrapped in a ``CalibratedClassifierCV``
    (also fitted on the training data) and the triple is pickled. The
    calibrated model is then evaluated on all three splits with ``reporter``.

    Parameters
    ----------
    X_train, y_train, X_cv, y_cv, X_test, y_test : array-like
        Features and labels of the three splits.
    dist : dict
        Search space for ``tuner``; must contain the key ``'n_neighbors'``.
    model_name : str
        File name of the pickle inside the model-dump directory.
    labels : sequence, optional
        Class labels forwarded to ``reporter``; defaults to the module-level
        ``labels`` as it was when this function was defined.

    Returns
    -------
    tuple
        ``(best, tr_loss, cv_loss, te_loss)``.
    """
    model_path = get_model_path(model_name=model_name)

    if not os.path.isfile(path=model_path):
        clf = KNeighborsClassifier(n_jobs=-1)

        best = tuner(clf=clf, dist=dist, X=X_train, y=y_train)

        clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=best['n_neighbors'])
        clf.fit(X=X_train, y=y_train)

        # The estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, and
        # the positional form works with both.
        sig_clf = CalibratedClassifierCV(clf)
        sig_clf.fit(X=X_train, y=y_train)

        with open(file=model_path, mode='wb') as m_pkl:
            pickle.dump(obj=(clf, sig_clf, best), file=m_pkl)
        print("Model saved into the disk.\n")
    else:
        # NOTE: unpickling can execute arbitrary code -- only load dumps that
        # were written by this notebook.
        with open(file=model_path, mode='rb') as m_pkl:
            clf, sig_clf, best = pickle.load(file=m_pkl)
        print("Loaded the saved model from the disk.\n")

    tr_loss = reporter(clf=sig_clf, X=X_train, y=y_train,
                       title='Train', best=best, labels=labels)
    cv_loss = reporter(clf=sig_clf, X=X_cv, y=y_cv,
                       title='Cross Validation', best=best, labels=labels)
    te_loss = reporter(clf=sig_clf, X=X_test, y=y_test,
                       title='Test', best=best, labels=labels)

    return best, tr_loss, cv_loss, te_loss

In [None]:

model_name = 'model_k_neighbors_classifier.pkl'

# Candidate neighbourhood sizes for the randomized search.
dist = {'n_neighbors': [3, 5, 11, 15, 21, 31, 41, 51, 99]}

knn_best, knn_tr_loss, knn_cv_loss, knn_te_loss = k_neighbors_classifier(
    X_train=X_train, y_train=y_train,
    X_cv=X_cv, y_cv=y_cv,
    X_test=X_test, y_test=y_test,
    dist=dist,
    model_name=model_name,
)

In [None]:
def feature_importance_plot(data, x, y, title):
    """Draw a seaborn bar plot and write each bar's height above it.

    Parameters
    ----------
    data : DataFrame
        Table holding the columns to plot.
    x, y : str
        Column names for the x and y axes of the bar plot.
    title : str
        Plot title.
    """
    axis = sns.barplot(data=data, x=x, y=y)
    for patch in axis.patches:
        centre = patch.get_x() + (patch.get_width() / 2)
        height = np.round(patch.get_height(), 3)
        # Label sits 6 points above the (rounded) top of the bar.
        axis.annotate(text="{}".format(height), xy=(centre, height),
                      xytext=(0, 6), textcoords='offset points',
                      ha='center', va='center', size=8)
    plt.title(title)

In [None]:
# TODO: remaining feature-importance work.
# Columns kept for the feature-importance (fi_*) datasets.
fi_cols = ['redshift', 'g-r', 'i-z', 'u-r', 'i-r', 'z-r', 'g']

In [None]:
# Take explicit copies of the selected columns. These frames get a new column
# assigned later on; doing that on a bare selection of another frame triggers
# pandas' SettingWithCopyWarning.
fi_tr_data = tr_f_df[fi_cols].copy()
fi_cv_data = cv_f_df[fi_cols].copy()
fi_te_data = te_f_df[fi_cols].copy()

In [None]:

def export_data(data, target_arr, filename):
    """Write ``data`` plus a ``class`` column to ``./data/fi_data/<filename>``.

    The output directory (including ``./data``) is created when missing.
    The caller's frame is left unchanged: the ``class`` column is added to a
    new frame that is written to CSV without the index.

    Parameters
    ----------
    data : DataFrame
        Feature columns to export.
    target_arr : array-like
        Target values stored in the ``class`` column; must match the number
        of rows in ``data``.
    filename : str
        Name of the CSV file inside ``./data/fi_data``.
    """
    out_dir = './data/fi_data'
    # makedirs also creates ./data if needed and tolerates an existing directory.
    os.makedirs(out_dir, exist_ok=True)

    # assign() returns a new frame, so the caller's frame is not mutated
    # ('class' is a Python keyword, hence the dict unpacking).
    exported = data.assign(**{'class': target_arr})
    exported.to_csv(os.path.join(out_dir, filename), index=False)
    print("The data is exported to '{}'.".format(filename))

In [None]:
# Export one CSV per split: (features, targets, output file name).
export_data(fi_tr_data, y_train, 'fi_tr_data.csv')
export_data(fi_cv_data, y_cv, 'fi_cv_data.csv')
export_data(fi_te_data, y_test, 'fi_te_data.csv')

In [None]:
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold accuracy scores.

    Uses repeated stratified k-fold (10 splits, 3 repeats, fixed seed) and
    re-raises any error that occurs while fitting or scoring a fold.

    Parameters
    ----------
    model : estimator
        Estimator to evaluate.
    X, y : array-like
        Features and labels to cross-validate on.

    Returns
    -------
    ndarray
        One accuracy score per fold, as returned by ``cross_val_score``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=folds,
                           n_jobs=-1, error_score='raise')

In [None]:
def stacking_classifier(X_train,
                        y_train,
                        X_cv,
                        y_cv,
                        X_test,
                        y_test,
                        models,
                        model_name,
                        labels=labels):
    """Fit (or load) a stacking ensemble of ``models`` and report on it.

    A dump named ``model_name`` in the model-dump directory is reused when
    present; otherwise a ``StackingClassifier`` over ``models`` is fitted on
    the training data and pickled. The model is then evaluated on the train,
    cross-validation and test splits with ``reporter``.

    Parameters
    ----------
    X_train, y_train, X_cv, y_cv, X_test, y_test : array-like
        Features and labels of the three splits.
    models : list of (str, estimator)
        Base estimators passed to ``StackingClassifier(estimators=...)``.
    model_name : str
        File name of the pickle inside the model-dump directory.
    labels : sequence, optional
        Class labels forwarded to ``reporter``.

    Returns
    -------
    tuple
        ``(tr_loss, cv_loss, te_loss)``.
    """
    model_path = get_model_path(model_name=model_name)

    if os.path.isfile(model_path):
        # Reuse the previously trained ensemble.
        with open(model_path, 'rb') as m_pkl:
            clf = pickle.load(m_pkl)
        print("Loaded the saved model from the disk.\n")
    else:
        clf = StackingClassifier(estimators=models)
        clf.fit(X=X_train, y=y_train)

        with open(model_path, 'wb') as m_pkl:
            pickle.dump(clf, m_pkl)
        print("Model saved into the disk.\n")

    # Evaluate the splits in the fixed order train -> CV -> test.
    splits = (('Train', X_train, y_train),
              ('Cross Validation', X_cv, y_cv),
              ('Test', X_test, y_test))
    tr_loss, cv_loss, te_loss = [
        reporter(clf=clf, X=X_split, y=y_split, title=name, labels=labels)
        for name, X_split, y_split in splits
    ]

    return tr_loss, cv_loss, te_loss

In [None]:
model_name = 'model_stacking_classifier.pkl'

# Base learners are rebuilt, unfitted, from the best hyper-parameters found
# by the earlier searches.
LR = LogisticRegression(C=logreg_best['C'],
                        penalty=logreg_best['penalty'],
                        class_weight='balanced',
                        max_iter=1000,
                        random_state=42,
                        n_jobs=-1)

KNN = KNeighborsClassifier(n_jobs=-1, n_neighbors=knn_best['n_neighbors'])

models = [('LR', LR), ('KNN', KNN)]

stack_tr_loss, stack_cv_loss, stack_te_loss = stacking_classifier(
    X_train=X_train, y_train=y_train,
    X_cv=X_cv, y_cv=y_cv,
    X_test=X_test, y_test=y_test,
    models=models,
    model_name=model_name,
)

In [None]:
# Collect the log losses of every tuned model, one list per split.
model_names = ['Logistic Regression',
               'K-Nearest Neighbors', 'Stacking Classifier']

tr_losses = [logreg_tr_loss, knn_tr_loss, stack_tr_loss]
cv_losses = [logreg_cv_loss, knn_cv_loss, stack_cv_loss]
te_losses = [logreg_te_loss, knn_te_loss, stack_te_loss]

summary_df = pd.DataFrame({
    'Models': model_names,
    'Train Loss': tr_losses,
    'CV Loss': cv_losses,
    'Test Loss': te_losses,
})

# Plain-text table of the results.
summary = tabulate(tabular_data=summary_df, headers='keys',
                   tablefmt='psql')
print(summary)

# Long format (one row per model/split pair) for the grouped bar plot;
# the melted column names are title-cased to 'Variable' and 'Value'.
tidy = summary_df.melt(id_vars='Models').rename(columns=str.title)

plt.figure(figsize=(8, 4))
sns.barplot(data=tidy, x='Models', y='Value', hue='Variable', alpha=0.9)
plt.title('Logloss Obtained')
plt.xticks(rotation=90)
plt.show()