In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
X_train = pd.read_csv('data/train/X_train.csv').values
X_test = pd.read_csv('data/test/X_test.csv').values
y_train = pd.read_csv('data/train/y_train.csv')
y_test = pd.read_csv('data/test/y_test.csv')

In [3]:
grid_search = {
    'rfc': {'classifier': RandomForestClassifier(),
    'params': {
        'bootstrap': [True, False],
        'max_depth': [1, 5, 10, 20, 30, None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [1, 5, 10, 20, 30]
    }}
}

In [27]:
# grid_search = {
#     'logistic_regression': {'classifier': LogisticRegression(), 'params': {'C': np.logspace(-4, 4, 20)}},
#     'rfc': {'classifier': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
#     'xgb_classifier': {'classifier': XGBClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
#     'knn': {'classifier': KNeighborsClassifier(), 'params': {'n_neighbors': list(range(1, 10))}},
# }

In [None]:
scores = []

for classifier_name, classifier_params in grid_search.items():
    gs = GridSearchCV(classifier_params['classifier'],
                      classifier_params['params'],
                      cv=5,
                      scoring=['f1_micro', 'roc_auc'],
                      refit='f1_micro')
    
    gs.fit(X_train, y_train.values.ravel())
    
    scores.append({
        'classifier': classifier_name,
         'best_score': gs.best_score_,
         'best_params': gs.best_params_,
         'best_estimator': gs.best_estimator_
    })

In [40]:
scores_df = pd.DataFrame(scores).sort_values(by='best_score', ascending=False)
scores_df

Unnamed: 0,classifier,best_score,best_params,best_estimator
0,rfc,0.968613,"{'max_depth': 30, 'n_estimators': 20}","(DecisionTreeClassifier(max_depth=30, max_feat..."


In [41]:
classifier = scores_df.loc[scores_df.best_score.idxmax()].best_estimator
# classifier = scores_df.iloc[1].best_estimator
classifier.fit(X_train, y_train.values.ravel())

In [42]:
report = classification_report(y_test, classifier.predict(X_test), output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
False,0.976295,0.966084,0.971163,6693.0
True,0.965684,0.976012,0.970821,6545.0
accuracy,0.970993,0.970993,0.970993,0.970993
macro avg,0.970989,0.971048,0.970992,13238.0
weighted avg,0.971049,0.970993,0.970994,13238.0


In [43]:
roc_auc_score(y_test, classifier.predict(X_test))

0.9710480956980811

In [53]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib

In [56]:
confusion_matrix(y_test, classifier.predict(X_test))

array([[6466,  227],
       [ 157, 6388]], dtype=int64)