In [31]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [23]:
X_train = pd.read_csv('data/train/X_train.csv').values
X_test = pd.read_csv('data/test/X_test.csv').values
y_train = pd.read_csv('data/train/y_train.csv')
y_test = pd.read_csv('data/test/y_test.csv')

In [30]:
grid_search = {
    'logistic_regression': {'classifier': LogisticRegression(), 'params': {'C': np.logspace(-4, 4, 20)}},
    'rfc': {'classifier': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
    'xgb_classifier': {'classifier': XGBClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
    'knn': {'classifier': KNeighborsClassifier(), 'params': {'n_neighbors': list(range(1, 31))}}
}

In [42]:
scores = []

for classifier_name, classifier_params in grid_search.items():
    gs = GridSearchCV(classifier_params['classifier'], classifier_params['params'], cv=5, scoring='f1_micro')
    gs.fit(X_train, y_train.values.ravel())
    scores.append(
        {'classifier': classifier_name,
         'best_score': gs.best_score_,
         'best_params': gs.best_params_,
         'best_estimator': gs.best_estimator_}
    )

In [43]:
scores_df = pd.DataFrame(scores).sort_values(by='best_score', ascending=False)
scores_df

Unnamed: 0,classifier,best_score,best_params,best_estimator
3,knn,0.961966,{'n_neighbors': 2},KNeighborsClassifier(n_neighbors=2)
2,xgb_classifier,0.944554,"{'max_depth': 10, 'n_estimators': 10}","XGBClassifier(base_score=None, booster=None, c..."
1,rfc,0.940531,"{'max_depth': 10, 'n_estimators': 10}","(DecisionTreeClassifier(max_depth=10, max_feat..."
0,logistic_regression,0.906349,{'C': 1.623776739188721},LogisticRegression(C=1.623776739188721)


In [44]:
classifier = scores_df.iloc[1].best_estimator
classifier.fit(X_train, y_train)

In [46]:
report = classification_report(y_test, classifier.predict(X_test), output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
False,0.950023,0.945764,0.947889,6693.0
True,0.944791,0.949121,0.946951,6545.0
accuracy,0.947424,0.947424,0.947424,0.947424
macro avg,0.947407,0.947443,0.94742,13238.0
weighted avg,0.947436,0.947424,0.947425,13238.0


In [45]:
roc_auc_score(y_test, classifier.predict(X_test))

0.947442849027472