# Intro

This notebook will handle everything related to modeling the data to fit into the Kaggle competition. For this, we have a simple workflow. We start by loading the training data and fit several models to it. Afterwards, we apply these trained models to the test data and evaluate their results, choosing the best one and submitting it to Kaggle. 

The criteria for evaluation will be the Area Under the Curve (AUC)

Let's get this started then. We start by handling all the imports necessary.

In [1]:
# First we will handle all the imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt   
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import metrics

We can now start loading the actual data that will be used in the models

# TODO: apply SMOTE to the train classes and test_train splitting. and drop all the string things that aren't necessary

In [40]:
train_data = pd.read_csv("../data/train.csv", sep=",")
test_data = pd.read_csv("../data/test.csv", sep=",")

# Separate into features and labels
unwanted = ["status", "loan_id", "frequency", "loan_date"]
features = [x for x in list(train_data) if x not in unwanted]
target = ["status"]

# Order the entries in the dataframe by time,
# so the Time Splitter works properly
train_data.sort_values(by="loan_date", inplace=True)
test_data.sort_values(by="loan_date", inplace=True)

X = train_data[features]
Y = train_data[target]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# Apply SMOTE so the training classes are balanced
smote = SMOTE(random_state=0)
x_train, y_train = smote.fit_resample(x_train, y_train)

np.bincount(y_train)

ValueError: could not convert string to float: 'M'

# Training a Model and finding the best parameters

With the data loaded, we can start trying to find the best model to predict the results. To find the best hyperparameters for each model, we will use Grid Searching, as randomized searches would not yield any better results, and evaluate the results using the AUC metric. As for splitting, we will use the `TimeSeriesSplit`, as it splits the data taking in account timed events.

In [10]:
SPLITTER = TimeSeriesSplit()

# Don't forget that here go the training X and y split values from test_train_split

def train_model_grid(model, params, cv=SPLITTER):
    grid_search = GridSearchCV(estimator=model,
                               param_grid=params,
                               n_jobs=-1, cv=cv,
                               scoring='roc_auc',
                               verbose=2)
    
    grid_search.fit(x_train, y_train)
    print(f"Best params for {model.__class__.__name__}: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_}")
    
    return grid_search.best_estimator_

# Evaluating a model to check for applicability

With the model trained, we can check the results on the testing data and try to find out which one performs the best out of all of them. We will once again use the Area Under the Curve (AUC) as our metric, and we will show a Confusion Matrix to detail exactly what happenned with the data, and decide if it is a good fit or not.

In [26]:
# Be sure to use the X_test and Y_test from the train_test_split when doing this

def evaluate(model):
    y_pred = model.predict_proba(X_test)
    y_pred = y_pred[:, -1]
    
    # Area Under the Curve, the higher the better
    auc = metrics.roc_auc_score(y_test, y_pred)
    print(f"AUC Score: {auc}")

    y_pred_normalized = np.argmax(model.predict_proba(X_test), axis=1)

    cm = metrics.confusion_matrix(y_test, y_pred_normalized)
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="Blues");

    # labels, title and ticks
    ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
    ax.xaxis.set_ticklabels(['Accepted', 'Not Accepted']); ax.yaxis.set_ticklabels(['Accepted', 'Not Accepted']);
    
    return y_pred

# Exporting the Data

With the model trained, we can now look to apply it to the competition data as well, and (ideally) obtain great results with it as well

In [12]:
# The test_df is no more than the dataframe built with the competition data, so yeah, that should be easy

def export_results(model, file):
    test = test_df[features]

    confidences = model.predict_proba(test)
    confidences = confidences[:,-1]

    confidences = [0 if x < 0.000001 else x for x in confidences]
    confidences = ["{:f}".format(x) for x in confidences]

    submition_data = pd.DataFrame()

    submition_data["Id"] = test_df["loan_id"]
    submition_data["Predicted"] = confidences
    submition_data.to_csv(f"results/{file}.csv", sep=",", index=False)
    
    print(f"Done exporting to: {file}.csv")

# Joining it all together

With this wrapper function, it becomes much easier to evaluate all the models

In [13]:
# The predicted results might be useful if we wish to compare the different models
def apply(model, params):
    best_model = train_model_grid(model, params)
    predicted_results = evaluate(best_model)
    export_results(model, model.__class__.__name__)
    
    return predicted_results

In [14]:
apply(DecisionTreeClassifier(), {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1,20),
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(1,6)
})

Fitting 5 folds for each of 1520 candidates, totalling 7600 fits
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=entropy, max_depth=15, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterio

[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9;

[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=entropy, max_depth=18, min_samples_leaf=4, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=18, min_samples_leaf=4, min_samples_split=2; total time=   0.0s
[CV] END criterion=entropy, max_depth=18, min_samples_leaf=4, min_samples_split=3;

[CV] END criterion=gini, max_depth=1, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=4, min_samples_split=6; total time=   0.0s
[CV] END criterion=g

[CV] END criterion=gini, max_depth=2, min_samples_leaf=3, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=3, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=2, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=3, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=g

[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=g

[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=6, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=g

[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=3, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=9, min_samples_leaf=4, min_samples_split=2; total time=   0.0s
[CV] END criterion=g

[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=6; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=12, min_samples_leaf=5, min_samples_split=7; total time=   0.0s
[CV] END c

[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=13, min_samples_leaf=5, min_samples_split=4; total time=   0.0s
[CV] END c

[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=3; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=4; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=16, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END c

[CV] END criterion=gini, max_depth=18, min_samples_leaf=2, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=2, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=2, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=2, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=3, min_samples_split=3; total time=   0.0s
[CV] END c

[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=7; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=gini, max_depth=18, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END c

[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=6; total tim

[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=4, min_samples_leaf=4, min_samples_split=9; total tim

[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=8, min_samples_leaf=3, min_samples_split=7; total tim

[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=6; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=7, min_samples_leaf=1, min_samples_split=7; total tim

[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, min_samples_leaf=4, min_samples_split=9;

[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=7; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=12, min_samples_leaf=4, min_samples_split=9;

[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=8; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9; total time=   0.0s
[CV] END criterion=entropy, max_depth=16, min_samples_leaf=1, min_samples_split=9;

ValueError: 
All the 7600 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7600 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 969, in fit
    super().fit(
  File "/usr/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 172, in fit
    X, y = self._validate_data(
  File "/usr/lib/python3.10/site-packages/sklearn/base.py", line 591, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/usr/lib/python3.10/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/krypt0/.local/lib/python3.10/site-packages/pandas/core/generic.py", line 2069, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'M'


# TODO: Start applying models, with different params, and evaluating the results. Submit the best one. Also, don't forget to upload the data kkkkk

# TODO(MAYBE): Add this ROC comparing thingy

In [None]:
import numpy as np
np.random.seed(2018)

from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib
import matplotlib.pyplot as plt

def roc_curve_and_score(y_test, pred_proba):
    fpr, tpr, _ = roc_curve(y_test.ravel(), pred_proba.ravel())
    roc_auc = roc_auc_score(y_test.ravel(), pred_proba.ravel())
    return fpr, tpr, roc_auc


plt.figure(figsize=(10, 8))
matplotlib.rcParams.update({'font.size': 14})
plt.grid()

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_rfc)
plt.plot(fpr, tpr, color='darkorange', lw=2, label='RFC({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_svm)
plt.plot(fpr, tpr, color='red', lw=2, label='SVM({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_xgb)
plt.plot(fpr, tpr, color='blue', lw=2, label='XGB({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_neural)
plt.plot(fpr, tpr, color='purple', lw=2, label='NEURAL({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_knn)
plt.plot(fpr, tpr, color='crimson', lw=2, label='KNN({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_dtc)
plt.plot(fpr, tpr, color='yellow', lw=2, label='DTC({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_gp)
plt.plot(fpr, tpr, color='green', lw=2, label='GP({0:.3f})'.format(roc_auc))

fpr, tpr, roc_auc = roc_curve_and_score(y_test, y_pred_mlp)
plt.plot(fpr, tpr, color='black', lw=2, label='MLP({0:.3f})'.format(roc_auc))

plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.legend(loc="lower right")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity')
plt.ylabel('Sensitivity')
plt.show()