In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    roc_auc_score,
    classification_report
)
from scipy.sparse import issparse
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
# indir = '/content/gdrive/MyDrive/Colab Notebooks/Data/'
# X_train = indir + 'X_train.csv'
# X_test = indir + 'X_test.csv'
# y_train = indir + 'y_train.csv'
# y_test = indir + 'y_test.csv'

# # read the data sets
# X_train = pd.read_csv(X_train)
# X_test = pd.read_csv(X_test)
# y_train = pd.read_csv(y_train)
# y_test = pd.read_csv(y_test)

In [20]:
indir = '../../Data/GoogleDrive/'
X_train = indir + 'X_train.csv'
X_test = indir + 'X_test.csv'
y_train = indir + 'y_train.csv'
y_test = indir + 'y_test.csv'

# read the data sets
X_train = pd.read_csv(X_train)
X_test = pd.read_csv(X_test)
y_train = pd.read_csv(y_train)
y_test = pd.read_csv(y_test)

In [21]:
# drop the first column
X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]
y_train = y_train.iloc[:, 1:]
y_test = y_test.iloc[:, 1:]

### Logistic Regression Classifier for testing

In [14]:
# prepare data for logistic regression
def logi_reg(df, target, target_names, thresh_n):
    """You know what it does

        Input:
            df: pandas dataframe
            target: target column name
            target_names: the names of the target classes eg."Yes" and "No"
            thresh_n: threshold used on this dataset

        Output:
            None
    """

    # one hot encoding on categorical features
    features = df.drop(columns = [target])
    df = pd.get_dummies(df, columns=features.select_dtypes(include=['object']).columns)

    # data split
    df_train, df_test = train_test_split(df,
                                        test_size = 0.25,
                                        random_state = 69,
                                        stratify = df[target])

    X_train = df_train.drop(columns = [target])
    y_train = df_train[target]
    X_test = df_test.drop(columns = [target])
    y_test = df_test[target]

    # print the size of the training and test set
    print (f"The threshold for removing NAs is {thresh_n}")
    print (f"Size of training set : {df_train.shape[0]} rows , {df_train.shape[1]} columns")
    print (f"Size of testing set : {df_test.shape[0]} rows , {df_test.shape[1]} columns")

    # fit logistic regression model with elastic net regularization
    log_reg = LogisticRegression(penalty = 'elasticnet',
                                solver = 'saga',
                                l1_ratio = 0.5,
                                max_iter = 1000)

    log_reg.fit(X_train, y_train)

    # predict on test set
    y_test_pred = log_reg.predict(X_test)
    y_pred_proba = log_reg.predict_proba(X_test)[:,1]

    # Evaluate performance
    # print('\n===============================\n'+
    #   'Classification report on test data' +
    #   '\n===============================\n')
    # print(classification_report(y_test, y_test_pred, target_names=target_names))

    print('\n===============================\n'+
        'Confusion matrix on test data' +
        '\n===============================\n')
    print(confusion_matrix(y_test, y_test_pred))

    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("Precision:", precision_score(y_test, y_test_pred, pos_label='Yes'))
    print("Recall:", recall_score(y_test, y_test_pred, pos_label='Yes'))
    print("F1 Score:", f1_score(y_test, y_test_pred, pos_label='Yes'))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))



In [22]:
def model_eval(model, testing_features, testing_labels):

    '''prints evaluation scores for machine learning models, including
       micro averaged f1 score, accuracy, precision, and recall

    Input arguments:
        model: machine learning model
        testing_features: features in the test set (array)
        testing_labels: labels in the test set

    Output:
        f1: micro averaged f1 score (float)

    '''

    pred = model.predict(testing_features)
    roc_score = roc_auc_score(testing_labels, pred)
    acc = accuracy_score(testing_labels, pred)
    report = classification_report(testing_labels, pred,output_dict = True)
    precision = report['0']['precision']
    recall = report['0']['recall']
    f1 = report['0']['f1-score']

    print('Model Performance')
    print('F1_score: ' + str(f1))
    print('Accuracy = '+ str(acc))
    print('ROC: ' + str(roc_score))
    print('Precision: ' + str(precision))
    print('Recall: ' + str(recall))

    return f1

### Logistic Regression

In [23]:
roc_auc_scorer = make_scorer(roc_auc_score,
                             needs_threshold = True,
                             multi_class = 'ovo')

# define a tuning grid for logistic regression
logi_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
    'clf__class_weight': [None, 'balanced'],
}


# define a logistic regression model
log_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter = 1000))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

# define a grid search with cross-validation
log_grid_search = GridSearchCV(estimator = log_pipe,
                               param_grid = logi_grid,
                               cv = cv,
                               scoring = roc_auc_scorer,
                               n_jobs = -1,
                               verbose = 2)

# fit the grid search
log_grid_search.fit(X_train, y_train)

# print the best parameters
print("Best parameters:", log_grid_search.best_params_)
print("Best cross-validation ROC AUC score: {:.2f}".format(log_grid_search.best_score_))



Fitting 5 folds for each of 48 candidates, totalling 240 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  11.9s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  15.9s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  15.9s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  16.3s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  16.3s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  16.3s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  16.5s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  16.6s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  17.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  19.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  10.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  11.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  17.5s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.6s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  23.5s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  25.4s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  22.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  29.9s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  37.0s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  37.9s
[CV] END clf__C=0.001, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  38.1s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  19.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  15.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  17.5s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  17.6s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  18.5s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  12.0s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  10.8s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  16.0s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  11.7s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  11.7s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  12.3s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  11.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  13.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  15.7s
[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  15.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  13.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  16.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  22.5s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  33.2s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  25.9s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  33.3s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  24.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  34.5s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  33.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  35.1s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  22.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  20.6s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  28.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  18.8s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  21.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.0s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  24.4s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.1s
[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  33.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  34.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  41.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  48.6s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  25.4s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  24.3s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  32.5s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  28.0s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  36.5s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  26.4s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  33.9s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  32.1s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 1.3min
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 1.3min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 1.4min
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  39.7s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  43.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  49.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  48.8s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=  48.7s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  25.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  18.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  20.9s
[CV] END clf__C=0.01, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  21.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  25.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  34.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  34.0s
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  36.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  41.3s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.9min
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.7min
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.7min
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.6min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 3.6min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  20.1s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  11.8s
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  15.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  16.2s
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  15.8s
[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 3.8min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 4.5min
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  45.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  49.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 4.7min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 4.7min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  42.7s
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  42.6s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  39.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.7min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.2min
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.8min
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.2min
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.2min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  16.6s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  10.7s
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  11.5s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  12.4s
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  12.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 4.5min
[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 4.5min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 7.0min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.9min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 8.0min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 3.8min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=0.1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time= 9.1min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.8min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.2min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.7min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  27.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.6min
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  35.9s
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 2.1min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 1.8min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time= 1.4min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  21.1s
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  19.5s
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  22.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=14.9min
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=14.9min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 3.5min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 4.2min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 3.2min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=22.1min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 3.3min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 2.2min
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 2.0min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 3.9min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 2.4min
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 2.4min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time= 1.4min
[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=26.9min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=18.0min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  21.8s
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  22.6s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  21.0s
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  24.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  30.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  56.0s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 1.1min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 1.0min
[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time= 1.1min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  47.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  10.5s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=1, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=28.5min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  11.2s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  14.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  11.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  14.1s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  12.9s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  11.6s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  10.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  12.0s
[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=   8.9s
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=20.4min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  21.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  24.5s
[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  26.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  25.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  23.4s
[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  24.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.4min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  13.8s
[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  11.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  13.5s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   6.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   6.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.4min
[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  14.6s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   5.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   7.7s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   7.5s
[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 2.4min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  11.0s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  11.7s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  10.3s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  14.2s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  10.1s
[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  15.5s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  38.5s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  30.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  32.2s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  38.0s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear; total time=  32.5s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time= 3.2min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=   6.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  10.5s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=   7.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  10.6s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  10.5s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l1, clf__solver=saga; total time=  10.6s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=   7.4s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=10, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time= 1.4min


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=   9.8s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=liblinear; total time=  10.3s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  10.4s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  11.4s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  11.9s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  10.6s
[CV] END clf__C=100, clf__class_weight=None, clf__penalty=l2, clf__solver=saga; total time=  10.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  45.5s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  41.9s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  45.3s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  43.6s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=  42.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  40.8s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  41.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   6.8s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   7.1s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   6.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   6.8s


  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  12.2s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  12.3s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=liblinear; total time=   5.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l1, clf__solver=saga; total time=  14.1s
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=23.7min
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=   9.7s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=  10.0s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=   8.9s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=   7.5s
[CV] END clf__C=100, clf__class_weight=balanced, clf__penalty=l2, clf__solver=saga; total time=   7.8s
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=24.5min
[CV] END clf__C=1, clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear; total time=25.2min


  y = column_or_1d(y, warn=True)


Best parameters: {'clf__C': 0.001, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
Best cross-validation ROC AUC score: 0.84


Best parameters: {'clf__C': 0.001, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}  
Best cross-validation ROC AUC score: 0.84

In [None]:
# fit the training data with the best logistic regression model
log_final = LogisticRegression()
log_final.fit(X_train, y_train)

In [None]:
y_pred_test = log_final.predict(X_test)
y_pred_train = log_final.predict(X_train)

# plotting the ROC curve and printing the AUC score for the Random Forest Model
print('=========== Random Forest AUC score ==========')
print(metrics.roc_auc_score(y_test,y_pred_test))
print('===============================================\n')
log_y_score = log_final.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test,log_y_score)
plt.plot(fpr, tpr)
plt.plot([0,1],[0,1])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title(' Logistic Regression ROC curve')


In [None]:
# Getting coefficients
logi_coef = log_final.coef_
avg_coef = np.mean(logi_coef, axis=0)

# Feature names for plotting
feature_names = data['feature_names']

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': ave_coef})
coef_df = coeff_df.sort_values(by = 'Coefficient', ascending = False)

plt.figure(figsize=(8, 6))
plt.barh(coeff_df['Feature'], coeff_df['Coefficient'], color='skyblue')
plt.xlabel('Coefficient Value')
plt.title('Logistic Regression Coefficients')
plt.grid(True)
plt.show()


### K-nearest Neighbors

In [None]:
# knn_train_score = {}
# knn_test_score = {}
# n_neighbors = np.arange(2, 15, 1)
# for neighbor in n_neighbors:
#     knn = KNeighborsClassifier(n_neighbors=neighbor)
#     knn.fit(X_train, y_train)
#     knn_train_score[neighbor]=knn.score(X_train, y_train)
#     knn_test_score[neighbor]=knn.score(X_test, y_test)

In [None]:
# plot the training and test scores

# Convert scores into a DataFrame
# scores_df = pd.DataFrame(
#     {'Neighbors': n_neighbors,
#      'Train Accuracy': [knn_train_score[n] for n in n_neighbors],
#      'Test Accuracy': [knn_test_score[n] for n in n_neighbors]}
#      )

# # Melt the DataFrame to make it suitable for Seaborn's lineplot
# knn_scores_df = pd.melt(scores_df, id_vars=['Neighbors'],
#                          var_name='Type', value_name='Accuracy')

# # Create the plot using Seaborn
# plt.figure(figsize = (8, 5))
# sns.lineplot(data = knn_scores_df, x = 'Neighbors',
#              y = 'Accuracy', hue = 'Type', marker='o')
# plt.title('KNN: Varying Number of Neighbors')
# plt.xlabel('Number of Neighbors')
# plt.ylabel('Accuracy')
# plt.xlim(1, 31)
# plt.ylim(0.60, 0.90)
# plt.grid(True)
# plt.legend(title='Data Type')
# plt.show()

In [26]:
knn_param_grid = {
    'knn__n_neighbors': np.arange(1, 25),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

knn_grid_search = GridSearchCV(knn_pipe,
                               knn_param_grid,
                               cv = cv,
                               verbose = 1,
                               scoring = roc_auc_scorer,
                               n_jobs = -1)

knn_grid_search.fit(X_train, y_train)

print("Best parameters:", knn_grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(knn_grid_search.best_score_))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


KeyboardInterrupt: 

In [None]:
best_knn = knn_grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print("Test set accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))

In [None]:
# plot results
cv_results = pd.DataFrame(knn_grid_search.cv_results_)
uniform_results = cv_results[cv_results['param_knn__weights'] == 'uniform']
plt.figure(figsize=(10, 7))
plt.plot(uniform_results['param_knn__n_neighbors'], uniform_results['mean_train_score'], label='Train ROC AUC', marker='o')
plt.plot(uniform_results['param_knn__n_neighbors'], uniform_results['mean_test_score'], label='Validation ROC AUC', marker='o')
plt.title('KNN Performance Evaluation with Varying Neighbors')
plt.xlabel('Number of Neighbors')
plt.ylabel('ROC AUC')
plt.legend()
plt.grid(True)
plt.show()