In [23]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    roc_auc_score
)

### Logistic Regression Classifier for testing

In [32]:
# prepare the data, fit a logistic regression and print its test-set metrics
def logi_reg(df, target, target_names, thresh_n):
    """Fit an elastic-net logistic regression on `df` and print test-set metrics.

    Object-dtype feature columns are one-hot encoded, the rows are split
    75/25 into train and test sets (stratified on the target, fixed seed),
    a LogisticRegression (elastic net penalty, saga solver) is fitted on the
    training part, and the confusion matrix, accuracy, precision, recall,
    F1 score and ROC AUC on the test part are printed.

        Input:
            df: pandas dataframe holding the features and the target column;
                it is not modified
            target: target column name; the column must hold exactly two classes
            target_names: the names of the target classes eg."Yes" and "No".
                Not used by the current implementation; kept so existing
                calls keep working
            thresh_n: threshold used on this dataset; it is only echoed in the
                printed header, no filtering happens in this function

        Output:
            None (all results are printed)

        Raises:
            ValueError: if the target does not have exactly two classes
    """
    seed = 69  # shared by the split and the solver so reruns print the same numbers

    # one hot encoding on categorical features (the target column is left as is)
    categorical_cols = (
        df.drop(columns=[target]).select_dtypes(include=['object']).columns
    )
    encoded = pd.get_dummies(df, columns=categorical_cols)

    # data split
    df_train, df_test = train_test_split(encoded,
                                         test_size=0.25,
                                         random_state=seed,
                                         stratify=encoded[target])

    X_train = df_train.drop(columns=[target])
    y_train = df_train[target]
    X_test = df_test.drop(columns=[target])
    y_test = df_test[target]

    # print the size of the training and test set
    print (f"The threshold for removing NAs is {thresh_n}")
    print (f"Size of training set : {df_train.shape[0]} rows , {df_train.shape[1]} columns")
    print (f"Size of testing set : {df_test.shape[0]} rows , {df_test.shape[1]} columns")

    # fit logistic regression model with elastic net regularization;
    # saga shuffles the data, so it needs its own seed to be reproducible
    log_reg = LogisticRegression(penalty='elasticnet',
                                 solver='saga',
                                 l1_ratio=0.5,
                                 max_iter=1000,
                                 random_state=seed)
    log_reg.fit(X_train, y_train)

    if len(log_reg.classes_) != 2:
        raise ValueError(
            f"logi_reg expects a binary target, but '{target}' has "
            f"{len(log_reg.classes_)} classes"
        )

    # classes_ is sorted, so classes_[1] is the class whose probability sits in
    # column 1 of predict_proba; using it as the positive label keeps
    # precision/recall/F1 consistent with the ROC AUC input and resolves to
    # 'Yes' for a "No"/"Yes" target.
    positive_class = log_reg.classes_[1]

    # predict on test set
    y_test_pred = log_reg.predict(X_test)
    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    # Evaluate performance
    print('\n===============================\n'+
        'Confusion matrix on test data' +
        '\n===============================\n')
    print(confusion_matrix(y_test, y_test_pred))

    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("Precision:", precision_score(y_test, y_test_pred, pos_label=positive_class))
    print("Recall:", recall_score(y_test, y_test_pred, pos_label=positive_class))
    print("F1 Score:", f1_score(y_test, y_test_pred, pos_label=positive_class))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))



### Logistic Regression 

### K-nearest Neighbors