In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import issparse
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    get_scorer,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [14]:
# Load the train/test feature and label CSV files. The paths are read in the
# order listed: train features, test features, train labels, test labels.
X_train, X_test, y_train, y_test = map(pd.read_csv, (
    '../../data/X_train.csv',
    '../../data/X_test.csv',
    '../../data/y_train.csv',
    '../../data/y_test.csv',
))

### Logistic Regression Classifier for testing

In [30]:
# prepare data for logistic regression
def logi_reg(df, target, target_names, thresh_n):
    """Fit an elastic-net logistic regression on ``df`` and print test metrics.

        Object-dtype feature columns are one-hot encoded, the frame is split
        75/25 (stratified on ``target``), the model is fitted on the training
        part, and the confusion matrix, accuracy, precision, recall, F1 and
        ROC AUC on the held-out part are printed.

        Input:
            df: pandas dataframe holding the features and the target column
            target: target column name
            target_names: the names of the target classes eg."Yes" and "No"
                (currently unused; kept so existing calls keep working)
            thresh_n: threshold used on this dataset; it is only echoed in
                the printed summary and does not influence the fit

        Output:
            None
    """
    # single seed shared by the split and the (stochastic) saga solver so that
    # repeated calls on the same data print the same numbers
    seed = 69

    # one hot encoding on categorical (object dtype) features only; the target
    # column is excluded from the selection so its labels are left untouched
    features = df.drop(columns = [target])
    categorical_cols = features.select_dtypes(include=['object']).columns
    df = pd.get_dummies(df, columns=categorical_cols)

    # data split
    df_train, df_test = train_test_split(df,
                                        test_size = 0.25,
                                        random_state = seed,
                                        stratify = df[target])

    X_train = df_train.drop(columns = [target])
    y_train = df_train[target]
    X_test = df_test.drop(columns = [target])
    y_test = df_test[target]

    # print the size of the training and test set
    print (f"The threshold for removing NAs is {thresh_n}")
    print (f"Size of training set : {df_train.shape[0]} rows , {df_train.shape[1]} columns")
    print (f"Size of testing set : {df_test.shape[0]} rows , {df_test.shape[1]} columns")

    # fit logistic regression model with elastic net regularization
    # NOTE(review): features are not scaled here; saga may need many
    # iterations to converge on unscaled data.
    log_reg = LogisticRegression(penalty = 'elasticnet',
                                solver = 'saga',
                                l1_ratio = 0.5,
                                max_iter = 1000,
                                random_state = seed)

    log_reg.fit(X_train, y_train)

    # predict on test set
    y_test_pred = log_reg.predict(X_test)
    y_pred_proba = log_reg.predict_proba(X_test)[:,1]

    # The probability column used above belongs to classes_[1], so the same
    # label is used as the positive class for precision / recall / F1. For a
    # "No"/"Yes" target this is "Yes", exactly as before, but other binary
    # label sets no longer raise on a hard-coded pos_label.
    positive_label = log_reg.classes_[1]

    # Evaluate performance
    print('\n===============================\n'+
        'Confusion matrix on test data' +
        '\n===============================\n')
    print(confusion_matrix(y_test, y_test_pred))

    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("Precision:", precision_score(y_test, y_test_pred, pos_label=positive_label))
    print("Recall:", recall_score(y_test, y_test_pred, pos_label=positive_label))
    print("F1 Score:", f1_score(y_test, y_test_pred, pos_label=positive_label))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))



In [1]:
def model_eval(model, testing_features, testing_labels):

    '''Print evaluation scores for a fitted classifier and return its F1 score.

    Accuracy and ROC AUC are computed over the whole test set; precision,
    recall and F1 are the per-class values that classification_report lists
    under the label '0'.

    Input arguments:
        model: fitted machine learning model exposing predict and, optionally,
               predict_proba or decision_function
        testing_features: features in the test set (array)
        testing_labels: labels in the test set

    Output:
        f1: F1 score reported for class '0' (float)

    '''

    pred = model.predict(testing_features)

    # ROC AUC is a ranking metric: feed it continuous scores when the model
    # can provide them. Hard class predictions collapse the curve to a single
    # operating point, so they are only used as a fallback.
    scores = pred
    if hasattr(model, 'predict_proba'):
        proba = np.asarray(model.predict_proba(testing_features))
        if proba.ndim == 2 and proba.shape[1] == 2:
            # second column = probability of the greater label, which
            # roc_auc_score treats as the positive class
            scores = proba[:, 1]
    elif hasattr(model, 'decision_function'):
        decision = np.asarray(model.decision_function(testing_features))
        if decision.ndim == 1:
            scores = decision

    roc_score = roc_auc_score(testing_labels, scores)
    acc = accuracy_score(testing_labels, pred)
    report = classification_report(testing_labels, pred,output_dict = True)
    # classification_report keys its per-class entries by str(label)
    precision = report['0']['precision']
    recall = report['0']['recall']
    f1 = report['0']['f1-score']

    print('Model Performance')
    print('F1_score: ' + str(f1))
    print('Accuracy = '+ str(acc))
    print('ROC: ' + str(roc_score))
    print('Precision: ' + str(precision))
    print('Recall: ' + str(recall))

    return f1

### Logistic Regression 

In [38]:
# ROC AUC scorer based on predicted class probabilities (one-vs-one averaging
# for multiclass targets). The built-in scorer is used instead of
# make_scorer(..., needs_threshold=True): that argument is deprecated in
# recent scikit-learn releases, and it prefers decision_function output,
# which roc_auc_score rejects for multiclass targets.
roc_auc_scorer = get_scorer('roc_auc_ovo')

# define a tuning grid for logistic regression
# (both solvers support both penalties, so every combination is valid)
logi_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
    'clf__class_weight': [None, 'balanced'],
}


# define a logistic regression model
# features are standardised inside the pipeline so the scaler is re-fitted on
# each training fold; the seed makes the fits repeatable
log_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter = 1000, random_state = 69))
])


# define a grid search with cross-validation
log_grid_search = GridSearchCV(estimator = log_pipe,
                               param_grid = logi_grid,
                               cv = 5,
                               scoring = roc_auc_scorer,
                               n_jobs = -1,
                               verbose = 2)

# fit the grid search
# y_train is loaded with pd.read_csv and is therefore a DataFrame; flatten it
# to the 1-D label vector scikit-learn expects
log_grid_search.fit(X_train, np.ravel(y_train))

# print the best parameters
print("Best parameters:", log_grid_search.best_params_)
print("Best cross-validation ROC AUC score: {:.2f}".format(log_grid_search.best_score_))


Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

### K-nearest Neighbors

In [40]:
# Fit one KNN classifier per candidate number of neighbours and record its
# accuracy on the training and the test split, keyed by that number.
knn_train_score = {}
knn_test_score = {}
n_neighbors = np.arange(2, 15, 1)

# The label files are read with pd.read_csv and are therefore DataFrames;
# flatten them once so each fit receives the 1-D label vector scikit-learn
# expects instead of a column vector.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

# NOTE(review): the test split is scored for every candidate here; if the
# number of neighbours is later chosen from these test scores, that choice
# should be validated on separate held-out data.
for neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train_1d)
    knn_train_score[neighbor] = knn.score(X_train, y_train_1d)
    knn_test_score[neighbor] = knn.score(X_test, y_test_1d)

In [None]:
# plot the training and test scores

# Convert scores into a DataFrame (one row per candidate number of neighbours)
scores_df = pd.DataFrame(
    {'Neighbors': n_neighbors,
     'Train Accuracy': [knn_train_score[n] for n in n_neighbors],
     'Test Accuracy': [knn_test_score[n] for n in n_neighbors]}
     )

# Melt the DataFrame to make it suitable for Seaborn's lineplot
knn_scores_df = pd.melt(scores_df, id_vars=['Neighbors'],
                         var_name='Type', value_name='Accuracy')

# Axis limits follow the data: the x range tracks the swept neighbour counts
# (previously fixed at 1..31, leaving most of the plot empty), and the y range
# keeps the 0.60-0.90 window but widens it when a score falls outside, so no
# point is silently clipped.
x_min = scores_df['Neighbors'].min() - 1
x_max = scores_df['Neighbors'].max() + 1
y_min = min(0.60, knn_scores_df['Accuracy'].min())
y_max = max(0.90, knn_scores_df['Accuracy'].max())

# Create the plot using Seaborn
fig, ax = plt.subplots(figsize = (8, 5))
sns.lineplot(data = knn_scores_df, x = 'Neighbors',
             y = 'Accuracy', hue = 'Type', marker='o', ax = ax)
ax.set_title('KNN: Varying Number of Neighbors')
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Accuracy')
ax.set_xlim(x_min, x_max)
ax.set_xticks(scores_df['Neighbors'])
ax.set_ylim(y_min, y_max)
ax.grid(True)
ax.legend(title='Data Type')
plt.show()