In [1]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc
)
from sklearn.tree import export_graphviz
import xgboost as xgb
from xgboost import XGBClassifier as xgbclass
from scipy import stats
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score,
    classification_report
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

root_path = "../../Data/GoogleDrive/"

### Parallel computing:

In [2]:
# Number of logical CPU cores reported by the OS.
# os.cpu_count() may return None when the count cannot be determined,
# so fall back to a single core in that case.
num_cores = os.cpu_count() or 1

print("Number of CPU cores available:", num_cores)

# Leave two cores free for the rest of the system, but never drop below one
# worker: n_jobs=0 is rejected by joblib and negative values mean "count back
# from all cores", which is not what is intended here.
threads = max(1, num_cores - 2)

print("Number of threads set to:", threads)

Number of CPU cores available: 8
Number of threads set to: 6


### Load in Data

In [3]:
# Read the pre-split train/test features and targets from parquet,
# in the order: train features, test features, train target, test target.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(root_path + parquet_name)
    for parquet_name in (
        "X_train.parquet",
        "X_test.parquet",
        "y_train.parquet",
        "y_test.parquet",
    )
)

# Tag embedded in the saved-model file names below.
data_id = "full" # For determining if full or selected dataset.

Converting target to 1D array for sklearn Random Forest:

In [4]:
# Flatten the target to a 1D array as expected by the sklearn estimators.
# np.asarray accepts both the DataFrame loaded above and an already-flattened
# ndarray, so re-running this cell does not raise AttributeError.
y_train = np.asarray(y_train).ravel()

### backward selection for logistic regression
Uncomment to run backwards selection. Takes about 3 hours to run.

In [5]:
# from sklearn.feature_selection import SequentialFeatureSelector
# b_logi = LogisticRegression(max_iter = 300,
#                             solver = 'liblinear')
# back_selector = SequentialFeatureSelector(b_logi, direction = 'backward', 
#                                           scoring = 'f1')
# back_selector.fit(X_train, y_train)

# print(back_selector.get_feature_names_out())

In [6]:
# saving the selection model
# with open(root_path + 'back_selection.pkl', 'wb') as file:
#     pickle.dump(back_selector, file)

### To use the backward-selected features, uncomment the following code chunk
- it overwrites X_train and X_test with the selected columns, which are then used for all models

In [11]:
# Restore the fitted backward-selection object from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted run of this project.
with open(root_path + "back_selection.pkl", "rb") as file:
    back_selector = pickle.load(file)

# Switch the dataset tag so saved models are labelled as the selected-feature run
data_id = "select"

# Names of the features kept by the selector
selected_features = back_selector.get_feature_names_out()

# Restrict both the training and the testing frames to those columns
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]


## **Logistic Regression**

In [None]:
# ROC AUC scorer built on continuous decision scores; it is reused by the
# KNN grid search further down, so the name must stay defined here.
# NOTE(review): `needs_threshold` is deprecated in recent scikit-learn
# releases in favour of `response_method` -- confirm against the installed
# version before upgrading.
roc_auc_scorer = make_scorer(roc_auc_score,
                             needs_threshold = True,
                             multi_class = 'ovo')

# define a tuning grid for logistic regression
# (keys are prefixed with the pipeline step name 'clf')
logi_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
    'clf__class_weight': [None, 'balanced'],
}


# define a logistic regression model: standardise features, then classify.
# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded to make the search reproducible (same seed as the CV splitter).
log_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter = 1000, random_state = 69))
])

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

# define a grid search with cross-validation
log_grid_search = GridSearchCV(estimator = log_pipe,
                               param_grid = logi_grid,
                               cv = cv,
                               scoring = roc_auc_scorer,
                               n_jobs = threads,
                               verbose = 0)

# fit the grid search
log_grid_search.fit(X_train, y_train)

# print the best parameters
print("Best parameters:", log_grid_search.best_params_)
print("Best cross-validation ROC AUC score: {:.2f}".format(log_grid_search.best_score_))



Best parameters: {'clf__C': 0.01, 'clf__class_weight': 'balanced', 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best cross-validation ROC AUC score: 0.84


In [None]:
# fit the training data with the best logistic regression model
log_final = log_grid_search.best_estimator_

y_pred_test_logi = log_final.predict(X_test)
y_pred_prob_logi = log_final.predict_proba(X_test)[:, 1]

In [None]:
# Persist the whole fitted grid-search object, tagged with the dataset id
with open(root_path + f"logit_model_{data_id}.pkl", 'wb') as logit_pkl:
    pickle.dump(log_grid_search, logit_pkl)

## **KNN**

In [None]:
# Standardise the features before the distance-based classifier
knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Candidate settings for the 'knn' pipeline step:
# even neighbour counts from 20 to 40, two weighting schemes, two metrics
knn_param_grid = {
    'knn__n_neighbors': np.arange(20,41,2),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

# Exhaustive search scored with the ROC AUC scorer defined in the
# logistic-regression cell
knn_grid_search = GridSearchCV(estimator = knn_pipe,
                               param_grid = knn_param_grid,
                               scoring = roc_auc_scorer,
                               cv = cv,
                               n_jobs = threads,
                               verbose = 0)

knn_grid_search.fit(X_train, y_train)

print("Best parameters:", knn_grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(knn_grid_search.best_score_))

Best parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 30, 'knn__weights': 'distance'}
Best cross-validation score: 0.81


In [None]:
# Best KNN pipeline found by the grid search
best_knn = knn_grid_search.best_estimator_

# Test-set scores: second predict_proba column, then hard class labels
y_pred_prob_knn = best_knn.predict_proba(X_test)[:, 1]
y_pred_test_knn = best_knn.predict(X_test)

In [None]:
# Persist the whole fitted KNN grid-search object, tagged with the dataset id
with open(root_path + f"knn_model_{data_id}.pkl", 'wb') as knn_pkl:
    pickle.dump(knn_grid_search, knn_pkl)

## **Random Forest**

In [None]:
start_time = time.time()

# Hyperparameter candidates explored for the forest
param_grid = {
    'n_estimators': [300, 600, 1000],
    'max_depth': [10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Seeded base estimator
rf_classifier = RandomForestClassifier(random_state=69)

# 5-fold grid search selecting on accuracy, run in parallel
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=threads,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Winning configuration and its cross-validated score
best_rf_params, best_rf_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_rf_params)
print("Best Score:", best_rf_score)

# Score the test set with the best forest: hard labels, then the
# second predict_proba column
best_rf_classifier = grid_search.best_estimator_
rf_predictions = best_rf_classifier.predict(X_test)
rf_predcitions_prob = best_rf_classifier.predict_proba(X_test)[:, 1]

end_time = time.time()

# Wall-clock duration of search plus prediction
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")



In [None]:
# Persist the fitted random-forest grid search, tagged with the dataset id.
# `grid_search` is rebound by the XGBoost cell below, so this cell has to run
# before that one.
with open(root_path + f"rf_model_{data_id}.pkl", 'wb') as rf_pkl:
    pickle.dump(grid_search, rf_pkl)

## **XGBoost**

In [None]:
start_time = time.time()

# Base XGBoost classifier with library defaults
xgb_model = xgb.XGBClassifier()

# Hyperparameter candidates explored for the booster
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1],
    # 'n_estimators':stats.randint(50, 200)
}

# 5-fold grid search selecting on accuracy, run in parallel
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=threads,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Winning configuration and its cross-validated score
xgb_best_params, xgb_best_score = grid_search.best_params_, grid_search.best_score_

print("Best set of hyperparameters: ", xgb_best_params)
print("Best score: ", xgb_best_score)

# Best booster found by the search
best_xgb_model = grid_search.best_estimator_

# Score the test set: hard labels, then the second predict_proba column
xgb_predictions = best_xgb_model.predict(X_test)
xgb_predictions_prob = best_xgb_model.predict_proba(X_test)[:, 1]

end_time = time.time()

# Wall-clock duration of search plus prediction
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

In [None]:
# Persist the fitted XGBoost grid search, tagged with the dataset id.
# At this point `grid_search` refers to the XGBoost search fitted in the
# previous cell.
with open(root_path + f"xgboost_model_{data_id}.pkl", 'wb') as xgb_pkl:
    pickle.dump(grid_search, xgb_pkl)

## DataFrames of Model Prediction and Probability

In [None]:
# # Dictionary to store best models
# best_models = {
#     'Random Forest': best_rf_classifier,
#     'XGBoost': best_xgb_model
# }

# # Dictionary to store predictions
# predictions = {
#     'Random Forest': rf_predictions,
#     'XGBoost': xgb_predictions
# }


# Save the best models and predictions
# pickle.dump(best_models, open(root_path + 'best_models.pkl', 'wb'))
# pickle.dump(predictions, open(root_path + 'predictions.pkl', 'wb'))

In [None]:
# Save every model's test-set predictions to a parquet file.
# The file name is keyed on `data_id`, matching the saved-model file names,
# so a selected-feature run is not written out under the "full" label.
# With data_id == "full" the path is the same as before.
out_dir = root_path
out_file = out_dir + f'{data_id}_test_predicted.parquet'

# One row per test observation; for each model a hard-label column and a
# probability column (second predict_proba column).
test_predicted = pd.DataFrame({'logi_predicted': y_pred_test_logi,
                               'logi_predicted_prob': y_pred_prob_logi,
                               'knn_predicted': y_pred_test_knn,
                               'knn_predicted_prob': y_pred_prob_knn,
                               'rf_predicted': rf_predictions,
                               'rf_predicted_prob': rf_predcitions_prob,
                               'xgb_predicted': xgb_predictions,
                               'xgb_predicted_prob': xgb_predictions_prob
                               })

test_predicted.to_parquet(out_file)