In [5]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc,
    make_scorer, precision_score, recall_score, f1_score
)
from sklearn.model_selection import (
    train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_graphviz
from xgboost import XGBClassifier as xgbclass

# Base directory (relative to this notebook) for the parquet inputs and the
# pickled models. File names are appended by plain string concatenation, so
# the value must end with a path separator.
root_path = "../../Data/GoogleDrive/"

### Parallel computing:

In [12]:
# Get the number of available CPU cores.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single core instead of failing on the subtraction below.
num_cores = os.cpu_count() or 1

print("Number of CPU cores available:", num_cores)

# Set number of cores: keep a few free for the rest of the system, but never
# go below one worker. On small machines `cores - 4` would be zero (rejected
# by joblib) or negative (which joblib interprets differently).
RESERVED_CORES = 4
jobs = max(1, num_cores - RESERVED_CORES)

print("Number of jobs set to:", jobs)

Number of CPU cores available: 12
Number of jobs set to: 8


### Load in Data

In [18]:
def _read_split(file_name):
    """Read one parquet file that lives directly under ``root_path``."""
    return pd.read_parquet(root_path + file_name)


# Feature matrices
X_train = _read_split("X_train.parquet")
X_test = _read_split("X_test.parquet")

# Targets
y_train = _read_split("y_train.parquet")
y_test = _read_split("y_test.parquet")

Convert the training target to a 1-D array, the shape scikit-learn estimators expect for `y`:

In [23]:
# Flatten the training target to a 1-D array. np.ravel accepts both the
# DataFrame loaded from parquet and an already-flattened ndarray, so the
# cell can be re-run without failing on a missing `.values` attribute.
y_train = np.ravel(y_train)

### Backward selection for logistic regression

In [None]:
# Backward sequential feature selection wrapped around a liblinear logistic
# regression, with candidate feature subsets compared by F1.
b_logi = LogisticRegression(max_iter = 300,
                            solver = 'liblinear')
back_selector = SequentialFeatureSelector(b_logi, direction = 'backward',
                                          scoring = 'f1',
                                          n_jobs = jobs)  # same worker count as the grid searches
back_selector.fit(X_train, y_train)

# scikit-learn's selector reports the kept features as a boolean mask via
# get_support(); `k_feature_names_` belongs to mlxtend's selector and would
# raise AttributeError here, after the expensive fit has already run.
print(list(X_train.columns[back_selector.get_support()]))

In [None]:
# saving the selection model
# The context manager closes the file handle even if pickling fails.
with open(root_path + 'back_selection.pkl', 'wb') as selector_file:
    pickle.dump(back_selector, selector_file)

## **Logistic Regression**

In [None]:
# ROC AUC scorer; the KNN grid search further down reuses this object.
# NOTE(review): `needs_threshold` is deprecated in recent scikit-learn
# releases in favour of `response_method` - confirm against the installed
# version before upgrading.
roc_auc_scorer = make_scorer(roc_auc_score,
                             needs_threshold = True,
                             multi_class = 'ovo')

# define a tuning grid for logistic regression
# (keys are prefixed with the pipeline step name `clf`)
logi_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
    'clf__class_weight': [None, 'balanced'],
}


# define a logistic regression model
# The scaler sits inside the pipeline, so it is re-fit on each CV training fold.
log_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter = 1000))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

# define a grid search with cross-validation
# n_jobs uses `jobs` from the parallel-computing cell; the previous name
# `threads` was never defined and raised NameError.
log_grid_search = GridSearchCV(estimator = log_pipe,
                               param_grid = logi_grid,
                               cv = cv,
                               scoring = roc_auc_scorer,
                               n_jobs = jobs,
                               verbose = 0)

# fit the grid search
log_grid_search.fit(X_train, y_train)

# print the best parameters
print("Best parameters:", log_grid_search.best_params_)
print("Best cross-validation ROC AUC score: {:.2f}".format(log_grid_search.best_score_))

In [None]:
# Take the pipeline that the grid search selected; nothing is fitted in this
# cell, it only scores the held-out test set.
log_final = log_grid_search.best_estimator_

# Hard class labels
y_pred_test_logi = log_final.predict(X_test)

# Class probabilities; keep only the column at index 1
logi_test_proba = log_final.predict_proba(X_test)
y_pred_prob_logi = logi_test_proba[:, 1]

## **KNN**

In [None]:
# Tuning grid for KNN (keys are prefixed with the pipeline step name `knn`)
knn_param_grid = {
    'knn__n_neighbors': [1, 10, 20, 30],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# The scaler sits inside the pipeline, so it is re-fit on each CV training fold.
knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = 69)

# n_jobs uses `jobs` from the parallel-computing cell; the previous name
# `threads` was never defined and raised NameError.
# `roc_auc_scorer` comes from the logistic-regression cell.
knn_grid_search = GridSearchCV(knn_pipe,
                               knn_param_grid,
                               cv = cv,
                               verbose = 0,
                               scoring = roc_auc_scorer,
                               n_jobs = jobs)

knn_grid_search.fit(X_train, y_train)

print("Best parameters:", knn_grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(knn_grid_search.best_score_))

In [None]:
# Pipeline selected by the KNN grid search
best_knn = knn_grid_search.best_estimator_

# Hard class labels for the test set
y_pred_test_knn = best_knn.predict(X_test)

# Class probabilities; keep only the column at index 1
knn_test_proba = best_knn.predict_proba(X_test)
y_pred_prob_knn = knn_test_proba[:, 1]

## **Random Forest**

In [24]:
start_time = time.time()

# Base estimator with a fixed seed
rf_classifier = RandomForestClassifier(random_state=69)

# Candidate hyperparameter values; every combination is evaluated
param_grid = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[1, 10, 20, 50],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[2, 10, 20],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold search, parallelised over `jobs` workers.
# NOTE(review): this search optimises accuracy with cv=5, while the logistic
# regression and KNN searches use a ROC AUC scorer on a shuffled
# StratifiedKFold - confirm the difference is intended.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=jobs,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score
best_rf_params = grid_search.best_params_
best_rf_score = grid_search.best_score_

print("Best Parameters:", best_rf_params)
print("Best Score:", best_rf_score)

# Score the test set with the selected model: labels first, then the
# probability column at index 1
best_rf_classifier = grid_search.best_estimator_
rf_predictions = best_rf_classifier.predict(X_test)
rf_test_proba = best_rf_classifier.predict_proba(X_test)
rf_predcitions_prob = rf_test_proba[:, 1]  # (sic) name is read by the export cell

end_time = time.time()

# Wall-clock time covers both the search and the test-set predictions
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

KeyboardInterrupt: 

In [None]:
# Save Model
# The context manager closes the file handle even if pickling fails.
with open(root_path + 'rf_model.pkl', 'wb') as rf_model_file:
    pickle.dump(best_rf_classifier, rf_model_file)

## **XGBoost**

In [None]:
start_time = time.time()

# Candidate hyperparameter values; every combination is evaluated.
# (n_estimators is not tuned here: 'n_estimators': stats.randint(50, 200)
# was left disabled.)
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

# XGBoost classifier with library defaults
xgb_model = xgb.XGBClassifier()

# Exhaustive 5-fold search on accuracy, parallelised over `jobs` workers
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=jobs,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score
xgb_best_params = grid_search.best_params_
xgb_best_score = grid_search.best_score_

print("Best set of hyperparameters: ", xgb_best_params)
print("Best score: ", xgb_best_score)

# Selected model
best_xgb_model = grid_search.best_estimator_

# Score the test set: labels first, then the probability column at index 1
xgb_predictions = best_xgb_model.predict(X_test)
xgb_test_proba = best_xgb_model.predict_proba(X_test)
xgb_predictions_prob = xgb_test_proba[:, 1]

end_time = time.time()

# Wall-clock time covers both the search and the test-set predictions
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

In [None]:
# Save Model
# The context manager closes the file handle even if pickling fails.
with open(root_path + 'xgb_model.pkl', 'wb') as xgb_model_file:
    pickle.dump(best_xgb_model, xgb_model_file)

## DataFrames of Model Prediction and Probability

In [None]:
# # Dictionary to store best models
# best_models = {
#     'Random Forest': best_rf_classifier,
#     'XGBoost': best_xgb_model
# }

# # Dictionary to store predictions
# predictions = {
#     'Random Forest': rf_predictions,
#     'XGBoost': xgb_predictions
# }


# Save the best models and predictions
# pickle.dump(best_models, open(root_path + 'best_models.pkl', 'wb'))
# pickle.dump(predictions, open(root_path + 'predictions.pkl', 'wb'))

In [None]:
# save predicted values to a parquet file
# Reuse root_path instead of repeating the directory literal, so the output
# location follows the inputs if the data directory is ever moved.
out_dir = root_path
out_file = out_dir + 'full_test_predicted.parquet'

# One row per test observation; for each model, the predicted label and the
# probability column at index 1 of predict_proba.
test_predicted = pd.DataFrame({'logi_predicted': y_pred_test_logi,
                               'logi_predicted_prob': y_pred_prob_logi,
                               'knn_predicted': y_pred_test_knn,
                               'knn_predicted_prob': y_pred_prob_knn,
                               'rf_predicted': rf_predictions,
                               'rf_predicted_prob': rf_predcitions_prob,
                               'xgb_predicted': xgb_predictions,
                               'xgb_predicted_prob': xgb_predictions_prob
                               })

test_predicted.to_parquet(out_file)