In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
from sklearn.tree import export_graphviz
from sklearn.utils.class_weight import compute_class_weight
from pydotplus import graph_from_dot_file
import os
import time
import xgboost as xgb
from xgboost import XGBClassifier as xgbclass
from xgboost import plot_tree
from scipy import stats
import pickle

# Base directory for the parquet inputs read below and the pickled models
# written below. It is joined to file names by plain string concatenation,
# so the trailing slash is required. Being relative, it resolves against the
# kernel's current working directory.
root_path = "../../Data/GoogleDrive/"

### Parallel computing:

In [7]:
# Get the number of available CPU cores.
# os.cpu_count() returns None when the count cannot be determined.
num_cores = os.cpu_count()

print("Number of CPU cores available:", num_cores)

# Set number of cores: keep a few cores free for the rest of the system, but
# never drop below one worker. Without the clamp, a machine with four or fewer
# cores would yield jobs <= 0 (n_jobs=0 is rejected by joblib), and an
# undeterminable core count (None) would raise a TypeError on subtraction.
reserved_cores = 4
jobs = max(1, (num_cores or 1) - reserved_cores)

print("Number of jobs set to:", jobs)

Number of CPU cores available: 12
Number of jobs set to: 8


### Load in Data

In [8]:
# Read the four pre-split datasets from parquet files under root_path.
# The generator is consumed left to right by the tuple unpacking, so the files
# are read in the same order as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(root_path + split_name + ".parquet")
    for split_name in ("X_train", "X_test", "y_train", "y_test")
)

## **Logistic Regression**

## **KNN**

## **Random Forest**

In [4]:
# Time the whole search + prediction step. time.perf_counter() is a monotonic
# clock meant for measuring intervals, so the reported duration cannot be
# skewed by system clock adjustments during this long-running cell.
start_time = time.perf_counter()

# Initialize the Random Forest classifier (seeded so results are repeatable)
rf_classifier = RandomForestClassifier(random_state=69)

# Define the hyperparameter grid: 4 * 4 * 3 * 3 * 2 = 288 combinations,
# each evaluated with 5-fold cross-validation below.
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [1, 10, 20, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [2, 10, 20],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV: exhaustive search scored on accuracy, with the
# candidate fits distributed over `jobs` parallel workers.
grid_search = GridSearchCV(estimator=rf_classifier,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=jobs)  # Parallel

# y_train is loaded with pd.read_parquet and is therefore a DataFrame.
# Collapse a single-column frame to a 1-D Series so scikit-learn does not emit
# a DataConversionWarning on every fit; a multi-column frame is left unchanged
# by squeeze. y_train itself is not modified.
y_train_1d = y_train.squeeze(axis="columns")

# Perform GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train_1d)

# Get the best parameters and the best (mean cross-validated) score
best_rf_params = grid_search.best_params_
best_rf_score = grid_search.best_score_

print("Best Parameters:", best_rf_params)
print("Best Score:", best_rf_score)

# Use the best model to make predictions on the testing data
best_rf_classifier = grid_search.best_estimator_
rf_predictions = best_rf_classifier.predict(X_test)

end_time = time.perf_counter()

execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

NameError: name 'jobs' is not defined

Save the best Random Forest model to disk:

In [None]:
# Serialize the tuned Random Forest to disk. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises; the original
# passed a bare open() result that was never explicitly closed.
with open(root_path + 'rf_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_classifier, model_file)

## **XGBoost**

In [None]:
# Time the whole search + prediction step. time.perf_counter() is a monotonic
# clock meant for measuring intervals, so the reported duration cannot be
# skewed by system clock adjustments while the search runs.
start_time = time.perf_counter()

# Hyperparameter grid: 3 * 3 * 3 = 27 combinations, each evaluated with
# 5-fold cross-validation below. n_estimators is left at the XGBoost default;
# sampling it from a distribution such as stats.randint would require
# RandomizedSearchCV, because GridSearchCV expects explicit lists of values.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1],
}

# Create the XGBoost model object
xgb_model = xgb.XGBClassifier()

# Create the GridSearchCV object: exhaustive search scored on accuracy, with
# the candidate fits distributed over `jobs` parallel workers.
grid_search = GridSearchCV(xgb_model,
                           param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=jobs)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best set of hyperparameters and the corresponding score
xgb_best_params = grid_search.best_params_
xgb_best_score = grid_search.best_score_

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", xgb_best_params)
print("Best score: ", xgb_best_score)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Use the best model to make predictions on the testing data
xgb_predictions = best_xgb_model.predict(X_test)

end_time = time.perf_counter()

execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

Save the best XGBoost model to disk:

In [None]:
# Serialize the tuned XGBoost model to disk. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises; the original
# passed a bare open() result that was never explicitly closed.
with open(root_path + 'xgb_model.pkl', 'wb') as model_file:
    pickle.dump(best_xgb_model, model_file)