In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from skopt import BayesSearchCV

In [57]:
# TODO: temporary loader used while testing this notebook; remove it and plug in
# the line that receives the processed data once that is available.
raw_train, raw_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [58]:
# Build the feature matrix and target from the raw training frame, then hold
# out 20% of the rows for evaluation.
# NOTE(review): the positional slice assumes the first three columns are not
# features and the last column is the label — confirm against train.csv.
cols = raw_train.columns

# Gender and Geography are dropped from the features
# (presumably because they are not numerically encoded here — confirm).
X = raw_train[cols[3:-1]].drop(columns=['Gender', 'Geography'])

# Select the target with a single label (not a list) so y is a 1-D Series.
# A one-column DataFrame makes scikit-learn emit DataConversionWarning on every
# fit and has no `.ravel()` method.
y = raw_train[cols[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [37]:
# Base logistic-regression estimator used by the hyperparameter search.
# random_state makes the stochastic solvers (liblinear, sag, saga) reproducible,
# matching the seed used for the train/test split; max_iter is raised from the
# default of 100 because sag/saga often stop before converging at that limit.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Hyperparameter Tuning
# Scaling is part of the estimator so that (a) the search really trains on
# standardized features and (b) the scaler is re-fit on the training part of
# every CV fold instead of seeing the validation rows.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', lr_model),
])

# Only penalty/solver pairs that are valid together are searched: both
# 'liblinear' and 'saga' support 'l1' and 'l2'. Pairs such as 'l1' + 'lbfgs',
# or 'elasticnet' without an l1_ratio, raise inside LogisticRegression.fit and
# would abort the whole search.
lr_param_dist = {
    'clf__penalty': ['l1', 'l2'],
    # Continuous log-uniform range over the same bounds as the old grid.
    'clf__C': (1e-3, 1e2, 'log-uniform'),
    'clf__solver': ['liblinear', 'saga'],
}

search = BayesSearchCV(
    estimator=lr_pipeline,
    search_spaces=lr_param_dist,
    n_iter=50,
    cv=5,
    random_state=42,  # reproducible sequence of sampled candidates
)

# Perform optimization (np.ravel gives scikit-learn the 1-D target it expects,
# whether y_train is a Series or a one-column DataFrame)
search.fit(X_train, np.ravel(y_train))

print("Best parameters found: ", search.best_params_)

# Evaluate the best model on the test set.
# best_model is the fitted pipeline, so it takes the raw (unscaled) features.
best_model = search.best_estimator_
test_score = best_model.score(X_test, np.ravel(y_test))
print("Test score of the best model: ", test_score)


In [None]:
# Class-membership probabilities from the tuned logistic-regression model.
# predict_proba is an estimator method; the bare name raised NameError.
# NOTE: `best_model` is reassigned by the later tuning cells, so this cell must
# run directly after the logistic-regression search.
best_model.predict_proba(X)

## Gradient Boosting

In [60]:
# Base gradient-boosting estimator used by the hyperparameter search.
# Seeded so repeated runs give the same model, matching the seed used for the
# train/test split.
gb_model = GradientBoostingClassifier(random_state=42)

In [None]:
# Hyperparameter Tuning
gb_param_dist = {
    # NOTE(review): 'exponential' is only valid for binary targets — confirm
    # that the label column is binary.
    'loss': ['log_loss', 'exponential'],
    # Strictly positive shrinkage values. The previous grid started at 0.0
    # (each tree contributes nothing) and ran up to 4.8, far beyond the range
    # in which boosting stays stable.
    'learning_rate': [0.01, 0.025, 0.05, 0.1, 0.2, 0.3],
    'n_estimators': np.arange(100, 501, 100).tolist(),
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_leaf': np.arange(6, 21, 2).tolist(),
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': np.arange(2, 21, 2).tolist(),
    'max_depth': np.arange(3, 22, 3).tolist(),
}

search = RandomizedSearchCV(
    gb_model,
    param_distributions=gb_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)

# Perform optimization.
# np.ravel works for both a Series and a one-column DataFrame; calling
# `.ravel()` directly on a DataFrame raises AttributeError.
search.fit(X_train, np.ravel(y_train))

print("Best parameters found: ", search.best_params_)

# Evaluate the best model on the test set.
# NOTE: the search optimizes weighted F1, while `.score` reports accuracy.
best_model = search.best_estimator_
test_score = best_model.score(X_test, np.ravel(y_test))
print("Test score of the best model: ", test_score)


## Random Forest

In [4]:
# Base random-forest estimator used by the hyperparameter search.
# Seeded so bootstrap sampling and feature sub-sampling are reproducible,
# matching the seed used for the train/test split.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Hyperparameter Tuning
rf_param_dist = {
    'n_estimators': np.arange(100, 1001, 100).tolist(),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': np.arange(3, 22, 3).tolist(),
    # Same values as before (6, 8, ..., 20), written with a step to match the
    # gradient-boosting grid.
    'min_samples_leaf': np.arange(6, 21, 2).tolist(),
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': np.arange(2, 21, 2).tolist(),
}

search = BayesSearchCV(
    estimator=rf_model,
    search_spaces=rf_param_dist,
    n_iter=50,
    cv=5,
    random_state=42,  # reproducible sequence of sampled candidates
)

# Perform optimization (np.ravel gives scikit-learn the 1-D target it expects,
# whether y_train is a Series or a one-column DataFrame)
search.fit(X_train, np.ravel(y_train))

print("Best parameters found: ", search.best_params_)

# Evaluate the best model on the test set
best_model = search.best_estimator_
test_score = best_model.score(X_test, np.ravel(y_test))
print("Test score of the best model: ", test_score)