In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [2]:
# TODO: temporary loader for testing this notebook on its own -- remove it and
# receive the processed data from the previous step instead.
raw_train, raw_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Columns are picked by position: everything from index 3 up to (but not
# including) the last column is a feature, and the last column is the target.
cols = raw_train.columns

# 'Gender' and 'Geography' are dropped from the feature frame.
X = (
    raw_train[cols[3:-1]]
    .drop(columns=['Gender', 'Geography'])
)

# Double brackets keep the target as a single-column DataFrame (2-D).
# NOTE(review): this is presumably what triggers the DataConversionWarning that
# is silenced further down -- confirm before changing the target's shape.
y = raw_train[[cols[-1]]]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Four target slots (a, r, c, d) for train and test. At this point every slot
# is just another reference to the same y_train / y_test object.
y_train_a = y_train_r = y_train_c = y_train_d = y_train
y_test_a = y_test_r = y_test_c = y_test_d = y_test

# Position-aligned lists consumed by the per-target tuning loops.
y_train_all = [
    y_train_a,
    y_train_r,
    y_train_c,
    y_train_d,
]
y_test_all = [
    y_test_a,
    y_test_r,
    y_test_c,
    y_test_d,
]

In [5]:
# Silence scikit-learn's DataConversionWarning for the rest of the session.
import warnings
from sklearn.exceptions import DataConversionWarning

# With only an action and a category, simplefilter installs the same filter
# entry as filterwarnings (no message/module pattern involved).
warnings.simplefilter("ignore", category=DataConversionWarning)

## Logistic Regression

In [6]:
# Logistic regression with default settings. In the visible notebook only the
# commented-out single search refers to this instance; the per-target loop
# rebinds lr_model to a fresh LogisticRegression() on every iteration.
lr_model = LogisticRegression()

In [6]:
# Hyperparameter search space for logistic regression (discrete candidates).
# An alternative grid for C that was considered: np.logspace(-4, 4, 20).
lr_param_dist = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 1, 10, 100, 1000],
    solver=['liblinear', 'saga'],
)

#search_lr = RandomizedSearchCV(lr_model, param_distributions=lr_param_dist, n_iter=10, scoring='f1_weighted', cv=5, random_state=42)

# Standardise the features for logistic regression. The scaler is fitted on the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform optimization
#search_lr.fit(X_train_scaled, y_train)

#print("Best parameters found: ", search_lr.best_params_)

# Evaluate the best model on the test set
#best_model_lr = search_lr.best_estimator_
#lr_predictions = best_model_lr.predict(X_test_scaled)
#lr_probabilities = best_model_lr.predict_proba(X_test_scaled)[:,1]
#test_score_lr = best_model_lr.score(X_test_scaled, y_test)
#print("Test score of the best model: ", test_score_lr)


In [7]:
# Tune one logistic-regression model per target slot and record its test score.
#   lr_models[k]  -> best estimator found for y_train_all[k]
#   lr_results[k] -> that estimator's .score() on (X_test_scaled, y_test_all[k])
lr_models = []
lr_results = []

# Iterate over however many targets are listed instead of a hard-coded 4, so the
# loop cannot drift out of sync with y_train_all / y_test_all.
for i in range(len(y_train_all)):
    # The 'saga' and 'liblinear' solvers in lr_param_dist use randomness when
    # fitting; seeding the estimator makes repeated runs give the same model.
    lr_model = LogisticRegression(random_state=42)
    search_lr = RandomizedSearchCV(
        lr_model,
        param_distributions=lr_param_dist,
        n_iter=10,
        scoring='f1_weighted',
        cv=5,
        random_state=42,
    )

    # Logistic regression is trained on the standardised features.
    search_lr.fit(X_train_scaled, y_train_all[i])

    best_model_lr = search_lr.best_estimator_
    lr_models.append(best_model_lr)

    # Note: the search ranks candidates by weighted F1, while .score() here is
    # the classifier's default score on the held-out split.
    test_score_lr = best_model_lr.score(X_test_scaled, y_test_all[i])
    lr_results.append(test_score_lr)

In [8]:
# Held-out .score() of each tuned logistic-regression model, one entry per
# position in y_train_all. All four target slots currently alias the same
# y_train / y_test, so every entry is scored against the same labels.
lr_results

[0.8233405035295543,
 0.8233405035295543,
 0.8233405035295543,
 0.8233405035295543]

## Gradient Boosting

In [None]:
# Base estimator for the gradient-boosting search. The seed is fixed because the
# search space includes max_features='sqrt' / 'log2', which makes tree building
# random; without it, re-running the notebook gives different models and scores.
gb_model = GradientBoostingClassifier(random_state=42)

In [None]:
# Hyperparameter search space for GradientBoostingClassifier (discrete candidates).
gb_param_dist = {
    'loss': ['log_loss', 'exponential'],
    # 0.2, 0.4, ..., 1.8. The grid used to start at 0.0, i.e. a learning rate
    # under which boosting stages contribute nothing -- a wasted search candidate.
    # Built from integers and rounded so the values are free of float noise
    # (np.arange produced entries such as 0.6000000000000001).
    # NOTE(review): rates above 1.0 are unusually aggressive; consider narrowing.
    'learning_rate': [round(0.2 * step, 1) for step in range(1, 10)],
    'n_estimators': np.arange(100, 301, 100).tolist(),
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_leaf': np.arange(6, 15, 2).tolist(),
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': np.arange(2, 15, 2).tolist(),
    'max_depth': np.arange(3, 16, 3).tolist(),
}

#search = BayesSearchCV(
#    estimator=gb_model,
#    search_spaces=gb_param_dist,
#    n_iter=50,
#    cv=5
#)

# Randomised search over gb_param_dist: 10 sampled candidates, 5-fold CV,
# candidates ranked by weighted F1, sampling seeded for repeatability.
search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)

# Run the search on the (unscaled) training split.
search.fit(X_train, y_train)

print("Best parameters found: ", search.best_params_)

# Evaluate the refitted best candidate on the held-out split. The classifier's
# .score() is the accuracy of its predictions, so the predictions computed for
# gb_predictions are reused instead of predicting a second time.
best_model = search.best_estimator_
gb_predictions = best_model.predict(X_test)
test_score = accuracy_score(y_test, gb_predictions)
print("Test score of the best model: ", test_score)


In [5]:
# XGBoost classifier with library defaults. The per-target loop further down
# rebinds xgb_model to a fresh XGBClassifier() on every iteration.
xgb_model = xgb.XGBClassifier()

In [17]:
# XGBoost search spaces. Every value is a list/array of discrete candidates.
# ('lambda' is a Python keyword, so these must stay dict literals.)

# Narrower space: the one handed to RandomizedSearchCV in the per-target loop.
xgb_param_dist = {
    'booster': ['gbtree'],
    'learning_rate': np.linspace(start=0.05, stop=0.3, num=6),  # 0.05 ... 0.30 in 0.05 steps
    'n_estimators': [100, 200],
    'objective': ['multi:softmax'],
    'num_class': [2],
    'eval_metric': ['logloss'],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'lambda': [0, 1, 2],
    'alpha': [0, 1, 2],
}

# Wider variant: adds the 'dart' booster, 300 estimators, depth 12 and the
# 'binary:logistic' objective, and drops colsample_bytree=1.0.
# NOTE(review): 'binary:logistic' is paired with num_class=[2] here -- confirm
# XGBoost accepts that combination before using this space in a search.
xgb_param_dist2 = {
    'booster': ['gbtree', 'dart'],
    'learning_rate': np.linspace(start=0.05, stop=0.3, num=6),
    'n_estimators': [100, 200, 300],
    'objective': ['binary:logistic', 'multi:softmax'],
    'num_class': [2],
    'eval_metric': ['logloss'],
    'max_depth': [3, 6, 9, 12],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8],
    'lambda': [0, 1, 2],
    'alpha': [0, 1, 2],
}

# Instantiate RandomizedSearchCV
#xgb_search = RandomizedSearchCV(xgb_model, param_distributions=xgb_param_dist, n_iter=10, scoring='f1_weighted', cv=5, random_state=42)

# Perform optimization
#xgb_search.fit(X_train, y_train)

#print("Best parameters found: ", xgb_search.best_params_)

# Evaluate the best model on the test set
#best_xgb_model = xgb_search.best_estimator_
#xgb_predictions = best_xgb_model.predict(X_test)
#test_score = best_xgb_model.score(X_test, y_test)
#print("Test score of the best XGBoost model: ", test_score)

Best parameters found:  {'subsample': 1.0, 'objective': 'multi:softmax', 'num_class': 2, 'n_estimators': 100, 'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.1, 'lambda': 2, 'gamma': 0.1, 'eval_metric': 'logloss', 'colsample_bytree': 0.8, 'booster': 'gbtree', 'alpha': 2}
Test score of the best XGBoost model:  0.8563032084103372


In [None]:
# Tune one XGBoost classifier per target slot and record its test score.
#   xgb_models[k]  -> best estimator found for y_train_all[k]
#   xgb_results[k] -> that estimator's .score() on (X_test, y_test_all[k])
xgb_models = []
xgb_results = []

# Iterate over however many targets are listed instead of a hard-coded 4, so the
# loop cannot drift out of sync with y_train_all / y_test_all.
for i in range(len(y_train_all)):
    xgb_model = xgb.XGBClassifier()
    xgb_search = RandomizedSearchCV(
        xgb_model,
        param_distributions=xgb_param_dist,
        n_iter=10,
        scoring='f1_weighted',
        cv=5,
        random_state=42,
    )

    # Tree models are trained on the unscaled features.
    xgb_search.fit(X_train, y_train_all[i])

    best_model_xgb = xgb_search.best_estimator_
    xgb_models.append(best_model_xgb)

    # Note: the search ranks candidates by weighted F1, while .score() here is
    # the classifier's default score on the held-out split.
    test_score_xgb = best_model_xgb.score(X_test, y_test_all[i])
    xgb_results.append(test_score_xgb)

## Random Forest

In [5]:
# Base estimator for the random-forest search. A random forest bootstraps rows
# and subsamples features at random, so the seed is fixed to make repeated runs
# of the notebook produce the same fitted models and scores.
rf_model = RandomForestClassifier(random_state=42)

In [6]:
# Hyperparameter search space for RandomForestClassifier (discrete candidates),
# with the range() expressions written out as explicit values.
rf_param_dist = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],         # None = no depth limit
    'min_samples_leaf': [1, 6, 11, 16],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 7, 12, 17],
}
#search = BayesSearchCV(
    #estimator=rf_model,
    #search_spaces=rf_param_dist,
    #n_iter=50,
    #cv=5
#)
# Randomised search over rf_param_dist: 10 sampled candidates, 5-fold CV,
# candidates ranked by weighted F1. This rebinds `search`, which the
# gradient-boosting section also assigns.
search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)

# Run the search on the (unscaled) training split.
# NOTE(review): the reporting/evaluation lines that follow are commented out, so
# nothing in this cell reads the fitted search -- confirm the fit is still needed.
search.fit(X_train, y_train)

#print("Best parameters found: ", search.best_params_)

# Evaluate the best model on the test set
#best_model = search.best_estimator_
#rf_predictions = best_model.predict(X_test)
#test_score = best_model.score(X_test, y_test)
#print("Test score of the best model: ", test_score)

Best parameters found:  {'n_estimators': 100, 'min_samples_split': 7, 'min_samples_leaf': 16, 'max_features': 'log2', 'max_depth': 15, 'criterion': 'entropy'}
Test score of the best model:  0.8553943102978157


In [None]:
# Tune one random forest per target slot and record its test score.
#   rf_models[k]  -> best estimator found for y_train_all[k]
#   rf_results[k] -> that estimator's .score() on (X_test, y_test_all[k])
rf_models = []
rf_results = []

# Iterate over however many targets are listed instead of a hard-coded 4, so the
# loop cannot drift out of sync with y_train_all / y_test_all.
for i in range(len(y_train_all)):
    # Seed the forest itself: RandomizedSearchCV's random_state only fixes which
    # parameter combinations are sampled, not the bootstrap/feature sampling
    # inside each RandomForestClassifier.
    rf_model = RandomForestClassifier(random_state=42)
    search_rf = RandomizedSearchCV(
        rf_model,
        param_distributions=rf_param_dist,
        n_iter=10,
        scoring='f1_weighted',
        cv=5,
        random_state=42,
    )

    # Tree models are trained on the unscaled features.
    search_rf.fit(X_train, y_train_all[i])

    best_model_rf = search_rf.best_estimator_
    rf_models.append(best_model_rf)

    # Note: the search ranks candidates by weighted F1, while .score() here is
    # the classifier's default score on the held-out split.
    test_score_rf = best_model_rf.score(X_test, y_test_all[i])
    rf_results.append(test_score_rf)

## Send Code to The Next Person