In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

In [3]:
# Temporary loader used while testing this notebook.
# TODO: remove it and receive the processed data from the upstream step instead.
raw_train = pd.read_csv(filepath_or_buffer='train.csv')
raw_test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Build the feature matrix and the target from positional column slices.
# NOTE(review): this assumes the first three columns are not features and the
# last column is the target — confirm against the train.csv schema.
cols = raw_train.columns

# 'Gender' and 'Geography' are dropped here rather than encoded.
X = raw_train[cols[3:-1]].drop(columns=['Gender', 'Geography'])

# Select the target with single brackets so it is a 1-D Series. A one-column
# DataFrame makes scikit-learn estimators emit a DataConversionWarning on
# every fit because they expect a 1-D label array.
y = raw_train[cols[-1]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Silence scikit-learn's DataConversionWarning for the rest of the session.
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.simplefilter("ignore", category=DataConversionWarning)

## Logistic Regression

In [6]:
# Base estimator passed to the randomized search over penalty / C / solver.
# - max_iter is raised above scikit-learn's default of 100 because the 'saga'
#   solver often needs more iterations to converge at the larger C values
#   that are searched.
# - random_state fixes the data shuffling done by 'saga' and 'liblinear', so
#   repeated runs select the same model.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

In [10]:
# --- Feature scaling ---------------------------------------------------------
# The scaler is fitted on the training split only and then applied unchanged
# to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Randomized hyperparameter search ----------------------------------------
lr_param_dist = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga'],
}

# Ten sampled configurations, each scored by 5-fold CV on weighted F1.
search_lr = RandomizedSearchCV(
    lr_model,
    param_distributions=lr_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)
search_lr.fit(X_train_scaled, y_train)

print("Best parameters found: ", search_lr.best_params_)

# --- Held-out evaluation -----------------------------------------------------
best_model_lr = search_lr.best_estimator_
lr_predictions = best_model_lr.predict(X_test_scaled)
# Keep only the second column of predict_proba.
lr_probabilities = best_model_lr.predict_proba(X_test_scaled)[:, 1]
# score() reports the classifier's mean accuracy, which is a different metric
# from the 'f1_weighted' score used to select the model above.
test_score_lr = best_model_lr.score(X_test_scaled, y_test)
print("Test score of the best model: ", test_score_lr)


Best parameters found:  {'solver': 'saga', 'penalty': 'l2', 'C': 100}
Test score of the best model:  0.8233405035295543


## Gradient Boosting

In [50]:
# Base estimator for the gradient-boosting search. The search varies
# max_features, which makes the trees sample features at random, so a fixed
# random_state is needed for repeatable results.
gb_model = GradientBoostingClassifier(random_state=42)

In [None]:
# Hyperparameter Tuning
gb_param_dist = {
    'loss': ['log_loss', 'exponential'],
    # The grid starts at 0.2, not 0.0. A zero learning rate shrinks every
    # tree's contribution to nothing, and older scikit-learn releases reject
    # it outright, so it would only waste one of the sampled candidates.
    'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8],
    'n_estimators': np.arange(100, 301, 100).tolist(),
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_leaf': np.arange(6, 15, 2).tolist(),
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': np.arange(2, 15, 2).tolist(),
    'max_depth': np.arange(3, 16, 3).tolist()
}

# Ten sampled configurations, each scored by 5-fold CV on weighted F1.
search = RandomizedSearchCV(
    gb_model,
    param_distributions=gb_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)

# Perform optimization (the gradient-boosting model is fitted on unscaled features)
search.fit(X_train, y_train)

print("Best parameters found: ", search.best_params_)

# Evaluate the best model on the test set.
# NOTE: `search`, `best_model` and `test_score` are reassigned by the
# random-forest tuning cell, so only `gb_predictions` is specific to this model.
best_model = search.best_estimator_
gb_predictions = best_model.predict(X_test)
# score() reports mean accuracy, not the 'f1_weighted' metric used for selection.
test_score = best_model.score(X_test, y_test)
print("Test score of the best model: ", test_score)


## Random Forest

In [None]:
# Container that will hold the fitted random-forest models.
classifiers = list()

In [None]:
# Fit four random forests on the training split.
# Every forest is trained on the same (X_train, y_train); nothing is split
# per label, so the forests differ only through their random seeds.
#
# The list is rebuilt from scratch so that re-running this cell cannot keep
# appending duplicate models to a list left over from an earlier run.
classifiers = []
for i in range(4):
    # A distinct but fixed seed per forest keeps the run reproducible while
    # still producing four different models.
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=i)

    rf_classifier.fit(X_train, y_train)

    classifiers.append(rf_classifier)

In [None]:
# Container that will hold each classifier's predict_proba output.
probabilities = list()

In [None]:
# Collect each fitted classifier's probability estimates for the held-out split.
# The list is reset first so that re-running this cell cannot append a second
# copy of every result.
probabilities = []
for classifier in classifiers:
    # predict_proba returns one row per sample and one column per class.
    proba = classifier.predict_proba(X_test)
    probabilities.append(proba)

In [None]:
# Base estimator for the random-forest search. Bootstrapping and feature
# sampling are random, so a fixed random_state makes the tuned model repeatable.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Candidate values for the random-forest search, written as plain Python lists.
rf_param_dist = {
    'n_estimators': list(range(100, 201, 100)),      # 100, 200
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': list(range(3, 13, 3)),              # 3, 6, 9, 12
    'min_samples_leaf': list(range(6, 21, 2)),       # 6, 8, ..., 20
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': list(range(2, 11, 2)),      # 2, 4, ..., 10
}

# Ten sampled configurations, each scored by 5-fold CV on weighted F1.
search = RandomizedSearchCV(
    rf_model,
    param_distributions=rf_param_dist,
    n_iter=10,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
)
search.fit(X_train, y_train)

print("Best parameters found: ", search.best_params_)

# Score the refitted best estimator on the held-out split.
best_model = search.best_estimator_
rf_predictions = best_model.predict(X_test)
# score() reports mean accuracy, not the 'f1_weighted' metric used for selection.
test_score = best_model.score(X_test, y_test)
print("Test score of the best model: ", test_score)

## Ensemble & Classification Report

In [22]:
# Hard-coded example labels from three models: one list per model, one entry
# per sample.
logistic_regression_predictions = [0, 1, 1, 0, 1]
xgboost_predictions = [1, 1, 1, 0, 0]
random_forest_predictions = [0, 0, 1, 0, 1]

# Stack into a (models x samples) integer array.
all_predictions = np.array([
    logistic_regression_predictions,
    xgboost_predictions,
    random_forest_predictions,
])

# Majority vote: for every sample (column), pick the label that occurs most
# often across the models. argmax returns the smallest label on a tie.
ensemble_predictions = np.array(
    [np.bincount(sample_votes).argmax() for sample_votes in all_predictions.T]
)

print("Ensemble Predictions:", ensemble_predictions)

Ensemble Predictions: [0 1 1 0 1]


In [24]:
# Hard-coded example probabilities from three models: one list per model, one
# entry per sample.
# The logistic-regression list deliberately does NOT use the name
# `lr_probabilities`: that name holds the fitted model's predict_proba output
# from the logistic-regression tuning cell, and assigning these example values
# to it would silently overwrite the real probabilities.
logistic_regression_probabilities = [0.2, 0.8, 0.9, 0.4, 0.75]
xgboost_probabilities = [0.6, 0.53, 0.6, 0.21, 0.11]
random_forest_probabilities = [0.4, 0.16, 0.55, 0.3, 0.9]

# Combine probabilities into a 2D array (rows = models, columns = samples)
all_probabilities = np.vstack([
    logistic_regression_probabilities,
    xgboost_probabilities,
    random_forest_probabilities,
])

# Soft vote: average the probabilities across the models for each sample
ensemble_probabilities = np.mean(all_probabilities, axis=0)

print("Ensemble Probabilities:", ensemble_probabilities)

Ensemble Probabilities: [0.4        0.49666667 0.68333333 0.30333333 0.58666667]
