# ML Training and Optimization

## Initial Setup

## Loading Full and Sample Data

## Machine Learning Hyperparameter Optimization and Full Training

### Logistic Regression

### Random Forest

### Neural Network

In [1]:
import os

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import learning_curve, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Columns with plain int, float or bool dtype are used for modelling.
# Note that this list still contains the target column 'loan_status_num'.
ML_columns = [column for column in loans_data.columns if loans_data[column].dtype == int or loans_data[column].dtype == float or loans_data[column].dtype == bool]

# Seeded subsample so scores and figures are reproducible between runs; the size
# is capped at the frame length so a smaller dataset does not raise.
sample_data = loans_data.sample(min(10000, len(loans_data)), random_state=42)
# Defining inputs and outputs of dataset
X = sample_data[ML_columns].drop('loan_status_num', axis='columns').values
y = np.vstack(sample_data[ML_columns]['loan_status_num'].values)  # column vector, as MinMaxScaler expects 2-D input
# Splitting between training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of training inputs: {X_train.shape}")
print(f"Shape of testing inputs: {X_test.shape}")
print(f"Shape of training outputs: {y_train.shape}")
print(f"Shape of testing outputs: {y_test.shape}")

# Scaling the data
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit the feature scaler on the training split only: fitting on all of X would
# leak the test set's min/max into the model inputs.
scaler_X.fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

# The target is rescaled with the range of the whole sample so that both splits
# share the same label encoding.
scaler_y.fit(y)
y_train = scaler_y.transform(y_train)
y_test = scaler_y.transform(y_test)

# Back to 1-D label vectors, the shape the classifiers below are fitted with.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)



In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000, C=0.01)

# Train the model
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred = logistic_model.predict(X_test)

# Cross-validated learning curve on the training split (5 sizes from 10% to 100%)
train_sizes, train_scores, test_scores = learning_curve(
    logistic_model, X_train, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(.1, 1.0, 5))

train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report
print(classification_report(y_test, y_pred))

coefficients = np.abs(logistic_model.coef_[0])

# X was built from ML_columns with 'loan_status_num' dropped, so the names paired
# with the coefficients must drop it as well; otherwise every name after the
# target's position is shifted by one.
feature_names = [column for column in ML_columns if column != 'loan_status_num']
assert len(feature_names) == len(coefficients), "feature names and coefficients are misaligned"

# Combining feature names and their corresponding coefficients
features, coefs = zip(*sorted(zip(feature_names, coefficients), key=lambda x: x[1], reverse=True))

features = np.array(features)
coefs = np.array(coefs)

# Only features with a non-zero coefficient are plotted
features = features[coefs > 0]
coefs = coefs[coefs > 0]

fig, ax = plt.subplots(1, 3, figsize = [30*1.62, 15])

cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # Normalizing the confusion matrix

sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix')
ax[0].set_ylabel('Actual Labels')
ax[0].set_xlabel('Predicted Labels')

ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
ax[1].set_title('Learning Curve')
ax[1].set_xlabel('Training Examples')
ax[1].set_ylabel('Score')
ax[1].legend(loc="best")

# The bars are absolute model coefficients, not correlations, and are labelled as such
sns.barplot(x=coefs, y=features, edgecolor="black", ax=ax[2])
ax[2].set_title("Logistic Regression Coefficient Magnitudes")
ax[2].set_xlabel('|Coefficient|')
ax[2].set_ylabel('Features')
ax[2].tick_params(axis='y', labelsize=5)

fig.tight_layout()

In [None]:
import sys
import os

import warnings

# Silence warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    # Exported through the environment so that subprocesses are affected too
    os.environ["PYTHONWARNINGS"] = "ignore"

import matplotlib
matplotlib.use('Agg')  # non-interactive backend; figures are written to files

print("Starting Hyperparameter optimization run for Logistic Model")

n_reps = 100
max_iter = 100000

print(f"Number of repetitions: {n_reps}")

# Candidate values sampled by the randomized search:
# C covers 10^-4 .. 10^4 on a log grid, l1_ratio covers 0 .. 1 on a linear grid.
param_dist = dict(
    C=[10**exponent for exponent in np.linspace(-4, 4, num=100)],
    solver=['liblinear', 'saga'],
    penalty=['l2', 'l1', 'elasticnet', None],
    l1_ratio=np.linspace(0, 1, num=100),
)

logistic_model = LogisticRegression(class_weight='balanced', max_iter=max_iter)

# error_score=np.nan: a candidate whose fit fails is scored NaN instead of aborting the search
random_search = RandomizedSearchCV(
    logistic_model,
    param_distributions=param_dist,
    n_iter=n_reps,
    cv=5,
    random_state=42,
    verbose=10,
    error_score=np.nan,
)

print("Fitting Random Search")
random_search.fit(X_train, y_train)

# Figures and the CSV are written to these folders; create them up front so that a
# missing directory is not mistaken for a bad hyperparameter combination.
os.makedirs("../figures", exist_ok=True)
os.makedirs("../data", exist_ok=True)

# coef_ follows the columns of X, i.e. ML_columns without the target column.
feature_names = [column for column in ML_columns if column != 'loan_status_num']

param_combination_details = []

for i, (params, mean_test_score) in enumerate(zip(random_search.cv_results_['params'], random_search.cv_results_['mean_test_score']), 1):
    print(f"Model ID: {i}")
    print(f"Parameters: {params}")

    # The search runs with error_score=np.nan, so NaN marks a candidate whose fit
    # failed; there is nothing to plot for it.
    if np.isnan(mean_test_score):
        print("Likely Bad Combination?")
        continue

    fig = None
    try:
        # Fresh, unfitted copy per candidate. Calling set_params on
        # best_estimator_ would overwrite the refit best model in place.
        current_model = clone(random_search.estimator).set_params(**params)

        # Calculate learning curve data
        train_sizes, train_scores, test_scores = learning_curve(
            current_model, X_train, y_train, cv=5, n_jobs=-1, 
            train_sizes=np.linspace(.1, 1.0, 5))

        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)

        current_model.fit(X_train, y_train)

        y_pred = current_model.predict(X_test)

        coefficients = np.abs(current_model.coef_[0])

        features, coefs = zip(*sorted(zip(feature_names, coefficients), key=lambda x: x[1], reverse=True))

        features = np.array(features)
        coefs = np.array(coefs)

        # Only features with a non-zero coefficient are plotted
        features = features[coefs > 0]
        coefs = coefs[coefs > 0]

        cm = confusion_matrix(y_test, y_pred)
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # Normalizing the confusion matrix

        fig, ax = plt.subplots(1, 3, figsize = [30*1.62, 15])

        sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
        ax[0].set_title('Confusion Matrix')
        ax[0].set_ylabel('Actual Labels')
        ax[0].set_xlabel('Predicted Labels')

        ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
        ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
        ax[1].set_title('Learning Curve')
        ax[1].set_xlabel('Training Examples')
        ax[1].set_ylabel('Score')
        ax[1].legend(loc="best")

        # The bars are absolute model coefficients, not correlations
        sns.barplot(x=coefs, y=features, edgecolor="black", ax=ax[2])
        ax[2].set_title("Logistic Regression Coefficient Magnitudes")
        ax[2].set_xlabel('|Coefficient|')
        ax[2].set_ylabel('Features')

        # Zero-padded id so the files sort in evaluation order
        fig_path = f"../figures/LogisticRegression_{str(i).zfill(len(str(n_reps)))}.png"
        fig.tight_layout()
        fig.savefig(fig_path)
        param_combination_details.append({
            'id': i,
            'params': params,
            'accuracy': mean_test_score,
            'figure_path': fig_path
        })
        print(f"Accuracy:{mean_test_score}\n------------------------")
    except Exception as exc:
        # Report what actually went wrong instead of silently swallowing everything
        # (a bare except would also trap KeyboardInterrupt).
        print(f"Likely Bad Combination? ({type(exc).__name__}: {exc})")
    finally:
        # Always release the figure, even when plotting or saving failed
        if fig is not None:
            plt.close(fig)
param_combination_df = pd.DataFrame(param_combination_details)
param_combination_df.to_csv('../data/LogisticRegression_RandomizedSearchCV.csv', index=False)
print(f'FINAL RESULTS')
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

In [None]:
import matplotlib
matplotlib.use('Agg')  # non-interactive backend; figures are written to files
print("Starting Hyperparameter optimization run for Logistic Model")

n_reps = 1000
max_iter = 100000

print(f"Number of repetitions: {n_reps}")

# Parameter space for BayesSearchCV.
#
# Previously tried space (best score ~ 0.65):
#     C        : Real(1e-5, 1e5, 'log-uniform')
#     solver   : Categorical(['liblinear'])
#     penalty  : Categorical(['l2', 'l1'])
#     l1_ratio : Real(0, 1)
#
# NOTE(review): l1_ratio is still searched here although no 'elasticnet' penalty
# is offered — confirm whether this dimension is intended.
search_space = dict(
    C=Real(1e-5, 1e5, prior='log-uniform'),
    solver=Categorical(['newton-cg']),
    penalty=Categorical(['l2', None]),
    l1_ratio=Real(0, 1),
)

def progress_reporter(optim_result):
    """Print one progress line for a search step.

    Reads the number of evaluated points from ``optim_result.x_iters`` and
    prints the negated ``optim_result.fun`` as the current best score, together
    with the notebook-level ``n_reps`` total.
    """
    completed = len(optim_result.x_iters)
    best_so_far = -optim_result.fun
    print(f"Iteration {completed}/{n_reps}, Current best score: {best_so_far}")

logistic_model = LogisticRegression(class_weight='balanced', max_iter=max_iter)

# Define BayesSearchCV
bayes_search = BayesSearchCV(logistic_model, search_space, n_iter=n_reps, cv=5, random_state=42, verbose=10, n_jobs=-1)

bayes_search.fit(X_train, y_train, callback=progress_reporter)

# Output folders for the figures and the CSV summary
os.makedirs("../figures", exist_ok=True)
os.makedirs("../data", exist_ok=True)

# coef_ follows the columns of X, i.e. ML_columns without the target column.
feature_names = [column for column in ML_columns if column != 'loan_status_num']

# Processing results
param_combination_details = []
search_results = bayes_search.cv_results_
for i, (params, mean_test_score) in enumerate(zip(search_results['params'], search_results['mean_test_score']), 1):
    print(f"Model ID: {i}")
    print(f"Parameters: {params}")

    # Fresh, unfitted copy per candidate. Calling set_params on best_estimator_
    # would overwrite the refit best model in place.
    current_model = clone(bayes_search.estimator).set_params(**params)

    # Calculate learning curve data
    train_sizes, train_scores, test_scores = learning_curve(
        current_model, X_train, y_train, cv=5, n_jobs=-1, 
        train_sizes=np.linspace(.1, 1.0, 5))

    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    current_model.fit(X_train, y_train)

    y_pred = current_model.predict(X_test)

    coefficients = np.abs(current_model.coef_[0])

    features, coefs = zip(*sorted(zip(feature_names, coefficients), key=lambda x: x[1], reverse=True))

    features = np.array(features)
    coefs = np.array(coefs)

    # Only features with a non-zero coefficient are plotted
    features = features[coefs > 0]
    coefs = coefs[coefs > 0]

    cm = confusion_matrix(y_test, y_pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # Normalizing the confusion matrix

    fig, ax = plt.subplots(1, 3, figsize = [30*1.62, 15])
    try:
        sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
        ax[0].set_title('Confusion Matrix')
        ax[0].set_ylabel('Actual Labels')
        ax[0].set_xlabel('Predicted Labels')

        ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
        ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
        ax[1].set_title('Learning Curve')
        ax[1].set_xlabel('Training Examples')
        ax[1].set_ylabel('Score')
        ax[1].legend(loc="best")

        # The bars are absolute model coefficients, not correlations
        sns.barplot(x=coefs, y=features, edgecolor="black", ax=ax[2])
        ax[2].set_title("Logistic Regression Coefficient Magnitudes")
        ax[2].set_xlabel('|Coefficient|')
        ax[2].set_ylabel('Features')

        # Zero-padded id so the files sort in evaluation order
        fig_path = f"../figures/LogisticRegression_BayesSearchCV_{str(i).zfill(len(str(n_reps)))}.png"
        fig.tight_layout()
        fig.savefig(fig_path)
    finally:
        # Release the figure even if plotting or saving raised
        plt.close(fig)
    # Record the search's own cross-validated score for this candidate. Averaging
    # the learning curve would mix in scores from models trained on only 10% of the data.
    param_combination_details.append({
        'id': i,
        'params': params,
        'accuracy': mean_test_score,
        'figure_path': fig_path
    })
    print(f"Accuracy:{mean_test_score}\n------------------------")
param_combination_df = pd.DataFrame(param_combination_details)
param_combination_df.to_csv('../data/LogisticRegression_BayesSearchCV.csv', index=False)
print(f'FINAL RESULTS')
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

In [None]:
import matplotlib
import sys
import os
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.metrics import make_scorer
matplotlib.use('Agg')  # non-interactive backend; figures are written to files
print("Starting Hyperparameter optimization run for Logistic Model")

import warnings

# Silence warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    # Exported through the environment so that subprocesses are affected too
    os.environ["PYTHONWARNINGS"] = "ignore"

n_reps = 100
max_iter = 100000

print(f"Number of repetitions: {n_reps}")

# Parameter space for BayesSearchCV: C on a log scale, both solvers, three penalties
search_space = dict(
    C=Real(1e-4, 1e4, prior='log-uniform'),
    solver=Categorical(['liblinear', 'saga']),
    penalty=Categorical(['l2', 'l1', 'elasticnet']),
    l1_ratio=Real(0, 1),
)

logistic_model = LogisticRegression(class_weight='balanced', max_iter=max_iter)

class CustomLogisticRegression(BaseEstimator, ClassifierMixin):
    """Scikit-learn estimator wrapping ``LogisticRegression`` for hyperparameter search.

    Unsupported solver/penalty pairs are not fitted at all; such candidates get
    ``INVALID_SCORE`` from :meth:`score`, so a search can move past them instead
    of failing inside ``fit``.

    Parameters
    ----------
    C, penalty, max_iter, solver, l1_ratio
        Forwarded to ``LogisticRegression`` when :meth:`fit` is called.

    Attributes
    ----------
    model : LogisticRegression or None
        The fitted inner model, or ``None`` when the combination is unsupported.
    classes_ : ndarray
        Unique labels seen during :meth:`fit`.
    """

    # Worst possible accuracy, used for unsupported combinations. Deliberately
    # finite rather than +/-inf.
    # NOTE(review): infinite scores are expected to break the search's surrogate
    # model — confirm against the optimiser in use.
    INVALID_SCORE = 0.0

    def __init__(self, C=1.0, penalty='l2', max_iter=100, solver='lbfgs', l1_ratio=None):
        # Explicitly list parameters
        self.C = C
        self.penalty = penalty
        self.max_iter = max_iter
        self.solver = solver
        self.l1_ratio = l1_ratio

    def _is_valid_combination(self):
        """Return False for the solver/penalty pairs this wrapper treats as invalid."""
        if self.solver == 'liblinear' and (self.penalty == 'elasticnet' or self.penalty is None):
            return False
        if self.penalty == 'elasticnet' and self.solver != 'saga':
            return False
        return True

    def fit(self, X, y):
        """Fit the inner model, or mark the estimator invalid without fitting.

        Returns ``self`` in both cases.
        """
        self.classes_ = np.unique(y)
        if not self._is_valid_combination():
            # Validity has to be decided here: fitting an unsupported pair would
            # raise before score() ever gets a chance to penalise it.
            self.model = None
            return self

        # Use the parameters in LogisticRegression. l1_ratio is forwarded only
        # when penalty='elasticnet', the one case where it is used.
        self.model = LogisticRegression(
            C=self.C, 
            penalty=self.penalty, 
            max_iter=self.max_iter, 
            solver=self.solver, 
            l1_ratio=self.l1_ratio if self.penalty == 'elasticnet' else None
        )
        self.model.fit(X, y)
        return self

    @property
    def coef_(self):
        """Coefficients of the fitted inner model."""
        if getattr(self, 'model', None) is None:
            raise AttributeError("coef_ is only available after fitting a valid parameter combination")
        return self.model.coef_

    def predict(self, X):
        """Predict labels with the inner model.

        Raises ``ValueError`` when the estimator holds an unsupported combination.
        """
        if getattr(self, 'model', None) is None:
            raise ValueError(
                f"No fitted model for solver={self.solver!r}, penalty={self.penalty!r}"
            )
        return self.model.predict(X)

    def score(self, X, y):
        """Return accuracy on ``(X, y)``, or ``INVALID_SCORE`` for an unsupported combination."""
        if getattr(self, 'model', None) is None:
            return self.INVALID_SCORE

        # Compute the score only if the combination is valid
        return accuracy_score(y, self.predict(X))

# Forward the max_iter configured for this run; without it the wrapper falls back
# to its default of 100 iterations.
logistic_model_instance = CustomLogisticRegression(max_iter=max_iter)

# BayesSearchCV; no `scoring` is passed, so candidates are ranked with the
# estimator's own score() method
bayes_search = BayesSearchCV(
    estimator=logistic_model_instance, 
    search_spaces=search_space, 
    n_iter=n_reps, 
    cv=5, 
    random_state=42, 
    n_jobs=-1
)

# NOTE(review): presumably restores the `np.int` alias for the installed
# scikit-optimize on newer NumPy — confirm and drop once no longer needed.
np.int = int
bayes_search.fit(X_train, y_train)

# Output folders for the figures and the CSV summary
os.makedirs("../figures", exist_ok=True)
os.makedirs("../data", exist_ok=True)

# Coefficients follow the columns of X, i.e. ML_columns without the target column.
feature_names = [column for column in ML_columns if column != 'loan_status_num']

# Processing results
param_combination_details = []
search_results = bayes_search.cv_results_
for i, (params, mean_test_score) in enumerate(zip(search_results['params'], search_results['mean_test_score']), 1):
    print(f"Model ID: {i}")
    print(f"Parameters: {params}")

    # Candidates without a usable score have nothing meaningful to plot
    if not np.isfinite(mean_test_score):
        print("Likely Bad Combination?")
        continue

    fig = None
    try:
        # Fresh, unfitted copy per candidate. Calling set_params on
        # best_estimator_ would overwrite the refit best model in place.
        current_model = clone(bayes_search.estimator).set_params(**params)

        # Calculate learning curve data
        train_sizes, train_scores, test_scores = learning_curve(
            current_model, X_train, y_train, cv=5, n_jobs=-1, 
            train_sizes=np.linspace(.1, 1.0, 5))

        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)

        current_model.fit(X_train, y_train)

        y_pred = current_model.predict(X_test)

        # The wrapper stores the fitted LogisticRegression on `.model`
        coefficients = np.abs(current_model.model.coef_[0])

        features, coefs = zip(*sorted(zip(feature_names, coefficients), key=lambda x: x[1], reverse=True))

        features = np.array(features)
        coefs = np.array(coefs)

        # Only features with a non-zero coefficient are plotted
        features = features[coefs > 0]
        coefs = coefs[coefs > 0]

        cm = confusion_matrix(y_test, y_pred)
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # Normalizing the confusion matrix

        fig, ax = plt.subplots(1, 3, figsize = [30*1.62, 15])

        sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
        ax[0].set_title('Confusion Matrix')
        ax[0].set_ylabel('Actual Labels')
        ax[0].set_xlabel('Predicted Labels')

        ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
        ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
        ax[1].set_title('Learning Curve')
        ax[1].set_xlabel('Training Examples')
        ax[1].set_ylabel('Score')
        ax[1].legend(loc="best")

        # The bars are absolute model coefficients, not correlations
        sns.barplot(x=coefs, y=features, edgecolor="black", ax=ax[2])
        ax[2].set_title("Logistic Regression Coefficient Magnitudes")
        ax[2].set_xlabel('|Coefficient|')
        ax[2].set_ylabel('Features')

        # Zero-padded id so the files sort in evaluation order
        fig_path = f"../figures/LogisticRegression_BayesSearchCV_{str(i).zfill(len(str(n_reps)))}.png"
        fig.tight_layout()
        fig.savefig(fig_path)
        # mean_test_score is taken from cv_results_ for this candidate
        param_combination_details.append({
            'id': i,
            'params': params,
            'accuracy': mean_test_score,
            'figure_path': fig_path
        })
        print(f"Accuracy:{mean_test_score}\n------------------------")
    except Exception as exc:
        # Report what actually went wrong instead of silently swallowing everything
        # (a bare except would also trap KeyboardInterrupt).
        print(f"Likely Bad Combination? ({type(exc).__name__}: {exc})")
    finally:
        # Always release the figure, even when plotting or saving failed
        if fig is not None:
            plt.close(fig)
param_combination_df = pd.DataFrame(param_combination_details)
param_combination_df.to_csv('../data/LogisticRegression_BayesSearchCV.csv', index=False)
print(f'FINAL RESULTS')
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

In [None]:
import GPyOpt
from GPyOpt.methods import BayesianOptimization
import numpy as np

# Search space handed to the optimiser, one entry per hyperparameter.
# Categorical entries use integer codes that are decoded by the mappings below.
# NOTE(review): confirm that the optimiser honours the 'transformation' key;
# if it does not, C is searched on a linear scale over (1e-4, 1e4).
domain = [
    dict(name='C', type='continuous', domain=(1e-4, 1e4), transformation='log'),
    dict(name='solver', type='categorical', domain=(0, 1)),
    dict(name='penalty', type='categorical', domain=(0, 1, 2, 3)),
    dict(name='l1_ratio', type='continuous', domain=(0, 1)),
]

# Integer code -> actual argument value for the categorical variables
solver_mapping = dict(enumerate(['liblinear', 'saga']))
penalty_mapping = dict(enumerate(['l2', 'l1', 'elasticnet', None]))


def model_score(model, X_train, X_test, y_train, y_test, cv=5):
    """Return the mean cross-validated score of ``model`` at each learning-curve size.

    Parameters
    ----------
    model : estimator
        The estimator to evaluate (it is the one passed in, not a notebook global).
    X_train, y_train : array-like
        Data the learning curve is computed on.
    X_test, y_test : array-like
        Accepted for interface compatibility; not used.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    ndarray
        Mean validation score for each of the 5 training sizes (10% .. 100%),
        in increasing order of training size.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(.1, 1.0, 5))

    # Average over the folds (axis 1), leaving one value per training size
    test_scores_mean = np.mean(test_scores, axis=1)

    return test_scores_mean

def is_valid_combination(params):
    """Return True when the solver/penalty pair in ``params`` can be fitted.

    ``params`` is a dict with the keys 'solver' and 'penalty' and, optionally,
    'l1_ratio'. A set ``l1_ratio`` does not invalidate a combination by itself;
    it is only required (non-None) when the penalty is 'elasticnet'.
    Solvers not listed in the table are only checked against the elasticnet rule.
    """
    # Penalties accepted by the two solvers searched in this notebook
    supported_penalties = {
        'liblinear': ('l1', 'l2'),
        'saga': ('l1', 'l2', 'elasticnet', None),
    }
    solver = params['solver']
    penalty = params['penalty']

    if solver in supported_penalties and penalty not in supported_penalties[solver]:
        return False
    if penalty == 'elasticnet':
        # elasticnet is saga-only and needs a mixing ratio
        return solver == 'saga' and params.get('l1_ratio') is not None
    return True


# Objective function
def objective_function(x):
    """Score one hyperparameter point proposed by the optimiser.

    Parameters
    ----------
    x : 2-D array-like
        A single row ``[C, solver_code, penalty_code, l1_ratio]``; the codes are
        decoded with ``solver_mapping`` and ``penalty_mapping``.

    Returns
    -------
    float
        The cross-validated score at the largest training size, or ``0.0`` for
        an invalid combination. The score is returned un-negated because the
        optimiser in this notebook is created with ``maximize=True``.
    """
    candidate = x[0]
    params = {
        'C': float(candidate[0]),
        'solver': solver_mapping[int(candidate[1])],
        'penalty': penalty_mapping[int(candidate[2])],
        'l1_ratio': float(candidate[3])
    }
    if not is_valid_combination(params):
        # A finite worst-case value keeps the optimiser's history free of NaN
        return 0.0
    if params['penalty'] != 'elasticnet':
        # l1_ratio only has an effect with the elasticnet penalty
        params['l1_ratio'] = None
    model = LogisticRegression(class_weight='balanced', max_iter=max_iter, **params)
    scores = model_score(model, X_train, X_test, y_train, y_test)
    # model_score yields one value per training size; the optimiser needs a single
    # number, so use the last one (largest training size).
    return float(np.ravel(scores)[-1])

# Gaussian-process surrogate with expected-improvement acquisition;
# maximize=True asks the optimiser to maximise the objective's return value.
bo = BayesianOptimization(
    f=objective_function,
    domain=domain,
    model_type='GP',
    acquisition_type='EI',
    maximize=True,
)
bo.run_optimization(max_iter=50)

# Best point found, in the encoded order of `domain`
print(bo.x_opt)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForest model (seeded so the forest, and therefore the
# scores and importances below, are reproducible)
rf_model = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Learning curve
train_sizes, train_scores, test_scores = learning_curve(
    rf_model, X_train, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(.1, 1.0, 5))

train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature Importances
feature_importances = rf_model.feature_importances_

# X was built from ML_columns with 'loan_status_num' dropped, so the names paired
# with the importances must drop it as well; otherwise every name after the
# target's position is shifted by one.
feature_names = [column for column in ML_columns if column != 'loan_status_num']
assert len(feature_names) == len(feature_importances), "feature names and importances are misaligned"

# Combining feature names and their corresponding importances
features, importances = zip(*sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True))

# Plotting
fig, ax = plt.subplots(1, 3, figsize=[15*1.62, 5])

# Confusion Matrix (each row normalised to sum to 1)
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix')
ax[0].set_ylabel('Actual Labels')
ax[0].set_xlabel('Predicted Labels')

# Learning Curve
ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
ax[1].set_title('Learning Curve')
ax[1].set_xlabel('Training Examples')
ax[1].set_ylabel('Score')
ax[1].legend(loc="best")

# Feature Importances
sns.barplot(x=importances, y=features, edgecolor="black", ax=ax[2])
ax[2].set_title("Feature Importances")
ax[2].set_xlabel('Relative Importance')
ax[2].set_ylabel('Features')

fig.tight_layout()

In [None]:
from sklearn.neural_network import MLPClassifier

# Uses the X_train / X_test / y_train / y_test splits prepared earlier in the notebook.

# One hidden layer of 100 ReLU units, trained with Adam for at most 100 iterations
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=100,
    random_state=42,
)

# Fit on the training split, then predict the held-out split
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)

# Cross-validated learning curve at 5 training sizes between 10% and 100%
train_sizes, train_scores, test_scores = learning_curve(
    mlp_model,
    X_train,
    y_train,
    cv=5,
    train_sizes=np.linspace(.1, 1.0, 5),
)

# Average over the folds, one value per training size
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

# Headline metrics on the held-out split
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Two panels: confusion matrix and learning curve
fig, ax = plt.subplots(1, 2, figsize=[10*1.62, 5])

# Row-normalised confusion matrix (each actual class sums to 100%)
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=ax[0])
ax[0].set(title='Confusion Matrix', ylabel='Actual Labels', xlabel='Predicted Labels')

# Training vs. cross-validation score as the training set grows
ax[1].plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax[1].plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
ax[1].set(title='Learning Curve', xlabel='Training Examples', ylabel='Score')
ax[1].legend(loc="best")

fig.tight_layout()
