<a href="https://colab.research.google.com/github/MichaelRDionne/Caltech-AI-Machine-Learning-Bootcamp/blob/main/Deep_Learning_Keras_end_of_course_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [93]:
#Create a model that predicts whether or not a loan will be default using the historical data.
#Perform data preprocessing and build a deep learning prediction model.

In [94]:
!pip install scikeras


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [95]:
#imports
import pandas as pd
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasRegressor


# Load the dataset
loan_data_df = pd.read_csv('loan_data.csv')


In [96]:
#review the dataframe
loan_data_df.head()


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [97]:
#review the data types
loan_data_df.dtypes


credit.policy          int64
purpose               object
int.rate             float64
installment          float64
log.annual.inc       float64
dti                  float64
fico                   int64
days.with.cr.line    float64
revol.bal              int64
revol.util           float64
inq.last.6mths         int64
delinq.2yrs            int64
pub.rec                int64
not.fully.paid         int64
dtype: object

In [98]:
# Create a list of categorical variables 
categorical_variables = list (loan_data_df.dtypes[loan_data_df.dtypes == 'object'].index)

# Display the categorical variables list
categorical_variables

['purpose']

In [99]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse_output=False)


In [100]:
# Encode the categorical variables using OneHotEncoder
encoded_loan_data =  enc.fit_transform(loan_data_df[categorical_variables])

In [101]:
# Define the numerical variables list
numerical_variables = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid']

# Create a DataFrame with the encoded variables
encoded_loan_df = pd.DataFrame(
    encoded_loan_data,
    columns=enc.get_feature_names_out(categorical_variables)
)

# Merge the encoded categorical variables with the numerical variables
loan_data_df = pd.concat([loan_data_df[numerical_variables], encoded_loan_df], axis=1)

# Calculate the correlation matrix
correlation_matrix = loan_data_df.corr()

# Set the threshold value
correlation_threshold = 0.8

# Iterate through the correlation matrix
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > correlation_threshold:
            colname = correlation_matrix.columns[i]
            loan_data_df.drop(colname, axis=1, inplace=True)

# Review the DataFrame
loan_data_df.head()


Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# Define the target set 'y' by selecting the 'not.fully.paid' column
y =loan_data_df['not.fully.paid']

#display a sample of y
y[:5]


0    0
1    0
2    0
3    0
4    0
Name: not.fully.paid, dtype: int64

In [103]:
# Define the features set 'X' by selecting all columns except 'not.fully.paid'
X = loan_data_df.drop(columns=['not.fully.paid'])

#display a sample of X
X[:5]

Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# Assuming your dataset is a pandas DataFrame named 'encoded_loan_df'
X = loan_data_df.drop('not.fully.paid', axis=1)
y = loan_data_df['not.fully.paid']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing sets
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

def create_model(learning_rate, layer1_units, layer1_dropout, layer2_units, layer2_dropout, l1_l2_regularizer):
    model = Sequential()
    model.add(Dense(layer1_units, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=L1L2(l1=l1_l2_regularizer, l2=l1_l2_regularizer)))
    model.add(Dropout(layer1_dropout))
    model.add(Dense(layer2_units, activation='relu', kernel_regularizer=L1L2(l1=l1_l2_regularizer, l2=l1_l2_regularizer)))
    model.add(Dropout(layer2_dropout))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model creation function with KerasRegressor
model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=lambda: create_model(learning_rate=0.001, layer1_units=128, layer1_dropout=0.1, layer2_units=64, layer2_dropout=0.1, l1_l2_regularizer=0.001), verbose=0)
# Create a GridSearchCV object
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)

# Perform the grid search
grid_result = grid.fit(X_train_scaled, y_train)

# Print the best parameters and best score
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Train the final model using the best parameters
best_params = grid_result.best_params_

final_model = create_model(
    learning_rate=best_params['learning_rate'],
    layer1_units=best_params['layer1_units'],
    layer1_dropout=best_params['layer1_dropout'],
    layer2_units=best_params['layer2_units'],
    layer2_dropout=best_params['layer2_dropout'],
    l1_l2_regularizer=best_params['l1_l2_regularizer']
)

final_model.fit(
    X_train_scaled, y_train,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    verbose=1
)



  model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=lambda: create_model(learning_rate=0.001, layer1_units=128, layer1_dropout=0.1, layer2_units=64, layer2_dropout=0.1, l1_l2_regularizer=0.001), verbose=0)


ValueError: ignored