In [None]:
!pip install keras
!pip install pandas
!pip install matplotlib
!pip install sklearn
!pip install seaborn
!pip install tensorflow
!pip install plotly

# Supervised ML Technique: Artificial Neural Networks


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tensorflow

## Iteration 1: Numerical Features

In [None]:
#Numerical Features
# Iteration 1 data preparation: load the cleaned loan data, separate the
# target from the features and create a 70/30 train/test split.
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('/Users/constanciasoares/Downloads/cleanLoanData.csv')

# Strip stray whitespace from the column headers before looking columns up.
data.columns = data.columns.str.strip()

# Display dataset columns for verification
print("Dataset columns:")
print(data.columns)

# Target column.
y = data['Risk_Flag']

# Features: remove the listed categorical columns AND the target itself.
# Keeping 'Risk_Flag' inside X would hand the label to the model as an input
# (target leakage) and make every downstream metric meaningless.
# NOTE(review): 'Married.Single' and 'House_Ownership' are one-hot encoded in
# later iterations but are kept unchanged here -- confirm they are numeric in
# the cleaned CSV.
X = data.drop(columns=['Profession', 'STATE', 'Income_Level', 'Age_Group', 'Risk_Flag'])

print("\nFeatures (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the split sizes instead of dumping the whole frame and series.
print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)


In [None]:
# Class balance of the target: bar chart of the raw counts first ...
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title('Distribution of Risk_Flag (Target)')
plt.show()

# ... followed by the same information as relative frequencies.
class_shares = y.value_counts(normalize=True)
print(class_shares)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the training split purely to rank the features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# One importance value per column of X, in column order.
feature_importances = rf_model.feature_importances_
features = X.columns

# Plot feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=features)
plt.title('Feature Importance')
# Save BEFORE showing: once plt.show() has displayed the figure it is no
# longer the current figure, so a later savefig would write an empty image.
plt.savefig('importance.png')
plt.show()


In [None]:
# Number of input features = number of columns in the feature matrix X.
num_features = len(X.columns)
print(num_features)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Feed-forward binary classifier: three ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
model = Sequential()

# Declare the input width explicitly (one value per column of X_train) so the
# model is built immediately and model.summary() can report layer shapes and
# parameter counts instead of running on an unbuilt model.
model.add(Input(shape=(X_train.shape[1],)))

# Build the model with hyperparameters (placeholders)
model.add(Dense(num_features, activation="relu"))
model.add(Dropout(0.3))

model.add(Dense(39, activation="relu"))
model.add(Dropout(0.4))

model.add(Dense(19, activation="relu"))
model.add(Dropout(0.2))

# Sigmoid output pairs with the binary cross-entropy loss below.
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 100 epochs; the test split is passed as validation data, so the
# returned history carries val_* metrics for every epoch.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, verbose=1)


In [None]:
# Sigmoid outputs of the trained network for the test rows.
y_pred = model.predict(X_test, verbose=0)
print(y_pred)

# Hard 0/1 labels: 1 wherever the score is at least 0.5.
above_threshold = y_pred >= 0.5
binary_predictions = above_threshold.astype(int)
print(binary_predictions)

In [None]:
# Same scoring step as the previous cell: test-set scores from the network,
# then 0/1 labels using a 0.5 cut-off.
y_pred = model.predict(X_test, verbose=0)
print(y_pred)
binary_predictions = np.greater_equal(y_pred, 0.5).astype(int)
print(binary_predictions)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Calculate the ROC curve from the predicted probabilities (y_pred), not from
# the thresholded 0/1 labels: hard labels collapse the curve to a single
# operating point and give a misleading AUC.
fpr, tpr, thresholds = roc_curve(y_test, y_pred.ravel())

# Calculate the AUC
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line (random guessing), as in the random-forest ROC plots.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Report the last-epoch value of each tracked metric from the fit history.
for label, metric_key in (
    ("Training Accuracy:", 'accuracy'),
    ("Validation Accuracy:", 'val_accuracy'),
    ("Loss:", 'loss'),
):
    print(label, history.history[metric_key][-1])

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Counts of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, binary_predictions)

# Draw the count matrix as an annotated heatmap on its own 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels='auto', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Per-class text summary of the same predictions.
report = classification_report(y_test, binary_predictions, target_names=None)
print("\nClassification Report:")
print(report)

## Iteration 2: Top 5 Most Important Features

In [None]:
#Top 5 Most Important Features
# Iteration 2 data preparation: reload the cleaned data and keep only five
# columns as model inputs.
import pandas as pd
from sklearn.model_selection import train_test_split


data = pd.read_csv('/Users/constanciasoares/Downloads/cleanLoanData.csv')
data.columns = data.columns.str.strip()

# Only these five raw columns feed the model in this iteration, so no one-hot
# encoding is needed here (the previously built encoded frame was never used).
top_features = ['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'ESI']
X = data[top_features]

y = data['Risk_Flag']

# Verify the selection by printing the columns that actually go into X
print(X.columns)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())


In [None]:
# Unit count of the first Dense layer in the model cells that follow.
# NOTE(review): hard-coded to 66 although the Iteration 2 feature matrix built
# above selects 5 columns; nothing reassigns it afterwards, so the Iteration 3
# model cell picks up this value as well -- confirm the intended width.
num_features = 66

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Feed-forward binary classifier: three ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
model = Sequential()

# Declare the input width from the training data itself so the model is built
# immediately and model.summary() can report shapes and parameter counts.
# num_features is NOT used for this: it only sets the first hidden layer's
# width and is not derived from X in this iteration.
model.add(Input(shape=(X_train.shape[1],)))

# Build the model with hyperparameters (placeholders)
model.add(Dense(num_features, activation="relu"))
model.add(Dropout(0.3))

model.add(Dense(39, activation="relu"))
model.add(Dropout(0.4))

model.add(Dense(19, activation="relu"))
model.add(Dropout(0.2))

# Sigmoid output pairs with the binary cross-entropy loss below.
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# 100 training epochs on the Iteration 2 split; the test split doubles as the
# validation set, which is what populates the val_* entries in history.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, verbose=1)


In [None]:
# Score the test rows with the Iteration 2 network.
y_pred = model.predict(X_test, verbose=0)
print(y_pred)

# Convert scores to 0/1 labels with a 0.5 cut-off.
is_positive = y_pred >= 0.5
binary_predictions = is_positive.astype(int)
print(binary_predictions)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Calculate the ROC curve from the predicted probabilities (y_pred), not from
# the thresholded 0/1 labels: hard labels collapse the curve to a single
# operating point and give a misleading AUC.
fpr, tpr, thresholds = roc_curve(y_test, y_pred.ravel())

# Calculate the AUC
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line (random guessing), as in the random-forest ROC plots.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


# Last-epoch metrics from the training history.
print("Training Accuracy:", history.history['accuracy'][-1])
print("Validation Accuracy:", history.history['val_accuracy'][-1])
print("Loss:", history.history['loss'][-1])

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Tabulate true vs. predicted labels for the Iteration 2 predictions.
cm = confusion_matrix(y_test, binary_predictions)

# Annotated heatmap of the counts on a dedicated 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels='auto', ax=ax)
ax.set(title='Confusion Matrix', ylabel='True Label', xlabel='Predicted Label')
plt.show()

# Text report for the same predictions.
print("\nClassification Report:")
summary_text = classification_report(y_test, binary_predictions, target_names=None)
print(summary_text)

## Iteration 3: All Features

In [None]:
#All features 
# Iteration 3 data preparation: one-hot encode the categorical columns and
# use every remaining column except the target as a feature.
import pandas as pd
from sklearn.model_selection import train_test_split


data = pd.read_csv('/Users/constanciasoares/Downloads/cleanLoanData.csv')
data.columns = data.columns.str.strip()

# One-Hot Encode. dtype=int keeps the dummy columns as 0/1 integers: recent
# pandas versions default to bool, and a frame that mixes bool and numeric
# columns converts to an object array, which Keras may fail to turn into a
# tensor when the frame is passed to model.fit.
data_encoded = pd.get_dummies(
    data,
    columns=['STATE', 'Married.Single', 'House_Ownership', 'Age_Group', 'Income_Level'],
    drop_first=True,
    dtype=int,
)

# Drop every column whose name starts with 'Profession' or 'STATE'
# (this also removes the STATE dummy columns created just above).
data_encoded = data_encoded.loc[:, ~data_encoded.columns.str.startswith('Profession')]
data_encoded = data_encoded.loc[:, ~data_encoded.columns.str.startswith('STATE')]

# All remaining columns except the target 'Risk_Flag' are features.
X = data_encoded.drop(columns=['Risk_Flag'])

y = data['Risk_Flag']



# Verify the changes by printing the columns
print(data_encoded.columns)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Feed-forward binary classifier: three ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
model = Sequential()

# Declare the input width from the training data itself so the model is built
# immediately and model.summary() can report shapes and parameter counts.
model.add(Input(shape=(X_train.shape[1],)))

# Build the model with hyperparameters (placeholders)
# NOTE(review): num_features is not set anywhere in Iteration 3; it is
# whatever the Iteration 2 cell left behind -- confirm the intended width.
model.add(Dense(num_features, activation="relu"))
model.add(Dropout(0.3))

model.add(Dense(39, activation="relu"))
model.add(Dropout(0.4))

model.add(Dense(19, activation="relu"))
model.add(Dropout(0.2))

# Sigmoid output pairs with the binary cross-entropy loss below.
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Fit the Iteration 3 network for 100 epochs, validating on the test split
# after every epoch.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, verbose=1)


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Score the test split with the model trained in THIS iteration. Without this
# step the cell would silently reuse the predictions left over from the
# Iteration 2 model.
y_pred = model.predict(X_test, verbose=0)
binary_predictions = (y_pred >= 0.5).astype(int)

# Calculate the ROC curve from the predicted probabilities, not from the
# thresholded 0/1 labels (hard labels give a single operating point only).
fpr, tpr, thresholds = roc_curve(y_test, y_pred.ravel())

# Calculate the AUC
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line (random guessing), as in the random-forest ROC plots.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


# Last-epoch metrics from the training history.
print("Training Accuracy:", history.history['accuracy'][-1])
print("Validation Accuracy:", history.history['val_accuracy'][-1])
print("Loss:", history.history['loss'][-1])

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Recompute the 0/1 predictions from the model trained in this iteration;
# Iteration 3 has no separate prediction cell, so binary_predictions could
# otherwise still hold the Iteration 2 model's output.
binary_predictions = (model.predict(X_test, verbose=0) >= 0.5).astype(int)

cm = confusion_matrix(y_test, binary_predictions)

# Create figure
plt.figure(figsize=(8, 6))

# Plot heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels='auto')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, binary_predictions, target_names=None))

# Supervised ML Technique: Random Forests

### Data Set Import

In [None]:
#All features 
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the cleaned loan data and strip whitespace from the headers.
data = pd.read_csv('/Users/constanciasoares/Downloads/cleanLoanData.csv')
data.columns = data.columns.str.strip()

# One-hot encode the categorical columns, dropping the first level of each.
categorical_columns = ['STATE', 'Married.Single', 'House_Ownership', 'Age_Group', 'Income_Level']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Remove every column whose name starts with 'Profession' or 'STATE'.
excluded = (
    data_encoded.columns.str.startswith('Profession')
    | data_encoded.columns.str.startswith('STATE')
)
data_encoded = data_encoded.loc[:, ~excluded]

# Target is 'Risk_Flag'; every other remaining column is a feature.
y = data['Risk_Flag']
X = data_encoded.drop(columns=['Risk_Flag'])

# Show which columns survived the encoding and filtering.
print(data_encoded.columns)

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())


### Iteration 1: Baseline Model With All Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the RandomForestClassifier. random_state is fixed, like every
# other model in this notebook, so the reported metrics are reproducible.
rfmodel = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfmodel.fit(X_train, y_train)

# Make predictions on the test set
rfpredictions = rfmodel.predict(X_test)

# Print the accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, rfpredictions))

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, rfpredictions))

# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rfpredictions))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion counts for the random-forest predictions on the test split.
cm = confusion_matrix(y_test, rfpredictions)

# Both axes use the same class names for their tick labels.
class_names = ['No Risk', 'Risky']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Reuse the baseline forest fitted earlier in this iteration (rfmodel) rather
# than training a second forest with different settings: the ROC curve should
# describe the same model as the accuracy, report and confusion matrix above.
rfprobs = rfmodel.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

fpr, tpr, thresholds = roc_curve(y_test, rfprobs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='green', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line for random guessing
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()


### Iteration 2: Hyperparameter Tuning with Top 5 Features

In [None]:
#Top 5 Most Important Features
# Random-forest Iteration 2 data preparation: reload the cleaned data and
# keep only five columns as model inputs.
import pandas as pd
from sklearn.model_selection import train_test_split


data = pd.read_csv('/Users/constanciasoares/Downloads/cleanLoanData.csv')
data.columns = data.columns.str.strip()

# Only these five raw columns feed the model in this iteration, so no one-hot
# encoding is needed here (the previously built encoded frame was never used).
top_features = ['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'ESI']
X = data[top_features]

y = data['Risk_Flag']

# Verify the selection by printing the columns that actually go into X
print(X.columns)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())


In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution for randomized search.
# 'auto' is intentionally not offered for max_features: current scikit-learn
# releases reject it, so every sampled candidate using it would fail to fit
# and waste one of the n_iter draws.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Randomized search: 10 sampled parameter settings, 3-fold cross-validation,
# all available cores.
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist,
                                   n_iter=10, cv=3, n_jobs=-1, random_state=42, verbose=2)

# Fit the model with randomized search to find the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Print the best parameters
print("Best Hyperparameters from RandomizedSearchCV:")
print(best_params)

# Make predictions on the test set using the best model
rf_predictions = best_model.predict(X_test)

# Print accuracy score
print("\nAccuracy Score:")
print(accuracy_score(y_test, rf_predictions))

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

# Predict probabilities for the positive class (class 1)
y_prob = best_model.predict_proba(X_test)[:, 1]  # probabilities for class 1

# Compute ROC curve and AUC (Area Under the Curve)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


### Iteration 3: Ensemble Approach (Bagging) with Full Features

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

# Rebuild the FULL feature matrix for this iteration. The split left over from
# Iteration 2 contains only the five selected columns, which would contradict
# the "Full Features" heading and plot title. The steps mirror the
# "Data Set Import" cell of this section.
full_encoded = pd.get_dummies(
    data,
    columns=['STATE', 'Married.Single', 'House_Ownership', 'Age_Group', 'Income_Level'],
    drop_first=True,
)
full_encoded = full_encoded.loc[:, ~full_encoded.columns.str.startswith('Profession')]
full_encoded = full_encoded.loc[:, ~full_encoded.columns.str.startswith('STATE')]

X = full_encoded.drop(columns=['Risk_Flag'])
y = data['Risk_Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Bagging model with Decision Trees as base learners
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(),
                                  n_estimators=350, random_state=42)

# Train the model
bagging_model.fit(X_train, y_train)

y_prob = bagging_model.predict_proba(X_test)[:, 1]  # probabilities for class 1

# Compute ROC curve and AUC (Area Under the Curve)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='green', lw=2, label=f'ROC curve (area = {roc_auc:.5f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Iteration 3: Ensemble Approach (Bagging) with Full Features')
plt.legend(loc="lower right")
plt.show()
