In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the normalized DataFrame from the CSV file
normalized_df = pd.read_csv('BLA.csv')

# Specify target column name
target_column = 'Default'  # Replace 'Default' with the name of your target column

# Split the data into features (X) and target variable (y)
X = normalized_df.drop(columns=[target_column])
y = normalized_df[target_column]

# Add constant to features for the intercept term
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the summary of the regression model
regression_summary = model.summary()

# Display the regression summary
print("Regression Summary:")
print(regression_summary)


Regression Summary:
                            OLS Regression Results                            
Dep. Variable:                Default   R-squared:                       0.082
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     989.8
Date:                Tue, 09 Apr 2024   Prob (F-statistic):               0.00
Time:                        14:27:25   Log-Likelihood:                -60753.
No. Observations:              255327   AIC:                         1.216e+05
Df Residuals:                  255303   BIC:                         1.218e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

## Gradient Boosting

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import randint
import multiprocessing

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Apply SMOTE to the training data only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': [0.1, 0.05, 0.01]
}

# Initialize Gradient Boosting classifier
classifier = GradientBoostingClassifier(random_state=42)

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=classifier, param_distributions=param_grid, n_iter=5, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Make predictions on the validation data using the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)

# Generate classification report
report = classification_report(y_val, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 152}
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     45188
           1       0.93      0.88      0.90     45084

    accuracy                           0.91     90272
   macro avg       0.91      0.91      0.91     90272
weighted avg       0.91      0.91      0.91     90272

Confusion Matrix:
[[42340  2848]
 [ 5523 39561]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_model.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


## Cat Boost Classifier


The CatBoost Classifier is a gradient boosting algorithm specifically designed to work well with categorical features. It's an efficient implementation of gradient boosting for decision trees and is particularly useful for datasets with categorical features and large numbers of observations. CatBoost handles categorical features internally without the need for preprocessing like one-hot encoding.



In [None]:
!pip install catboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoost Classifier
catboost = CatBoostClassifier()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128],
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=catboost, param_distributions=param_dist, n_iter=20, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_catboost = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_catboost.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


0:	learn: 0.6094321	total: 61.5ms	remaining: 18.4s
1:	learn: 0.5420694	total: 122ms	remaining: 18.1s
2:	learn: 0.4926297	total: 179ms	remaining: 17.7s
3:	learn: 0.4565312	total: 253ms	remaining: 18.7s
4:	learn: 0.4273003	total: 315ms	remaining: 18.6s
5:	learn: 0.4034218	total: 371ms	remaining: 18.2s
6:	learn: 0.3854213	total: 427ms	remaining: 17.9s
7:	learn: 0.3707935	total: 482ms	remaining: 17.6s
8:	learn: 0.3603769	total: 542ms	remaining: 17.5s
9:	learn: 0.3509116	total: 602ms	remaining: 17.5s
10:	learn: 0.3441678	total: 656ms	remaining: 17.2s
11:	learn: 0.3388437	total: 713ms	remaining: 17.1s
12:	learn: 0.3340740	total: 774ms	remaining: 17.1s
13:	learn: 0.3304166	total: 834ms	remaining: 17s
14:	learn: 0.3275213	total: 891ms	remaining: 16.9s
15:	learn: 0.3249003	total: 984ms	remaining: 17.5s
16:	learn: 0.3229320	total: 1.08s	remaining: 18s
17:	learn: 0.3209652	total: 1.22s	remaining: 19s
18:	learn: 0.3194361	total: 1.33s	remaining: 19.6s
19:	learn: 0.3182188	total: 1.46s	remaining: 2

In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Make predictions for new loan applications
new_predictions = best_catboost.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## DT


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import multiprocessing
from sklearn.tree import DecisionTreeClassifier

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Initialize Decision Tree classifier
dt = DecisionTreeClassifier()

# Apply SMOTE to the training data only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Evaluate model on validation set using best parameters
best_dt = random_search.best_estimator_
y_pred_val = best_dt.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)

Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None, 'criterion': 'entropy'}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.86      0.87      0.86     45188
           1       0.87      0.85      0.86     45084

    accuracy                           0.86     90272
   macro avg       0.86      0.86      0.86     90272
weighted avg       0.86      0.86      0.86     90272


Confusion Matrix:
[[39193  5995]
 [ 6617 38467]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_dt.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)



Predictions for new loan applications: [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## DT + AdaBoost



The AdaBoost Classifier is another popular ensemble learning method that builds a strong classifier by combining multiple weak classifiers. It works by sequentially adding weak learners to the ensemble, with each one correcting the errors made by its predecessors. AdaBoost is particularly effective for binary classification tasks and is capable of capturing complex decision boundaries.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base classifier (decision tree)
base_classifier = DecisionTreeClassifier()

# Initialize AdaBoost Classifier with the base classifier
adaboost = AdaBoostClassifier(base_estimator=base_classifier)

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=adaboost, param_distributions=param_dist, n_iter=20, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_adaboost = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_adaboost.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)




Best Parameters: {'n_estimators': 100, 'learning_rate': 1.0}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     45050
           1       0.20      0.23      0.21      6016

    accuracy                           0.80     51066
   macro avg       0.55      0.55      0.55     51066
weighted avg       0.81      0.80      0.81     51066


Confusion Matrix:
[[39489  5561]
 [ 4636  1380]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_adaboost.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)


Predictions for new loan applications: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


## NB random


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from scipy.stats import randint
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Naive Bayes classifier (MultinomialNB)
nb = MultinomialNB()

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=nb, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data (No need to tune hyperparameters for Naive Bayes)
random_search.fit(X_train, y_train)

# Print the best parameters found (not applicable for Naive Bayes)
print("Best Parameters:", random_search.best_params_)

# Get the best model (No hyperparameters to tune for Naive Bayes)
best_nb = random_search.best_estimator_

# Evaluate model on validation set (No hyperparameters to tune for Naive Bayes)
y_pred_val = best_nb.predict(X_val)
print("Validation Set Performance:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)





Best Parameters: {}
Validation Set Performance:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     45050
           1       0.00      0.00      0.00      6016

    accuracy                           0.88     51066
   macro avg       0.44      0.50      0.47     51066
weighted avg       0.78      0.88      0.83     51066


Confusion Matrix:
[[45050     0]
 [ 6016     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_nb.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

## Naive Bayes


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
import multiprocessing
import pandas as pd

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Apply SMOTE to the training data only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Fit the classifier to the training data
classifier.fit(X_train, y_train)

# Make predictions on the validation data
y_pred = classifier.predict(X_val)

# Generate classification report
report = classification_report(y_val, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.71      0.72     45188
           1       0.72      0.75      0.73     45084

    accuracy                           0.73     90272
   macro avg       0.73      0.73      0.73     90272
weighted avg       0.73      0.73      0.73     90272

Confusion Matrix:
[[32151 13037]
 [11356 33728]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_adaboost.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


## LR edited


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import uniform

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Apply standardization to the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
logreg = LogisticRegression()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'C': uniform(loc=0.01, scale= 10)
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=logreg, param_distributions=param_dist, n_iter=20, cv=5, scoring='f1', random_state=42)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_logreg = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_logreg.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)


Best Parameters: {'C': 1.49816047538945}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45050
           1       0.62      0.04      0.07      6016

    accuracy                           0.88     51066
   macro avg       0.75      0.52      0.50     51066
weighted avg       0.85      0.88      0.84     51066


Confusion Matrix:
[[44914   136]
 [ 5796   220]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Apply standardization to the new applicants data
new_applicants_scaled = scaler.transform(new_applicants)

# Make predictions for new loan applications
new_predictions = logreg.predict(new_applicants_scaled)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)


## Logistic Regression


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize logistic regression classifier
classifier = LogisticRegression()

# Fit the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data probabilities
y_prob = classifier.predict_proba(X_test)

# Manually adjust the threshold (for example, to 0.3)
threshold = 0.08

y_pred_adjusted = (y_prob[:, 1] >= threshold).astype(int)

# Generate classification report
report = classification_report(y_test, y_pred_adjusted)
print("Classification Report with Threshold Adjustment:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)
print("\nConfusion Matrix with Threshold Adjustment:")
print(conf_matrix)


Classification Report with Threshold Adjustment:
              precision    recall  f1-score   support

           0       0.95      0.51      0.67     45050
           1       0.18      0.81      0.30      6016

    accuracy                           0.55     51066
   macro avg       0.57      0.66      0.48     51066
weighted avg       0.86      0.55      0.62     51066


Confusion Matrix with Threshold Adjustment:
[[23118 21932]
 [ 1136  4880]]
[[23118 21932]
 [ 1136  4880]]


In [None]:
# Preprocess new loan applications
new_applicants = pd.read_csv("NewApplicants.csv")
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

## LR edited++

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import uniform
from imblearn.over_sampling import SMOTE

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Apply standardization to the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handling Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split the resampled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
logreg = LogisticRegression()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'C': uniform(loc=0.01, scale=10)
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=logreg, param_distributions=param_dist, n_iter=20, cv=5, scoring='f1', random_state=42)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_logreg = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_logreg.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)


Best Parameters: {'C': 3.7554011884736247}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70     45188
           1       0.69      0.72      0.71     45084

    accuracy                           0.70     90272
   macro avg       0.70      0.70      0.70     90272
weighted avg       0.70      0.70      0.70     90272


Confusion Matrix:
[[30997 14191]
 [12771 32313]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]


## RF edited


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from scipy.stats import randint
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf = RandomForestClassifier()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'n_estimators': [int(x) for x in range(100, 1000, 100)],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_rf = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_rf.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("\nConfusion Matrix:")
print(conf_matrix)



Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45050
           1       0.64      0.05      0.09      6016

    accuracy                           0.88     51066
   macro avg       0.76      0.52      0.51     51066
weighted avg       0.86      0.88      0.84     51066


Confusion Matrix:
[[44885   165]
 [ 5726   290]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_rf.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTE for over-sampling only the minority class (positive class)
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE to the training data only
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the resampled training data
classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88     45050
           1       0.25      0.37      0.30      6016

    accuracy                           0.79     51066
   macro avg       0.58      0.61      0.59     51066
weighted avg       0.83      0.79      0.81     51066


Confusion Matrix:
[[38226  6824]
 [ 3774  2242]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]


## RF GridSearch


latest runtime: 4hrs

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTE for over-sampling only the minority class (positive class)
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE to the training data only
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define hyperparameters for tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Define hyperparameters for tuning
param_grid = {
    'n_estimators': [100, 200],  # Reduced number of estimators
    'max_depth': [5, 10],  # Reduced depth
    'min_samples_split': [2, 5],  # Reduced number of splits
    'min_samples_leaf': [1]  # Kept only one value for leaf samples
}

# Perform Grid Search with cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best model from Grid Search
best_rf_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred = best_rf_model.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.72      0.81     45050
           1       0.22      0.59      0.32      6016

    accuracy                           0.70     51066
   macro avg       0.57      0.65      0.56     51066
weighted avg       0.84      0.70      0.75     51066


Confusion Matrix:
[[32345 12705]
 [ 2492  3524]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]


## Svm

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Load the modified DataFrame
normalized_df = pd.read_csv('BLA.csv')

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=['Default'])  # Features
y = normalized_df['Default']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTE for over-sampling only the minority class (positive class)
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE to the training data only
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize Support Vector Machine (SVM) classifier
classifier = SVC(kernel='rbf', random_state=42)

# Fit the classifier on the resampled training data
classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.72      0.81     45050
           1       0.22      0.58      0.31      6016

    accuracy                           0.70     51066
   macro avg       0.57      0.65      0.56     51066
weighted avg       0.84      0.70      0.75     51066


Confusion Matrix:
[[32261 12789]
 [ 2502  3514]]


In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]


## **SVM_edited**

takes too long, record: 13 hours runtime disconnect

my soul died a little running this


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from scipy.stats import randint
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SVM classifier
svm = SVC()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [2],
    'coef0': [0.0]
}
param_dist_reduced = {
    'C': [0.1, 1, 10],  # Reduced range of values for C
    'kernel': ['linear', 'rbf'],  # Focus on simpler kernels
    'gamma': ['scale'],  # Only one option for gamma
    'degree': [2],  # Only one degree for polynomial kernel
    'coef0': [0.0]  # Only one value for coef0
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=svm, param_distributions=param_dist, n_iter=4, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_svm = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_svm.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = classifier.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

## Neural Network


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import multiprocessing

# Load data
normalized_df = pd.read_csv('BLA.csv')

# Check for and drop rows with missing target values
normalized_df = normalized_df.dropna(subset=['Default'])

# Define the features (X) and target variable (y)
X = normalized_df.drop(columns=["Default"])  # Features
y = normalized_df["Default"]  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Multi-layer Perceptron (Neural Network) classifier
mlp = MLPClassifier()

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'hidden_layer_sizes': [(100,), (50, 50), (100, 50), (50, 25)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

# Initialize RandomizedSearchCV with parallel processing
n_cores = multiprocessing.cpu_count()
random_search = RandomizedSearchCV(estimator=mlp, param_distributions=param_dist, n_iter=20, cv=3, scoring='f1', random_state=42, n_jobs=n_cores)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_mlp = random_search.best_estimator_

# Evaluate model on validation set using best parameters
y_pred_val = best_mlp.predict(X_val)
print("Validation Set Performance with Best Parameters:")
print(classification_report(y_val, y_pred_val))




Best Parameters: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 25), 'alpha': 0.0001, 'activation': 'tanh'}
Validation Set Performance with Best Parameters:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45050
           1       0.50      0.10      0.17      6016

    accuracy                           0.88     51066
   macro avg       0.70      0.54      0.55     51066
weighted avg       0.85      0.88      0.85     51066



In [None]:
# Load new loan applications data
new_applicants = pd.read_csv("NewApplicants.csv")

# Drop the 'Default' column if it exists in the new applicants dataset
if 'Default' in new_applicants.columns:
    new_applicants = new_applicants.drop(columns=['Default'])

# Make predictions for new loan applications
new_predictions = best_mlp.predict(new_applicants)

# Print predictions for new loan applications
print("Predictions for new loan applications:", new_predictions)

Predictions for new loan applications: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
