In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Read Prepared Data

In [20]:
# Load the pre-encoded modelling table from a local parquet file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where that file exists; consider a configurable data dir.
# Kept for reference, the (disabled) export of the un-encoded frame:
# df_selected_small.to_parquet(r"C:\Users\Forcessofnature\Downloads\df_small.parquet")
ENCODED_PARQUET_PATH = r"C:\Users\Forcessofnature\Downloads\df_small_encoded.parquet"
data_encoded = pd.read_parquet(ENCODED_PARQUET_PATH)

## Set up Features and Target

In [21]:
# Step 4: separate the label from the predictors.
# The target column is named once so both selections stay in sync.
target_col = 'application_approved'
y = data_encoded[target_col]                 # target variable
X = data_encoded.drop(columns=[target_col])  # every remaining column is a feature

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [23]:
# Replace missing values with 0 before the model is fitted.
# fillna returns a new object, so re-running this cell is harmless.
# (The original cell filled X_train twice; the copy-pasted duplicate is removed.)

# Features: fill NaN with 0 in both splits.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Training labels: a missing label is treated as class 0.
# NOTE(review): y_test is not filled here -- confirm the target has no NaN in
# the test split, and that mapping unknown labels to 0 is intended.
y_train = y_train.fillna(0)


In [24]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 1000 (scikit-learn's documented default is 100) to
# give the solver more iterations.
lr_settings = {"max_iter": 1000}
model = LogisticRegression(**lr_settings)
model.fit(X_train, y_train)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the fitted logistic regression on the held-out split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 and overall accuracy.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are the true classes, columns the predicted classes.
# (The original cell imported confusion_matrix and announced this step in a
# comment but never printed it.)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.01      0.02    153621
           1       0.89      1.00      0.94   1245634

    accuracy                           0.89   1399255
   macro avg       0.57      0.50      0.48   1399255
weighted avg       0.82      0.89      0.84   1399255



In [26]:
# Pair each feature name with its fitted logistic-regression weight.
feature_names = X.columns
coefficients = model.coef_[0]  # row 0 of the coefficient matrix

# Build the table, add the magnitude of each weight, and rank by it so the
# largest-magnitude weights come first.
# NOTE(review): magnitudes are only comparable if the features share a scale;
# no scaling step is visible in this part of the notebook -- confirm upstream.
coefficients_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Show the ranked table.
print(coefficients_df)

                                              Feature   Coefficient  \
52                                      state_code_TX -4.202863e-01   
4                                          income_log  3.843229e-01   
50        race_ethnicity_White|Not Hispanic or Latino  3.533839e-01   
57                                 activity_year_2022 -3.332337e-01   
43  race_ethnicity_Race Not Available|Hispanic or ... -2.570039e-01   
3                                       interest_rate  2.228447e-01   
53                                      state_code_WA  1.621884e-01   
56                                 activity_year_2021  1.610170e-01   
25  race_ethnicity_Black or African American|Not H... -1.439353e-01   
58                 occupancy_type_Principal Residence -1.340628e-01   
48            race_ethnicity_White|Hispanic or Latino -1.304856e-01   
55                                 activity_year_2020  1.216335e-01   
41  race_ethnicity_Race Not Available|Ethnicity No...  7.015834e-02   
20    

In [27]:
# Show the column labels of the encoded frame (the target plus every feature);
# as the cell's last expression, the Index is rendered as the cell output.
data_encoded.columns

Index(['application_approved', 'loan_amount', 'property_value', 'loan_term',
       'interest_rate', 'income_log', 'loan_to_value', 'debt_to_income',
       'race_ethnicity_2 or more minority races|Free Form Text Only',
       'race_ethnicity_2 or more minority races|Hispanic or Latino',
       'race_ethnicity_2 or more minority races|Joint',
       'race_ethnicity_2 or more minority races|Not Hispanic or Latino',
       'race_ethnicity_American Indian or Alaska Native|Ethnicity Not Available',
       'race_ethnicity_American Indian or Alaska Native|Free Form Text Only',
       'race_ethnicity_American Indian or Alaska Native|Hispanic or Latino',
       'race_ethnicity_American Indian or Alaska Native|Joint',
       'race_ethnicity_American Indian or Alaska Native|Not Hispanic or Latino',
       'race_ethnicity_Asian|Ethnicity Not Available',
       'race_ethnicity_Asian|Free Form Text Only',
       'race_ethnicity_Asian|Hispanic or Latino', 'race_ethnicity_Asian|Joint',
       'race_e

## Random Forest

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [29]:
# Re-create the 80/20 split for the random forest from X (features) and y (target).
# The same test_size and random_state as the earlier split are used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Splitting again starts from the raw X / y, which throws away the zero-filling
# applied before the logistic regression. Re-apply the same imputation so both
# models are trained and scored on identically prepared data.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)


In [30]:
# Configure a 100-tree random forest; the fixed seed makes the fit repeatable.
rf_settings = dict(n_estimators=100, random_state=42)
rf_model = RandomForestClassifier(**rf_settings)

# Fit the forest on the training split.
rf_model.fit(X_train, y_train)


In [31]:
# Predict labels for the held-out split with the fitted random forest.
# Note: this rebinds y_pred, which previously held the logistic-regression
# predictions, so those are no longer available after this cell runs.
y_pred = rf_model.predict(X_test)


In [32]:
# Summarise random-forest performance per class on the test split.
rf_report = classification_report(y_test, y_pred)
print(rf_report)


              precision    recall  f1-score   support

           0       0.96      0.97      0.97    153621
           1       1.00      1.00      1.00   1245634

    accuracy                           0.99   1399255
   macro avg       0.98      0.99      0.98   1399255
weighted avg       0.99      0.99      0.99   1399255

