In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Read Prep Data

In [4]:
# df_selected_small.to_parquet(r"C:\Users\Forcessofnature\Downloads\df_small.parquet")
data_encoded = pd.read_parquet(r"C:\Users\Forcessofnature\Downloads\df_small_encoded.parquet")

## Set up Features and Target

In [5]:
# Step 4: Prepare Data for Logistic Regression
X = data_encoded.drop(['application_approved'], axis=1)  # Features
y = data_encoded['application_approved']  # Target variable

In [6]:
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
#fill any nan values in y with 0
y_train = y_train.fillna(0)

#any nan values in X_train fill with 0
X_train = X_train.fillna(0)

#any nan values in X_train fill with 0
X_train = X_train.fillna(0)

#any nan values in X_test fill with 0
X_test = X_test.fillna(0)


In [8]:
from sklearn.linear_model import LogisticRegression

# Define and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix

              precision    recall  f1-score   support

           0       0.36      0.02      0.04    153621
           1       0.89      1.00      0.94   1245634

    accuracy                           0.89   1399255
   macro avg       0.63      0.51      0.49   1399255
weighted avg       0.83      0.89      0.84   1399255



In [10]:
# Extract coefficients and corresponding feature names
coefficients = model.coef_[0]
feature_names = X.columns

# Create a DataFrame to display coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort coefficients by absolute value to identify significant variables
coefficients_df['Abs_Coefficient'] = abs(coefficients_df['Coefficient'])
coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

# Add intercept to the DataFrame
intercept_row = pd.DataFrame({'Feature': 'Intercept', 'Coefficient': intercept, 'Abs_Coefficient': abs(intercept)}, index=[0])
coefficients_df = pd.concat([intercept_row, coefficients_df]).reset_index(drop=True)

# Display coefficients and feature names
print(coefficients_df)

                                               Feature  Coefficient  \
52                                       state_code_TX    -0.448948   
4                                           income_log     0.390970   
50         race_ethnicity_White|Not Hispanic or Latino     0.373799   
57                                  activity_year_2022    -0.354838   
43   race_ethnicity_Race Not Available|Hispanic or ...    -0.274510   
..                                                 ...          ...   
250                                  county_code_42041     0.000000   
76                                   county_code_05073     0.000000   
152                                  county_code_13297     0.000000   
138                                  county_code_08043     0.000000   
166                                  county_code_30081     0.000000   

     Abs_Coefficient  
52          0.448948  
4           0.390970  
50          0.373799  
57          0.354838  
43          0.274510  
..       

In [20]:
coefficients_df.to_csv(r"C:\Users\Forcessofnature\Downloads\coef.csv")

In [11]:
data_encoded.columns

Index(['application_approved', 'loan_amount', 'property_value', 'loan_term',
       'interest_rate', 'income_log', 'loan_to_value', 'debt_to_income',
       'race_ethnicity_2 or more minority races|Free Form Text Only',
       'race_ethnicity_2 or more minority races|Hispanic or Latino',
       ...
       'derived_sex_Male', 'derived_sex_Sex Not Available',
       'applicant_age_25-34', 'applicant_age_35-44', 'applicant_age_45-54',
       'applicant_age_55-64', 'applicant_age_65-74', 'applicant_age_9999',
       'applicant_age_<25', 'applicant_age_>74'],
      dtype='object', length=573)

## Random Forest

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [29]:
# Assuming X contains your features and y contains your target variable

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)


In [14]:
# Make predictions on the testing data
y_pred = rf_model.predict(X_test)


In [15]:
# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.98      0.97    153621
           1       1.00      1.00      1.00   1245634

    accuracy                           0.99   1399255
   macro avg       0.98      0.99      0.98   1399255
weighted avg       0.99      0.99      0.99   1399255



In [21]:

# Assuming you have already trained the Random Forest model (rf_model) and X_train contains your training features

# Extract feature importances
feature_importances = rf_model.feature_importances_

# Extract feature names
feature_names = X_train.columns

# Create a DataFrame to display feature importances and feature names
feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort feature importances
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Display feature importances
print(feature_importances_df)


               Feature  Importance
3        interest_rate    0.651533
5        loan_to_value    0.058696
2            loan_term    0.047473
6       debt_to_income    0.046865
1       property_value    0.042515
..                 ...         ...
141  county_code_09001    0.000000
76   county_code_05073    0.000000
77   county_code_05091    0.000000
253  county_code_42115    0.000000
152  county_code_13297    0.000000

[572 rows x 2 columns]


In [22]:
feature_importances_df.to_csv(r"C:\Users\Forcessofnature\Downloads\rf_coef.csv")