In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

%matplotlib inline

In [58]:
# Read the cleaned loan dataset from disk and preview its first rows.
DATA_PATH = 'data/cleaned_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,cb_person_default_on_file,cb_person_cred_hist_length
0,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,True,False,3
1,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,True,False,2
2,21,9900,OWN,2.0,VENTURE,A,2500,7.14,True,False,2
3,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,True,False,4
4,21,10000,OWN,2.0,HOMEIMPROVEMENT,A,4500,8.63,True,False,2


In [59]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22294 entries, 0 to 22293
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  22294 non-null  int64  
 1   person_income               22294 non-null  int64  
 2   person_home_ownership       22294 non-null  object 
 3   person_emp_length           22294 non-null  float64
 4   loan_intent                 22294 non-null  object 
 5   loan_grade                  22294 non-null  object 
 6   loan_amnt                   22294 non-null  int64  
 7   loan_int_rate               22294 non-null  float64
 8   loan_status                 22294 non-null  bool   
 9   cb_person_default_on_file   22294 non-null  bool   
 10  cb_person_cred_hist_length  22294 non-null  int64  
dtypes: bool(2), float64(2), int64(4), object(3)
memory usage: 1.6+ MB


In [60]:
# Columns that are never used as model inputs (they are prediction targets).
y_cols = ['loan_amnt', 'loan_int_rate', 'loan_grade']

# Split the remaining columns into numeric and non-numeric feature lists.
# 'number' matches every integer/float width; the 'int'/'float' aliases map to
# the platform default width and can miss e.g. int64 columns on platforms
# where that default is 32-bit, silently sending them to the categorical list.
feature_frame = df.drop(columns=y_cols)
num_features = feature_frame.select_dtypes(include='number').columns.tolist()
# Everything non-numeric (object and bool columns here) is treated as categorical.
cat_features = feature_frame.select_dtypes(exclude='number').columns.tolist()

In [61]:
# Standardize numeric columns; one-hot encode categorical columns, dropping the
# first level of each so the encoded columns are not perfectly collinear.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first')

column_steps = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [62]:
# Label: the loan amount. Features: every column that is not listed in y_cols.
y = df['loan_amnt']
X = df.drop(y_cols, axis=1)

In [63]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# identical across runs. (train_test_split is imported in the top import cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [64]:
# Fit the preprocessor on the training split only, then apply the already-fitted
# transformer to the test split so nothing is learned from the held-out rows.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [70]:
# Seed passed to every estimator that accepts random_state, so that
# Restart & Run All reproduces the metrics printed below.
SEED = 42

# (display name, estimator) pairs, all with default hyperparameters.
models = [
    ('LinearRegression', LinearRegression()),
    ('KNeighborsRegressor', KNeighborsRegressor()),
    ('SVR', SVR()),
    ('RandomForestRegressor', RandomForestRegressor(random_state=SEED)),
    ('AdaBoostRegressor', AdaBoostRegressor(random_state=SEED)),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=SEED)),
    ('XGBRegressor', XGBRegressor(random_state=SEED)),
]


def _regression_metrics(y_true, y_pred):
    """Return (r2, mae, mse, rmse) for the given true and predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
    )


# Fit each model on the preprocessed training split and report the same four
# metrics on both splits, so the train/test gap (overfitting) is visible.
for name, model in models:
    model.fit(X_train_scaled, y_train)

    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    r2_train, mae_train, mse_train, rmse_train = _regression_metrics(y_train, y_train_pred)
    r2_test, mae_test, mse_test, rmse_test = _regression_metrics(y_test, y_test_pred)

    print(name)

    print('Train')

    print('r2', r2_train)
    print('mae', mae_train)
    print('mse', mse_train)
    print('rmse', rmse_train)

    print('Test')

    print('r2', r2_test)
    print('mae', mae_test)
    print('mse', mse_test)
    print('rmse', rmse_test)

    print('=====================')


LinearRegression
Train
r2 0.1406729631547322
mae 4434.8122863069775
mse 33504495.695010733
rmse 5788.306807263307
Test
r2 0.1323572728295832
mae 4424.926392692251
mse 33436148.22150845
rmse 5782.399866967732
KNeighborsRegressor
Train
r2 0.4119420569050617
mae 3640.3538430637386
mse 22927923.8032271
rmse 4788.3111639937415
Test
r2 0.08779378994551057
mae 4518.555313944006
mse 35153480.91193259
rmse 5929.037098208493
SVR
Train
r2 -0.03359947062626456
mae 4667.09495552226
mse 40299242.92978884
rmse 6348.1684704951585
Test
r2 -0.029858647634022795
mae 4641.725040577014
mse 39687425.839195706
rmse 6299.795698210832
RandomForestRegressor
Train
r2 0.8899868535955913
mae 1557.1055716354667
mse 4289327.383009821
rmse 2071.069140084372
Test
r2 0.22182327028478688
mae 4169.200558862761
mse 29988417.65451237
rmse 5476.168154331308
AdaBoostRegressor
Train
r2 0.23154198459307196
mae 4338.831804482267
mse 29961582.9190231
rmse 5473.717467957503
Test
r2 0.20250524364880296
mae 4409.021066749627
mse 30