In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

%matplotlib inline

In [106]:
# Read the cleaned dataset from the project-relative data folder and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,cb_person_default_on_file,cb_person_cred_hist_length
0,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,True,False,3
1,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,True,False,2
2,21,9900,OWN,2.0,VENTURE,A,2500,7.14,True,False,2
3,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,True,False,4
4,21,10000,OWN,2.0,HOMEIMPROVEMENT,A,4500,8.63,True,False,2


In [107]:
# Print column names, non-null counts and dtypes; the dtypes shown here are
# what the numeric / non-numeric feature split further down is based on.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22294 entries, 0 to 22293
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  22294 non-null  int64  
 1   person_income               22294 non-null  int64  
 2   person_home_ownership       22294 non-null  object 
 3   person_emp_length           22294 non-null  float64
 4   loan_intent                 22294 non-null  object 
 5   loan_grade                  22294 non-null  object 
 6   loan_amnt                   22294 non-null  int64  
 7   loan_int_rate               22294 non-null  float64
 8   loan_status                 22294 non-null  bool   
 9   cb_person_default_on_file   22294 non-null  bool   
 10  cb_person_cred_hist_length  22294 non-null  int64  
dtypes: bool(2), float64(2), int64(4), object(3)
memory usage: 1.6+ MB


In [108]:
# Columns that are prediction targets and therefore must not be used as inputs.
y_cols = ['loan_amnt', 'loan_int_rate', 'loan_grade']

# Remove the targets first, then split the remaining columns by dtype:
# float/int columns are treated as numeric, everything else (object, bool)
# as categorical.
candidate_features = df.drop(columns=y_cols, errors='ignore')
numeric_kinds = ['float', 'int']
num_features = candidate_features.select_dtypes(include=numeric_kinds).columns.tolist()
cat_features = candidate_features.select_dtypes(exclude=numeric_kinds).columns.tolist()

In [109]:
num_features

['person_age',
 'person_income',
 'person_emp_length',
 'cb_person_cred_hist_length']

In [110]:
cat_features

['person_home_ownership',
 'loan_intent',
 'loan_status',
 'cb_person_default_on_file']

In [111]:
# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level of each feature dropped.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first')

# (name, transformer, columns) triples consumed by the ColumnTransformer.
transformer_specs = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]

preprocessor = ColumnTransformer(transformers=transformer_specs)

In [112]:
# Feature matrix: every column except the target columns listed in y_cols.
X = df.drop(columns=y_cols)
# Regression target for this notebook: the loan amount.
y = df.loc[:, 'loan_amnt']

In [113]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [114]:
# Fit the preprocessor (standard scaling + one-hot encoding) on the training
# split only, then reuse the already-fitted transformers on the test split so
# that nothing is learned from the held-out rows.
# Despite the "_scaled" suffix, these arrays also contain the one-hot columns.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [115]:
from sklearn.decomposition import PCA

# Disabled experiment: project the preprocessed features onto 2 principal
# components (fit on train, applied to test). While this stays commented out
# the PCA import in this cell is unused.
# pca = PCA(n_components=2)
# X_train_scaled = pca.fit_transform(X_train_scaled)
# X_test_scaled = pca.transform(X_test_scaled)

In [116]:
# Seed shared by every estimator that has internal randomness. It matches the
# random_state used for the train/test split so that a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

# Candidate regressors, all with default hyperparameters apart from the seed.
models = {
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR': SVR(),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=RANDOM_STATE),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE)
}

# Parallel lists consumed by the summary table: model name and its test R2.
model_list = []
r2_list = []

# Iterate the dict directly instead of rebuilding list(models.keys()) /
# list(models.values()) on every pass.
for model_name, model in models.items():
    model_list.append(model_name)

    model.fit(X_train_scaled, y_train)

    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Training-set metrics (useful for spotting over-fitting against the test
    # metrics below).
    r2_train = r2_score(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)

    # Held-out test-set metrics.
    r2_test = r2_score(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)

    print(model_name)

    print('Train')

    print('r2', r2_train)
    print('mae', mae_train)
    print('mse', mse_train)
    print('rmse', rmse_train)

    print('Test')

    print('r2', r2_test)
    print('mae', mae_test)
    print('mse', mse_test)
    print('rmse', rmse_test)

    # Only the test R2 is kept for the ranking table.
    r2_list.append(r2_test)

    print('=====================')


LinearRegression
Train
r2 0.13838140767499152
mae 4444.191904390273
mse 33580844.068128236
rmse 5794.898106794306
Test
r2 0.13595391861576867
mae 4393.916919094087
mse 33088925.82561024
rmse 5752.29743890302


KNeighborsRegressor
Train
r2 0.40714679022025624
mae 3657.2811886739555
mse 23105944.289318755
rmse 4806.8642886312855
Test
r2 0.10473233340336974
mae 4469.050235478807
mse 34284566.59004261
rmse 5855.302433695685
SVR
Train
r2 -0.029543718125204954
mae 4661.677813773582
mse 40125581.51326698
rmse 6334.475630489629
Test
r2 -0.021646795634642224
mae 4606.280660189223
mse 39124296.45716346
rmse 6254.941762891439
RandomForestRegressor
Train
r2 0.8900759371332587
mae 1551.1463959198293
mse 4284195.869662435
rmse 2069.8299132205125
Test
r2 0.21631980881467117
mae 4155.172527739511
mse 30011287.911391057
rmse 5478.255918756539
AdaBoostRegressor
Train
r2 0.18320342021899738
mae 4520.085243660825
mse 31833944.654086586
rmse 5642.157801239397
Test
r2 0.15218865440345475
mae 4553.3810933050945
mse 32467211.336243503
rmse 5698.000643755975
GradientBoostingRegressor
Train
r2 0.30161366964343117
mae 4023.5391782263364
mse 27219006.957280032
rmse 5217.1838147874405
Test
r2 0.26607992517698176
mae 408

In [117]:
# Express the test R2 scores as percentages in a separate variable. Rebinding
# `r2_list` itself would make this cell non-idempotent: every re-run would
# multiply the already-scaled values by 100 again.
r2_percent = np.multiply(r2_list, 100)

# Rank the models by test R2 (percent), best first.
pd.DataFrame(
    {'Model Name': model_list, 'R2 Score': r2_percent}
).sort_values(by=['R2 Score'], ascending=False)

Unnamed: 0,Model Name,R2 Score
5,GradientBoostingRegressor,26.607993
6,XGBRegressor,23.289174
3,RandomForestRegressor,21.631981
4,AdaBoostRegressor,15.218865
0,LinearRegression,13.595392
1,KNeighborsRegressor,10.473233
2,SVR,-2.16468


In [118]:
# Disabled draft: scatter plot of predicted vs. actual test targets.
# NOTE(review): as written this would fit and predict with `model` (the
# variable left over from the model-comparison loop) rather than the freshly
# created `gradient_regressor`; switch both calls to `gradient_regressor`
# before re-enabling.
# gradient_regressor = GradientBoostingRegressor()

# model.fit(X_train_scaled, y_train)
# y_pred = model.predict(X_test_scaled)

# plt.scatter(y_test, y_pred)