In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('/content/bondora_final (1).csv')
data

Unnamed: 0.1,Unnamed: 0,index,NewCreditCustomer,Gender,Education,EmploymentStatus,Restructured,Amount,Interest,LoanDuration,IncomeTotal,LiabilitiesTotal,Target,Preferred ROI,EMI,ELA
0,0,0,1,1.0,3.0,3.0,0,115.0408,30.00,12,10500.0,0.0,1,34.512240,3970.32,10500.0
1,1,1,0,1.0,5.0,3.0,0,140.6057,25.00,1,10800.0,0.0,1,35.151425,5083.10,10800.0
2,2,2,1,1.0,4.0,3.0,1,319.5409,25.00,20,7000.0,0.0,0,79.885225,25526.60,7000.0
3,3,3,1,1.0,2.0,3.0,0,57.5205,45.00,15,11600.0,0.0,1,25.884225,1488.87,11600.0
4,4,4,1,1.0,4.0,3.0,0,319.5436,30.00,12,6800.0,0.0,0,95.863080,30632.43,6800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34672,34672,34717,1,0.0,5.0,-1.0,0,530.0000,22.33,60,640.0,24.0,1,118.349000,62724.97,616.0
34673,34673,34718,1,0.0,4.0,-1.0,0,3185.0000,48.90,36,2200.0,1260.0,0,1557.465000,4960526.02,940.0
34674,34674,34719,1,0.0,4.0,-1.0,0,1590.0000,33.70,36,500.0,74.0,1,535.830000,851969.70,426.0
34675,34675,34720,1,0.0,3.0,-1.0,0,530.0000,11.18,36,1100.0,808.0,0,59.254000,31404.62,292.0


In [3]:
data.drop(['Unnamed: 0', 'index'], axis = 1, inplace = True)
data

Unnamed: 0,NewCreditCustomer,Gender,Education,EmploymentStatus,Restructured,Amount,Interest,LoanDuration,IncomeTotal,LiabilitiesTotal,Target,Preferred ROI,EMI,ELA
0,1,1.0,3.0,3.0,0,115.0408,30.00,12,10500.0,0.0,1,34.512240,3970.32,10500.0
1,0,1.0,5.0,3.0,0,140.6057,25.00,1,10800.0,0.0,1,35.151425,5083.10,10800.0
2,1,1.0,4.0,3.0,1,319.5409,25.00,20,7000.0,0.0,0,79.885225,25526.60,7000.0
3,1,1.0,2.0,3.0,0,57.5205,45.00,15,11600.0,0.0,1,25.884225,1488.87,11600.0
4,1,1.0,4.0,3.0,0,319.5436,30.00,12,6800.0,0.0,0,95.863080,30632.43,6800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34672,1,0.0,5.0,-1.0,0,530.0000,22.33,60,640.0,24.0,1,118.349000,62724.97,616.0
34673,1,0.0,4.0,-1.0,0,3185.0000,48.90,36,2200.0,1260.0,0,1557.465000,4960526.02,940.0
34674,1,0.0,4.0,-1.0,0,1590.0000,33.70,36,500.0,74.0,1,535.830000,851969.70,426.0
34675,1,0.0,3.0,-1.0,0,530.0000,11.18,36,1100.0,808.0,0,59.254000,31404.62,292.0


In [4]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(data.drop(['EMI', 'ELA', 'Preferred ROI'], axis=1), data[['EMI', 'ELA', 'Preferred ROI', 'Target']], test_size = 0.3, random_state = 1)

In [5]:
# Define the preprocessing steps for the numerical and categorical features

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [6]:
data.columns

Index(['NewCreditCustomer', 'Gender', 'Education', 'EmploymentStatus',
       'Restructured', 'Amount', 'Interest', 'LoanDuration', 'IncomeTotal',
       'LiabilitiesTotal', 'Target', 'Preferred ROI', 'EMI', 'ELA'],
      dtype='object')

In [7]:
# Combine the preprocessing steps using ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['Amount', 'Interest', 'LoanDuration', 'IncomeTotal', 'LiabilitiesTotal']),
        ('cat', categorical_transformer, ['Target', 'Gender', 'Education', 'EmploymentStatus', 'NewCreditCustomer', 'Restructured'])
    ])

#### Linear Regression

In [8]:
# Define the regression model

model = LinearRegression()

In [9]:
# Define the pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [10]:
# Fit the pipeline on the training data

pipeline.fit(X_train, y_train)

In [11]:
# Make predictions on the testing data

y_pred = pipeline.predict(X_test)

In [12]:
# Evaluate the performance of the model using mean squared error and R-squared

print('R-squared:', r2_score(y_test, y_pred))

R-squared: 0.868636449079853


#### Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

In [14]:
# select only the 'ELA' target variable

y_train_ela = y_train['ELA']
y_test_ela = y_test['ELA']

# create a pipeline

model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model

model_pipeline.fit(X_train, y_train_ela)

# make predictions

y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model

r2 = r2_score(y_test_ela, y_pred_ela)
print('R-squared:', r2)

R-squared: 0.8220472624038184


In [15]:
# select only the 'ROI' target variable

y_train_ela = y_train['Preferred ROI']
y_test_ela = y_test['Preferred ROI']

# create a pipeline

model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model

model_pipeline.fit(X_train, y_train_ela)

# make predictions

y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model

r2 = r2_score(y_test_ela, y_pred_ela)
print('R-squared:', r2)

R-squared: 0.9954888482072922


In [16]:
 # select only the 'EMI' target variable

y_train_ela = y_train['EMI']
y_test_ela = y_test['EMI']

# create a pipeline

model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model

model_pipeline.fit(X_train, y_train_ela)

# make predictions

y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model

r2 = r2_score(y_test_ela, y_pred_ela)
print('R-squared:', r2)

R-squared: 0.9963802081245146


In [17]:
# select only the 'Target' target variable

y_train_ela = y_train['Target']
y_test_ela = y_test['Target']

# create a pipeline

model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    
    ('model', GradientBoostingRegressor(random_state=0))
])

# fit the model

model_pipeline.fit(X_train, y_train_ela)

# make predictions

y_pred_ela = model_pipeline.predict(X_test)

# evaluate the model

r2 = r2_score(y_test_ela, y_pred_ela)
print('R-squared:', r2)

R-squared: 0.9999999992944254


#### Saving the Model

In [18]:
import joblib
import pickle

In [19]:
# Save the Gradient Boosting model

with open('gb_model.pkl', 'wb') as f:

    pickle.dump(model_pipeline, f)

# Save the Linear Regression model

with open('lr_model.pkl', 'wb') as f:
  
    pickle.dump(pipeline, f)