# PAYGO LOAN DEFAULT

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error as mse,r2_score
import warnings
warnings.filterwarnings('ignore')
from joblib import dump

In [2]:
train = pd.read_csv('Train.csv',index_col='ID')

In [3]:
class preprocess():
    
    def __init__(self):
        self.df_data = None
        
    def age_grouper(self,x):
        if x<30:
            return '<31'
        elif x<50:
            return '31-60'
        elif x>60:
            return '>60'
        else:
            return 'other' #not given or missing
        
        
    def create_dict_converter(self):#train_data
        # Get dictionaries for labels
        for i in self.df_data.columns:
            if self.df_data[i].dtype == 'object' :
                new_dict = dict()
                for j,unique_value in enumerate(self.df_data[i].unique()):
                    new_dict[unique_value] = j 
                name_ = 'dict_'+str(i) 
                globals()[name_]=new_dict
        return self
                   
            
    def fit(self,X,Y):#train 
        self.Y = Y
        return self
    
    
    def transform(self,X,Y=None): 
        metadata = pd.read_csv('metadata.csv',index_col='ID')
        cols = ['RegistrationDate','ExpectedTermDate','FirstPaymentDate','LastPaymentDate']
        metadata[cols] = metadata[cols].apply(lambda x:pd.to_datetime(x, format='%d/%m/%Y %H:%M') )
        metadata['days_to/past_deadline'] = (metadata['ExpectedTermDate'] - metadata['LastPaymentDate']).apply(lambda x:x.days) 
        metadata['days_from_start'] = (metadata['LastPaymentDate'] - metadata['RegistrationDate']).apply(lambda x:x.days)
        metadata['Region'] = metadata['Region'].fillna('Not Given')
        metadata['Age'] = metadata['Age'].apply(self.age_grouper)
        cols_drop =['PaymentMethod','SupplierName','UpsellDate','AccessoryRate','Town',
                   'RegistrationDate','ExpectedTermDate','FirstPaymentDate','LastPaymentDate','TransactionDates',
                    'PaymentsHistory','rateTypeEntity','RatePerUnit','DaysOnDeposit']
        
        X_=X.copy()
        X_['mean_payment'] = X_['PaymentsHistory'].apply(lambda x:np.mean(list(map(float, x.strip('[]').split(','))) ))
        X_['max_payment'] = X_['PaymentsHistory'].apply(lambda x:np.max(list(map(float, x.strip('[]').split(','))) ))
        X_['min_payment'] = X_['PaymentsHistory'].apply(lambda x:np.min(list(map(float, x.strip('[]').split(','))) ))
    
        self.df_data = pd.concat([metadata, X_],axis=1).drop(cols_drop, axis=1)
        
        if self.Y is not None:
            if any(item in self.Y.columns for item in ['m1','m2','m3']):
                self.create_dict_converter()
            self.df_data = self.df_data.dropna(axis=0,how='any')#drop nulls for train data only
        
        self.df_data = self.df_data.fillna(self.df_data.mode())#fill nulls (most likely for test set) 
        
        for i in self.df_data.columns:
            self.create_dict_converter()
            if self.df_data[i].dtype == 'object':
                self.df_data[i] = self.df_data[i].astype("category").apply(lambda x:globals()['dict_'+str(i)][x])
       
               
        return self.df_data
 

In [4]:
pipeline = Pipeline(steps=[ ('preprocess_func',preprocess()),
                           ('model',RFR(n_estimators= 600, max_depth=10))
                          ])

x = train[[i for i in train.columns if i not in ['m1','m2','m3','m4','m5','m6']]]
y = train[['m1','m2','m3','m4','m5','m6']]

x_train,x_test ,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 123)

In [5]:
pipeline.fit(x_train,y_train)

Pipeline(steps=[('preprocess_func',
                 <__main__.preprocess object at 0x000002A4C60C01C8>),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=600))])

In [6]:
y_pred = pipeline.predict(x_test)

In [8]:
x.shape

(28007, 2)

In [11]:
print("R2 Score : {}\nRMSE Score : {}".format(r2_score(y_pred,y_test) , mse(y_pred,y_test,squared=False)))

R2 Score : -45.24333893164283
RMSE Score : 797.7902274272628


In [8]:
#dump the pipeline model
dump(pipeline, filename="paygo_default.joblib")

['paygo_default.joblib']

In [11]:
test =  pd.read_csv("Test.csv",index_col=('ID') )
result = pipeline.predict(test)
np.shape(result)

(9336, 6)

In [21]:
cols = ['m1','m2','m3','m4','m5','m6']
cvv = test.copy()
result = pipeline.predict(test) 


In [22]:
cvv[cols] =  result
cvv[cols]

Unnamed: 0_level_0,m1,m2,m3,m4,m5,m6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_6L67PAA,957.706433,875.211857,1241.287686,1496.864375,849.353245,1122.993737
ID_VJ80SX2,867.068576,846.921392,845.960589,856.846107,867.296994,746.306184
ID_7OU9HLK,992.904328,931.922676,1005.205780,1504.496116,924.499364,734.343290
ID_WVWTPGK,823.273255,811.305465,801.884231,810.800066,831.368430,670.687547
ID_04DSDQS,838.439382,818.143638,809.707588,819.209287,845.678112,676.892880
...,...,...,...,...,...,...
ID_ATLMXQX,808.388842,798.159043,789.507141,798.270741,830.032727,677.493955
ID_DYMC72D,810.058705,803.339147,792.513113,806.603545,819.810449,670.435564
ID_5CBGHCN,857.068317,856.848862,843.312518,847.910844,880.890323,742.800992
ID_HYHB585,810.173014,799.067761,793.634272,802.877324,823.411599,711.875967
