### Notebook Preperation:

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

sns.set()
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

---

In [3]:
pd.set_option('display.max_columns', None)

---

### Loading Data, Creating Target variable, and Preprocessing:

**Loading data**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df= pd.read_csv("/content/drive/MyDrive/final_kk_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,NewCreditCustomer,VerificationType,LanguageCode,Age,Gender,Country,AppliedAmount,Amount,Interest,LoanDuration,MonthlyPayment,UseOfLoan,Education,MaritalStatus,EmploymentStatus,EmploymentDurationCurrentEmployer,OccupationArea,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,RefinanceLiabilities,DebtToIncome,FreeCash,Rating,Restructured,CreditScoreEsMicroL,PrincipalPaymentsMade,InterestAndPenaltyPaymentsMade,PrincipalBalance,InterestAndPenaltyBalance,NoOfPreviousLoansBeforeLoan,AmountOfPreviousLoansBeforeLoan,PreviousRepaymentsBeforeLoan,PreviousEarlyRepaymentsCountBeforeLoan,LoanStatus
0,0,0.0,0.0,115.041,True,"Income unverified,cross-referenced by phone",Estonian,61,Female,EE,319.5582,115.0408,30.0,12,130.393314,Other,Vocational education,Married,Fully employed,UpTo3Years,Retail and wholesale,Not specified,3665.0,0,0.0,0,0.0,0.0,F,False,M,115.0408,20.4222,0.0,0.0,1.0,83.0852,0.0,0,NotDefault
1,1,0.0,0.0,140.6057,False,"Income unverified,cross-referenced by phone",Estonian,48,Female,EE,191.7349,140.6057,25.0,1,130.393314,Other,Higher education,Divorced,Fully employed,MoreThan5Years,Education,Not specified,3665.0,0,0.0,0,0.0,0.0,F,False,M,140.6057,2.0227,0.0,0.0,2.0,255.6467,258.6256,0,NotDefault
2,2,0.0,0.0,319.558,True,"Income unverified,cross-referenced by phone",Estonian,58,Female,EE,319.5582,319.5409,25.0,20,130.393314,Home improvement,Secondary education,Married,Fully employed,UpTo4Years,Hospitality and catering,Not specified,3665.0,0,0.0,0,0.0,0.0,F,True,M,203.1909,59.7626,116.35,414.07,0.0,0.0,0.0,0,Default
3,3,0.0,0.0,57.5205,True,"Income unverified,cross-referenced by phone",Estonian,23,Female,EE,127.8233,57.5205,45.0,15,130.393314,Loan consolidation,Basic education,Single,Not present,UpTo2Years,Other,Not specified,3665.0,0,0.0,0,0.0,0.0,F,False,M,57.5205,18.7323,0.0,0.0,1.0,134.2144,0.0,0,NotDefault
4,4,0.0,0.0,319.5582,True,"Income unverified,cross-referenced by phone",Estonian,25,Female,EE,319.5582,319.5436,30.0,12,130.393314,Vehicle,Secondary education,Cohabitant,Fully employed,UpTo2Years,Retail and wholesale,Not specified,3665.0,0,0.0,0,0.0,0.0,F,False,M,319.5436,220.42,0.0,0.0,1.0,146.9966,0.0,0,Default


In [6]:

print(df.isnull().sum().sum())
df.columns

0


Index(['Unnamed: 0', 'BidsPortfolioManager', 'BidsApi', 'BidsManual',
       'NewCreditCustomer', 'VerificationType', 'LanguageCode', 'Age',
       'Gender', 'Country', 'AppliedAmount', 'Amount', 'Interest',
       'LoanDuration', 'MonthlyPayment', 'UseOfLoan', 'Education',
       'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'LiabilitiesTotal', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash',
       'Rating', 'Restructured', 'CreditScoreEsMicroL',
       'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade',
       'PrincipalBalance', 'InterestAndPenaltyBalance',
       'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan',
       'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus'],
      dtype='object')

#### **Creating Target Variables**

**Calculating LoanTenure**

In [7]:
# Load raw data
raw_data = pd.read_csv('/content/drive/MyDrive/Bondora_raw.csv')

# Filter the for Status = Repaid or Late
raw_data = raw_data[raw_data['Status'].isin(['Repaid', 'Late'])]

# Choose only needed columns to calculate loan tenure
raw_data = raw_data[['MaturityDate_Original', 'LoanDate']]

In [8]:
# Convert date columns into datetime objects
for col in raw_data.columns.values:
    raw_data[col] = pd.to_datetime(raw_data[col])

# Calculate LoanTenure
raw_data['LoanTenure'] = ((raw_data['MaturityDate_Original'].dt.year - raw_data['LoanDate'].dt.year)*12 +\
     (raw_data['MaturityDate_Original'].dt.month - raw_data['LoanDate'].dt.month))

# Set that column in loan_data
df['LoanTenure'] = raw_data['LoanTenure'].values

# Drop LoanDuration as it has errors
df = df.drop('LoanDuration', axis=1)

I. Equaty Monthly Installments (EMI)

In [9]:
loan_data_temp1 = df[[ 'Amount', 'Interest','LoanTenure']]
loan_data_temp1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77394 entries, 0 to 77393
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Amount      77394 non-null  float64
 1   Interest    77394 non-null  float64
 2   LoanTenure  77394 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 1.8 MB


In [10]:
def emi_cal(p, r, n):
  r = ((r /12)/100)
  emi = np.round((p * r * pow(1 + r, n)) / (pow(1 + r, n) - 1))
  return emi 

In [11]:
loan_data_temp1['EMI'] = emi_cal(loan_data_temp1['Amount'], loan_data_temp1['Interest'], loan_data_temp1['LoanTenure'])
loan_data_temp1['tot_pay_back__amt']= loan_data_temp1['EMI'] * loan_data_temp1['LoanTenure']

In [12]:
df['EMI'] = loan_data_temp1['EMI']

In [13]:
loan_data_temp1.head(20)

Unnamed: 0,Amount,Interest,LoanTenure,EMI,tot_pay_back__amt
0,115.0408,30.0,12,11.0,132.0
1,140.6057,25.0,1,144.0,144.0
2,319.5409,25.0,20,20.0,400.0
3,57.5205,45.0,15,5.0,75.0
4,319.5436,30.0,12,31.0,372.0
5,300.4314,30.0,25,16.0,400.0
6,191.7445,32.0,21,12.0,252.0
7,31.9518,20.0,7,5.0,35.0
8,31.9498,20.0,13,3.0,39.0
9,319.5583,25.0,13,28.0,364.0


II. Eligible Loan Amount (ELA)

In [14]:
loan_data_temp2 = df[['AppliedAmount', 'Interest', 'IncomeTotal', 'LiabilitiesTotal', 'LoanTenure']]
loan_data_temp2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77394 entries, 0 to 77393
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AppliedAmount     77394 non-null  float64
 1   Interest          77394 non-null  float64
 2   IncomeTotal       77394 non-null  float64
 3   LiabilitiesTotal  77394 non-null  float64
 4   LoanTenure        77394 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 3.0 MB


In [15]:
def avlb_incm(inc,lia):
  avlb_incm = np.round((inc-lia)*0.3)
  return avlb_incm

def tot_amt_pay(app_amt, r,n):
  amt_pay_month = np.round((app_amt+(app_amt * (r/100) * (n/12))) / n)
  return amt_pay_month


In [16]:
# Step 1
loan_data_temp2['Avlb_Incm_Monthly'] = avlb_incm (loan_data_temp2['IncomeTotal'],loan_data_temp2['LiabilitiesTotal'])
loan_data_temp2['Amt_pay_Monthly'] = tot_amt_pay(loan_data_temp2['AppliedAmount'],loan_data_temp2['Interest'],loan_data_temp2['LoanTenure'])
loan_data_temp2.head()

Unnamed: 0,AppliedAmount,Interest,IncomeTotal,LiabilitiesTotal,LoanTenure,Avlb_Incm_Monthly,Amt_pay_Monthly
0,319.5582,30.0,3665.0,0.0,12,1100.0,35.0
1,191.7349,25.0,3665.0,0.0,1,1100.0,196.0
2,319.5582,25.0,3665.0,0.0,20,1100.0,23.0
3,127.8233,45.0,3665.0,0.0,15,1100.0,13.0
4,319.5582,30.0,3665.0,0.0,12,1100.0,35.0


In [17]:
def ela(df):
  avlb_incm = df['Avlb_Incm_Monthly'].values
  tot_amt_pay= df['Amt_pay_Monthly'].values 
  n = df['LoanTenure'].values
  ELA = np.empty(0)
  for i in range(len(avlb_incm)):
    if tot_amt_pay[i] <= avlb_incm[i]:
      ELA = np.append(ELA, (tot_amt_pay[i] * n[i]))
    else:
      ELA = np.append(ELA, (avlb_incm[i] * n[i]))  
  return ELA

In [18]:
loan_data_temp2['ELA'] = ela(loan_data_temp2)
loan_data_temp2.head(10)

Unnamed: 0,AppliedAmount,Interest,IncomeTotal,LiabilitiesTotal,LoanTenure,Avlb_Incm_Monthly,Amt_pay_Monthly,ELA
0,319.5582,30.0,3665.0,0.0,12,1100.0,35.0,420.0
1,191.7349,25.0,3665.0,0.0,1,1100.0,196.0,196.0
2,319.5582,25.0,3665.0,0.0,20,1100.0,23.0,460.0
3,127.8233,45.0,3665.0,0.0,15,1100.0,13.0,195.0
4,319.5582,30.0,3665.0,0.0,12,1100.0,35.0,420.0
5,300.3847,30.0,3665.0,0.0,25,1100.0,20.0,500.0
6,191.7349,32.0,3665.0,0.0,21,1100.0,14.0,294.0
7,191.7349,20.0,3665.0,0.0,7,1100.0,31.0,217.0
8,178.9526,20.0,3665.0,0.0,13,1100.0,17.0,221.0
9,319.5582,25.0,3665.0,0.0,13,1100.0,31.0,403.0


In [19]:
df['ELA'] = loan_data_temp2['ELA']
df.columns

Index(['Unnamed: 0', 'BidsPortfolioManager', 'BidsApi', 'BidsManual',
       'NewCreditCustomer', 'VerificationType', 'LanguageCode', 'Age',
       'Gender', 'Country', 'AppliedAmount', 'Amount', 'Interest',
       'MonthlyPayment', 'UseOfLoan', 'Education', 'MaritalStatus',
       'EmploymentStatus', 'EmploymentDurationCurrentEmployer',
       'OccupationArea', 'HomeOwnershipType', 'IncomeTotal',
       'ExistingLiabilities', 'LiabilitiesTotal', 'RefinanceLiabilities',
       'DebtToIncome', 'FreeCash', 'Rating', 'Restructured',
       'CreditScoreEsMicroL', 'PrincipalPaymentsMade',
       'InterestAndPenaltyPaymentsMade', 'PrincipalBalance',
       'InterestAndPenaltyBalance', 'NoOfPreviousLoansBeforeLoan',
       'AmountOfPreviousLoansBeforeLoan', 'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus', 'LoanTenure',
       'EMI', 'ELA'],
      dtype='object')

III. Preferred ROI (PROI) ---> needs redefining

In [20]:
loan_data_temp3=df[[ 'Amount','AppliedAmount', 'Interest','LoanTenure','IncomeTotal','DebtToIncome']]
loan_data_temp3.head()

Unnamed: 0,Amount,AppliedAmount,Interest,LoanTenure,IncomeTotal,DebtToIncome
0,115.0408,319.5582,30.0,12,3665.0,0.0
1,140.6057,191.7349,25.0,1,3665.0,0.0
2,319.5409,319.5582,25.0,20,3665.0,0.0
3,57.5205,127.8233,45.0,15,3665.0,0.0
4,319.5436,319.5582,30.0,12,3665.0,0.0


In [21]:
loan_data_temp3['InterestAmount'] = (loan_data_temp3['Amount']*(loan_data_temp3['Interest']/100)*(loan_data_temp3['LoanTenure']/12))
loan_data_temp3['TotalAmount'] = (loan_data_temp3['InterestAmount'] + loan_data_temp3['Amount'])
loan_data_temp3['ROI'] = (loan_data_temp3['InterestAmount'] / loan_data_temp3['Amount'])*100
df['ROI'] = loan_data_temp3['ROI']

In [22]:

def PROI(df):
    df['PROI'] = df['ROI'].median()      # Setting PROI

    for i in range(df.shape[0]):
        # Check out LoanTenure
        if df['LoanTenure'].loc[i] <= 19:
            df['PROI'].loc[i] = df['PROI'].loc[i] - 5
        elif df['LoanTenure'].loc[i] > 25:
            df['PROI'].loc[i] = df['PROI'].loc[i] + 5
        
        # Check out AppliedAmount
        if (df['AppliedAmount'].loc[i] <= 1175) & (df['AppliedAmount'].loc[i] >= 850):
            df['PROI'].loc[i] = df['PROI'].loc[i] - 5
        elif df['AppliedAmount'].loc[i] > 2000:
            df['PROI'].loc[i] = df['PROI'].loc[i] + 5

        # Check out IncomeTotal
        if df['IncomeTotal'].loc[i] <= 1000:
            df['PROI'].loc[i] = df['PROI'].loc[i] - 5

        # Check out DebtToIncome
        if df['DebtToIncome'].loc[i] == 0:
            df['PROI'].loc[i] = df['PROI'].loc[i] - 5
        else:
            df['PROI'].loc[i] = df['PROI'].loc[i] + 5

    return df['PROI']

In [23]:
loan_data_temp3['PROI'] = PROI(loan_data_temp3)
df['PROI'] = loan_data_temp3['PROI']

VI. LoanStatus

In [24]:
df['LoanStatus'] = np.where(df['LoanStatus']=='NotDefault', 0, 1)

In [25]:
df.columns

Index(['Unnamed: 0', 'BidsPortfolioManager', 'BidsApi', 'BidsManual',
       'NewCreditCustomer', 'VerificationType', 'LanguageCode', 'Age',
       'Gender', 'Country', 'AppliedAmount', 'Amount', 'Interest',
       'MonthlyPayment', 'UseOfLoan', 'Education', 'MaritalStatus',
       'EmploymentStatus', 'EmploymentDurationCurrentEmployer',
       'OccupationArea', 'HomeOwnershipType', 'IncomeTotal',
       'ExistingLiabilities', 'LiabilitiesTotal', 'RefinanceLiabilities',
       'DebtToIncome', 'FreeCash', 'Rating', 'Restructured',
       'CreditScoreEsMicroL', 'PrincipalPaymentsMade',
       'InterestAndPenaltyPaymentsMade', 'PrincipalBalance',
       'InterestAndPenaltyBalance', 'NoOfPreviousLoansBeforeLoan',
       'AmountOfPreviousLoansBeforeLoan', 'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus', 'LoanTenure',
       'EMI', 'ELA', 'ROI', 'PROI'],
      dtype='object')

In [26]:
# Other Unwanted Columns

df.drop(columns=['Unnamed: 0','PreviousEarlyRepaymentsCountBeforeLoan','LoanTenure','LiabilitiesTotal','PrincipalBalance','InterestAndPenaltyBalance'], inplace = True)

In [27]:
# df= df.drop(columns=['Amount','PreviousEarlyRepaymentsCountBeforeLoan','NoOfPreviousLoansBeforeLoan','LoanTenure','ROI'
#                  'LiabilitiesTotal','PrincipalBalance','InterestAndPenaltyBalance','AmountOfPreviousLoansBeforeLoan'])

In [28]:
df.shape

(77394, 38)

**1. Imputing both the Categorical and Numerical Features having Missing Values.**

In [29]:
#Categorical Features in Dataset 
catg_features = [col_name for col_name in df.columns if df[col_name].dtype == 'O']
print("Number of Categorical Features:",format(len(catg_features )))
print("Categorical Features: {}",catg_features )

Number of Categorical Features: 13
Categorical Features: {} ['VerificationType', 'LanguageCode', 'Gender', 'Country', 'UseOfLoan', 'Education', 'MaritalStatus', 'EmploymentStatus', 'EmploymentDurationCurrentEmployer', 'OccupationArea', 'HomeOwnershipType', 'Rating', 'CreditScoreEsMicroL']


In [30]:
#Imputing the missing values in categorical features using the most frequent value which is mode
catg_features_with_null = [feature for feature in catg_features if df[feature].isnull().sum()]
for each_feature in catg_features_with_null:
  mode_val = df[each_feature].mode()[0]
  df[each_feature].fillna(mode_val,inplace=True)

In [31]:
#Numerical Features in Dataset
num_features = [col_name for col_name in df.columns if df[col_name].dtype != 'O']
print("Number of Numerical Features: ",format(len(num_features)))
print("Numerical Features: {}",num_features)

Number of Numerical Features:  25
Numerical Features: {} ['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer', 'Age', 'AppliedAmount', 'Amount', 'Interest', 'MonthlyPayment', 'IncomeTotal', 'ExistingLiabilities', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash', 'Restructured', 'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade', 'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan', 'PreviousRepaymentsBeforeLoan', 'LoanStatus', 'EMI', 'ELA', 'ROI', 'PROI']


In [32]:
#Impute missing values in numerical features using mean
num_features_with_null = [feature for feature in num_features if df[feature].isnull().sum()]
for feature in num_features_with_null:
   mean_value = df[feature].mean()
   df[feature].fillna(mean_value,inplace=True)

#### **2. Handling Outliers:**

In [33]:
#Let's compute IQR for each numerical feature

df_IQR = df[df.select_dtypes([float, int]).columns].quantile(.75) - df[df.select_dtypes([float, int]).columns].quantile(.25)

# Let's compute maximum and minimum limits
df_Max =  df[df.select_dtypes([float, int]).columns].quantile(.75) + (1.5*df_IQR)
df_Min =  df[df.select_dtypes([float, int]).columns].quantile(.25) - (1.5*df_IQR)

In [34]:
#Loop for replacing outliers above upper bound with the upper bound value:
for column in df.select_dtypes([float, int]).columns :
  col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
  col_Max =  df[column].quantile(.75) + (1.5*col_IQR)
  df[column][df[column] > col_Max] =  col_Max  

In [35]:
#Loop for replacing outliers under lower bound with the lower bound value:
for column in df.select_dtypes([float, int]).columns :
    col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
    col_Min =  df[column].quantile(.25) - (1.5*col_IQR)
    df[column][df[column] < col_Min] =  col_Min

In [36]:
df['PreviousRepaymentsBeforeLoan']=df['PreviousRepaymentsBeforeLoan'].round(decimals = 2)

In [37]:
df['NewCreditCustomer'] = df['NewCreditCustomer'].replace({True: 'True', False: 'False'})   
df['Restructured'] = df['Restructured'].replace({True: 'True', False: 'False'})   

In [38]:
df.dtypes

BidsPortfolioManager                 float64
BidsApi                              float64
BidsManual                           float64
NewCreditCustomer                     object
VerificationType                      object
LanguageCode                          object
Age                                    int64
Gender                                object
Country                               object
AppliedAmount                        float64
Amount                               float64
Interest                             float64
MonthlyPayment                       float64
UseOfLoan                             object
Education                             object
MaritalStatus                         object
EmploymentStatus                      object
EmploymentDurationCurrentEmployer     object
OccupationArea                        object
HomeOwnershipType                     object
IncomeTotal                          float64
ExistingLiabilities                    int64
RefinanceL

#### 3. **X, y split**

In [39]:
# Defining Independent variables Dataset
X = df.drop(['EMI', 'ELA', 'PROI', 'LoanStatus'], axis=1)

# Assigning target variables for both Models 
y = df[['LoanStatus', 'EMI', 'ELA', 'PROI']]

#### **4. Feature Selection**

In [40]:
# A function to select highly correlated features.
def Correlation(dataset, threshold): 
    correltated_features = set() # as a container of highly correlated features
    correlation_matrix = dataset.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                column_name = correlation_matrix.columns[i]
                correltated_features.add(column_name)
    return correltated_features

In [41]:
# let's selected features with a correlation factor > 0.8
Correlation(X, 0.8)

{'Amount', 'AmountOfPreviousLoansBeforeLoan', 'ROI'}

In [42]:
# Now we can drop these features from our dataset
X.drop(columns= [ 'ROI', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan'], inplace = True )

In [43]:
print(X.shape)
X.columns

(77394, 30)


Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer',
       'VerificationType', 'LanguageCode', 'Age', 'Gender', 'Country',
       'AppliedAmount', 'Interest', 'MonthlyPayment', 'UseOfLoan', 'Education',
       'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'RefinanceLiabilities', 'DebtToIncome', 'FreeCash', 'Rating',
       'Restructured', 'CreditScoreEsMicroL', 'PrincipalPaymentsMade',
       'InterestAndPenaltyPaymentsMade', 'PreviousRepaymentsBeforeLoan'],
      dtype='object')

#### **5.Feature Encoding**

In [44]:
# X = pd.get_dummies(X, drop_first=True)

---

In [45]:
X.shape

(77394, 30)

#### **6. train, test split**

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Separating Target values for classifications and regression problems
y_class_train = y_train.iloc[:,0]
y_reg_train = y_train.iloc[:,1:]

y_class_test = y_test.iloc[:,0]
y_reg_test = y_test.iloc[:,1:]

In [47]:
df.to_csv('df_kk.csv')

In [48]:
input=X_test.iloc[0].values

In [49]:
input=input.reshape(1,-1)

In [50]:
input

array([[1176.0, 0.0, 263.0, 'True', 'Income and expenses verified',
        'Finnish', 59, 'Male', 'FI', 2125.0, 59.73, 118.92, 'Not Set',
        'Primary education', 'Not Specified', 'Not present', 'Retiree',
        'Not present', 'Tenant,pre-furnished property', 1600.0, 0, 0,
        0.0, 0.0, 'HR', 'False', 'M', 0.0, 0.0, 861.14]], dtype=object)

In [51]:
# from sklearn.impute import SimpleImputer

# numerical processing pipeline
numeric_processor=Pipeline(
    steps=[('stdscaler', StandardScaler(with_mean=False))]

)

numeric_processor

Pipeline(steps=[('stdscaler', StandardScaler(with_mean=False))])

In [52]:
# col_tra_1=ColumnTransformer([
# ('trf1',SimpleImputer(strategy='constant'),[0,1])
# ],remainder='passthrough')

# col_tra_2=ColumnTransformer([
# ('ohe1',OneHotEncoder(sparse=False, handle_unknown='ignore'),[0,3])
# ],remainder='passthrough')

In [53]:
from sklearn.preprocessing import OrdinalEncoder

# categorical procesing pipeline
categorical_processor=Pipeline(
    steps=[("ord_enc",OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
           ('stdscaler', StandardScaler(with_mean=False))]
)

categorical_processor

Pipeline(steps=[('ord_enc',
                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                unknown_value=-1)),
                ('stdscaler', StandardScaler(with_mean=False))])

In [54]:
# combine processing technqiues
from sklearn.compose import ColumnTransformer

preprocessor=ColumnTransformer(
    [("categorical",categorical_processor,[3,4, 5, 7, 8, 12, 13, 14,15,16,17,18,24,25,26]),
    ("numerical",numeric_processor,[0,1,2,6,9,10,11,19,20,21,22,23,27,28,29])]
)

BidsPortfolioManager 0
BidsApi 1
BidsManual 2
NewCreditCustomer 3
VerificationType 4
LanguageCode 5
Age 6
Gender 7
Country 8
AppliedAmount 9
Interest 10
MonthlyPayment 11
UseOfLoan 12
Education 13
MaritalStatus 14
EmploymentStatus 15
EmploymentDurationCurrentEmployer 16
OccupationArea 17
HomeOwnershipType 18
IncomeTotal 19
ExistingLiabilities 20
RefinanceLiabilities 21
DebtToIncome 22
FreeCash 23
Rating 24
Restructured 25
CreditScoreEsMicroL 26
PrincipalPaymentsMade 27
InterestAndPenaltyPaymentsMade 28
PreviousRepaymentsBeforeLoan 29

In [55]:
X.dtypes

BidsPortfolioManager                 float64
BidsApi                              float64
BidsManual                           float64
NewCreditCustomer                     object
VerificationType                      object
LanguageCode                          object
Age                                    int64
Gender                                object
Country                               object
AppliedAmount                        float64
Interest                             float64
MonthlyPayment                       float64
UseOfLoan                             object
Education                             object
MaritalStatus                         object
EmploymentStatus                      object
EmploymentDurationCurrentEmployer     object
OccupationArea                        object
HomeOwnershipType                     object
IncomeTotal                          float64
ExistingLiabilities                    int64
RefinanceLiabilities                   int64
DebtToInco

In [56]:
X.columns

Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer',
       'VerificationType', 'LanguageCode', 'Age', 'Gender', 'Country',
       'AppliedAmount', 'Interest', 'MonthlyPayment', 'UseOfLoan', 'Education',
       'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'RefinanceLiabilities', 'DebtToIncome', 'FreeCash', 'Rating',
       'Restructured', 'CreditScoreEsMicroL', 'PrincipalPaymentsMade',
       'InterestAndPenaltyPaymentsMade', 'PreviousRepaymentsBeforeLoan'],
      dtype='object')

In [57]:
# combine processing technqiues
# from sklearn.compose import ColumnTransformer

# preprocessor=ColumnTransformer(
#     [("categorical",categorical_processor,["VerificationType", "LanguageCode", "Gender", "Country", "UseOfLoan", "Education", "MaritalStatus",
#                                            "EmploymentStatus", "EmploymentDurationCurrentEmployer","OccupationArea", "HomeOwnershipType",
#                                            "Rating", "CreditScoreEsMicroL"]),
#     ("numerical",numeric_processor,["BidsPortfolioManager", "BidsApi", "BidsManual", "NewCreditCustomer", "Age", "AppliedAmount",
#                                     "Interest", "MonthlyPayment", "IncomeTotal", "ExistingLiabilities", "RefinanceLiabilities",
#                                     "DebtToIncome", "FreeCash", "Restructured", "PrincipalPaymentsMade", "InterestAndPenaltyPaymentsMade",
#                                     "PreviousRepaymentsBeforeLoan"])]
# )

In [58]:
catg_features = [col_name for col_name in X.columns if X[col_name].dtype == 'O']
print("Number of Categorical Features: ",format(len(catg_features)))
print("Categorical Features: {}",catg_features)

Number of Categorical Features:  15
Categorical Features: {} ['NewCreditCustomer', 'VerificationType', 'LanguageCode', 'Gender', 'Country', 'UseOfLoan', 'Education', 'MaritalStatus', 'EmploymentStatus', 'EmploymentDurationCurrentEmployer', 'OccupationArea', 'HomeOwnershipType', 'Rating', 'Restructured', 'CreditScoreEsMicroL']


In [59]:
num_features = [col_name for col_name in X.columns if X[col_name].dtype != 'O']
print("Number of Numerical Features: ",format(len(num_features)))
print("Numerical Features: {}",num_features)

Number of Numerical Features:  15
Numerical Features: {} ['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'Age', 'AppliedAmount', 'Interest', 'MonthlyPayment', 'IncomeTotal', 'ExistingLiabilities', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash', 'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade', 'PreviousRepaymentsBeforeLoan']


---

In [60]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.linear_model import Ridge

In [61]:
pca = PCA(n_components=30)
svc_classifier = SVC(kernel='rbf')
regressor = Ridge(random_state=0)


## Classification Pipeline

In [62]:
# Create Pipeline
pipeline_class = make_pipeline(preprocessor,pca,svc_classifier)

# fit and transform the pipeline
pipeline_class.fit(X_train, y_class_train)

# predict using the pipeline
pred_class = pipeline_class.predict(X_test)

In [63]:
print("Support Vector Classifier:")

print("\nAccuracy score:\n", round(accuracy_score(y_class_test, pred_class)*100,2), '%')
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_class_test, pred_class))
print('*'*40)
print("\nClassification Report:\n", classification_report(y_class_test, pred_class,
                                        target_names=['Default', 'NotDefault']))

Support Vector Classifier:

Accuracy score:
 84.42 %
****************************************

Confusion Matrix:
 [[7289 1381]
 [1634 9045]]
****************************************

Classification Report:
               precision    recall  f1-score   support

     Default       0.82      0.84      0.83      8670
  NotDefault       0.87      0.85      0.86     10679

    accuracy                           0.84     19349
   macro avg       0.84      0.84      0.84     19349
weighted avg       0.84      0.84      0.84     19349



## Regression Pipeline

In [64]:
pipeline_reg =make_pipeline(preprocessor,pca,regressor)

# fit and transform the pipeline
pipeline_reg.fit(X_train, y_reg_train)

# predict using the pipeline
pred_reg = pipeline_reg.predict(X_test)

---

## saving model

In [65]:
import pickle
pickle.dump(pipeline_class, open('pipeline_class2.pkl', 'wb'))
pickle.dump(pipeline_reg, open('pipeline_reg2.pkl', 'wb'))

In [66]:
input=X_test.iloc[0].values

In [67]:
input=input.reshape(1,-1)

In [68]:
input

array([[1176.0, 0.0, 263.0, 'True', 'Income and expenses verified',
        'Finnish', 59, 'Male', 'FI', 2125.0, 59.73, 118.92, 'Not Set',
        'Primary education', 'Not Specified', 'Not present', 'Retiree',
        'Not present', 'Tenant,pre-furnished property', 1600.0, 0, 0,
        0.0, 0.0, 'HR', 'False', 'M', 0.0, 0.0, 861.14]], dtype=object)

In [69]:
model1=pickle.load(open('pipeline_class2.pkl','rb'))

In [70]:
int(model1.predict(input))

1

In [71]:
# import json
# columns = {
#     'data_columns' : [col.lower() for col in X.columns]
# }
# with open("columns.json","w") as f:
#     f.write(json.dumps(columns))

---