In [1]:
# Data Wrangling 
import numpy as np
import pandas as pd

## Model evaluators
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

## Models to be used
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Remove unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the training data

df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [3]:
# Loading the test data

df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,0x160a,CUS_0xd40,September,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,2022.0,Good,809.98,35.030402,22 Years and 9 Months,No,49.574949,236.64268203272132,Low_spent_Small_value_payments,186.26670208571767
1,0x160b,CUS_0xd40,October,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.053114,22 Years and 10 Months,No,49.574949,21.465380264657146,High_spent_Medium_value_payments,361.444003853782
2,0x160c,CUS_0xd40,November,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.811894,,No,49.574949,148.23393788500923,Low_spent_Medium_value_payments,264.67544623343
3,0x160d,CUS_0xd40,December,Aaron Maashoh,24_,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,32.430559,23 Years and 0 Months,No,49.574949,39.08251089460281,High_spent_Medium_value_payments,343.82687322383634
4,0x1616,CUS_0x21b1,September,Rick Rothackerj,28,004-07-5839,_______,34847.84,3037.986667,2,...,5.0,Good,605.03,25.926822,27 Years and 3 Months,No,18.816215,39.684018417945296,High_spent_Large_value_payments,485.2984336755923


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        50000 non-null  object 
 1   Customer_ID               50000 non-null  object 
 2   Month                     50000 non-null  object 
 3   Name                      44985 non-null  object 
 4   Age                       50000 non-null  object 
 5   SSN                       50000 non-null  object 
 6   Occupation                50000 non-null  object 
 7   Annual_Income             50000 non-null  object 
 8   Monthly_Inhand_Salary     42502 non-null  float64
 9   Num_Bank_Accounts         50000 non-null  int64  
 10  Num_Credit_Card           50000 non-null  int64  
 11  Interest_Rate             50000 non-null  int64  
 12  Num_of_Loan               50000 non-null  object 
 13  Type_of_Loan              44296 non-null  object 
 14  Delay_

## Data Cleaning

In [6]:
# Checking the missing values in training data
df.isnull().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64

In [8]:
# Checking the missing values in test data
df_test.isnull().sum()

Month                          0
Age                            0
Occupation                     0
Annual_Income                  0
Monthly_Inhand_Salary       7498
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                5704
Delay_from_due_date            0
Num_of_Delayed_Payment      3498
Changed_Credit_Limit           0
Num_Credit_Inquiries        1035
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     2271
Monthly_Balance              562
dtype: int64

In [9]:
df['Payment_of_Min_Amount'].value_counts()
df_test['Payment_of_Min_Amount'].value_counts()

Yes    26158
No     17849
NM      5993
Name: Payment_of_Min_Amount, dtype: int64

In [None]:
# Dropping irrelevant columns from both datasets
df.drop(columns=['SSN', 'Name', 'ID', 'Customer_ID', 'Credit_History_Age', 'Payment_Behaviour'], axis=1, inplace=True)
df_test.drop(columns=['SSN', 'Name', 'ID', 'Customer_ID', 'Credit_History_Age', 'Payment_Behaviour'], axis=1, inplace=True)

In [11]:
# Creating a training categorical columns list to sort wrong amd missing entries like '_' etc
categorical_cols = [c for c in df.columns if df[c].dtype == 'object']
categorical_cols

['Month',
 'Age',
 'Occupation',
 'Annual_Income',
 'Num_of_Loan',
 'Type_of_Loan',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Credit_Mix',
 'Outstanding_Debt',
 'Payment_of_Min_Amount',
 'Amount_invested_monthly',
 'Monthly_Balance',
 'Credit_Score']

In [12]:
# Replacing the wrong and missing entries with np.nan and change the necessary datatype in training data
for col in categorical_cols:
    df[col] = df[col].str.strip('_')

    try:
        df[col] = df[col].astype('float64')
    except:
        df[col] = df[col]
        

for col in categorical_cols:
    df[col] = df[col].replace({'':np.nan})

    try:
        df[col] = df[col].astype('float64')
    except:
        df[col] = df[col]

for col in categorical_cols:
    df[col] = df[col].replace({'!@9#%8':np.nan, '#F%$D@*&8':np.nan})

In [13]:
# Creating a test categorical columns list to sort wrong amd missing entries like '_' etc
categorical_cols = [c for c in df_test.columns if df_test[c].dtype == 'object']
categorical_cols

['Month',
 'Age',
 'Occupation',
 'Annual_Income',
 'Num_of_Loan',
 'Type_of_Loan',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Credit_Mix',
 'Outstanding_Debt',
 'Payment_of_Min_Amount',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [14]:
# Replacing the wrong and missing entries with np.nan and change the necessary datatype in test data

for col in categorical_cols:
    df_test[col] = df_test[col].str.strip('_')

    try:
        df_test[col] = df_test[col].astype('float64')
    except:
        df_test[col] = df_test[col]
        

for col in categorical_cols:
    df_test[col] = df_test[col].replace({'':np.nan})

    try:
        df_test[col] = df_test[col].astype('float64')
    except:
        df_test[col] = df_test[col]

for col in categorical_cols:
    df_test[col] = df_test[col].replace({'!@9#%8':np.nan, '#F%$D@*&8':np.nan})

In [15]:
# Checking the data type to confirm the datatype change
df[categorical_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Month                    100000 non-null  object 
 1   Age                      100000 non-null  float64
 2   Occupation               92938 non-null   object 
 3   Annual_Income            100000 non-null  float64
 4   Num_of_Loan              100000 non-null  float64
 5   Type_of_Loan             88592 non-null   object 
 6   Num_of_Delayed_Payment   92998 non-null   float64
 7   Changed_Credit_Limit     97909 non-null   float64
 8   Credit_Mix               79805 non-null   object 
 9   Outstanding_Debt         100000 non-null  float64
 10  Payment_of_Min_Amount    100000 non-null  object 
 11  Amount_invested_monthly  95521 non-null   float64
 12  Monthly_Balance          97132 non-null   float64
dtypes: float64(8), object(5)
memory usage: 9.9+ MB


In [16]:
# Checking the data type to confirm the datatype change
df_test[categorical_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Month                    50000 non-null  object 
 1   Age                      50000 non-null  float64
 2   Occupation               46562 non-null  object 
 3   Annual_Income            50000 non-null  float64
 4   Num_of_Loan              50000 non-null  float64
 5   Type_of_Loan             44296 non-null  object 
 6   Num_of_Delayed_Payment   46502 non-null  float64
 7   Changed_Credit_Limit     48941 non-null  float64
 8   Credit_Mix               40195 non-null  object 
 9   Outstanding_Debt         50000 non-null  float64
 10  Payment_of_Min_Amount    50000 non-null  object 
 11  Amount_invested_monthly  47729 non-null  float64
 12  Monthly_Balance          49438 non-null  float64
dtypes: float64(8), object(5)
memory usage: 5.0+ MB


In [17]:
# Checking new missing values
df.isnull().sum()

Month                           0
Age                             0
Occupation                   7062
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit         2091
Num_Credit_Inquiries         1965
Credit_Mix                  20195
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Monthly_Balance              2868
Credit_Score                    0
dtype: int64

In [18]:
df_test.isnull().sum()

Month                          0
Age                            0
Occupation                  3438
Annual_Income                  0
Monthly_Inhand_Salary       7498
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                5704
Delay_from_due_date            0
Num_of_Delayed_Payment      3498
Changed_Credit_Limit        1059
Num_Credit_Inquiries        1035
Credit_Mix                  9805
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     2271
Monthly_Balance              562
dtype: int64

In [19]:
df.describe()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance
count,100000.0,100000.0,84998.0,100000.0,100000.0,100000.0,100000.0,100000.0,92998.0,97909.0,98035.0,100000.0,100000.0,100000.0,95521.0,97132.0
mean,110.6497,176415.7,4194.17085,17.09128,22.47443,72.46604,3.00996,21.06878,30.923342,10.389025,27.754251,1426.220376,32.285173,1403.118217,637.412998,-3.08858e+22
std,686.244717,1429618.0,3183.686167,117.404834,129.05741,466.422621,62.647879,14.860104,226.031892,6.789496,193.177339,1155.129026,5.116875,8306.04127,2043.319327,3.208492e+24
min,-500.0,7005.93,303.645417,-1.0,0.0,1.0,-100.0,-5.0,-3.0,-6.49,0.0,0.23,20.0,0.0,0.0,-3.333333e+26
25%,24.0,19457.5,1625.568229,3.0,4.0,8.0,1.0,10.0,9.0,5.32,3.0,566.0725,28.052567,30.30666,74.534002,269.9858
50%,33.0,37578.61,3093.745,6.0,5.0,13.0,3.0,18.0,14.0,9.4,6.0,1166.155,32.305784,69.249473,135.925682,336.4493
75%,42.0,72790.92,5957.448333,7.0,7.0,20.0,5.0,28.0,18.0,14.87,9.0,1945.9625,36.496663,161.224249,265.731733,469.6405
max,8698.0,24198060.0,15204.633333,1798.0,1499.0,5797.0,1496.0,67.0,4397.0,36.97,2597.0,4998.07,50.0,82331.0,10000.0,1602.041


In [20]:
df_test.describe()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance
count,50000.0,50000.0,42502.0,50000.0,50000.0,50000.0,50000.0,50000.0,46502.0,48941.0,48965.0,50000.0,50000.0,50000.0,47729.0,49438.0
mean,109.71442,166334.2,4182.004291,16.83826,22.92148,68.77264,3.40336,21.05264,30.888951,10.374844,30.0802,1426.220376,32.279581,1491.304305,641.654795,-4.045471e+22
std,679.696381,1351965.0,3174.109304,116.396848,129.314804,451.602363,66.364023,14.860397,221.510461,6.780628,196.984121,1155.134801,5.106238,8595.647887,2053.89542,3.671994e+24
min,-500.0,7005.93,303.645417,-1.0,0.0,1.0,-100.0,-5.0,-3.0,-6.45,0.0,0.23,20.509652,0.0,0.0,-3.333333e+26
25%,25.0,19453.33,1625.188333,3.0,4.0,8.0,1.0,10.0,9.0,5.34,4.0,566.0725,28.06104,32.222388,74.52927,270.4741
50%,34.0,37577.82,3086.305,6.0,5.0,13.0,3.0,18.0,14.0,9.41,7.0,1166.155,32.28039,74.733349,135.59043,336.9732
75%,43.0,72817.02,5934.189094,7.0,7.0,20.0,5.0,28.0,18.0,14.8,10.0,1945.9625,36.468591,176.157491,266.892228,470.8568
max,8688.0,24137260.0,15204.633333,1798.0,1499.0,5799.0,1496.0,67.0,4399.0,36.65,2593.0,4998.07,48.540663,82398.0,10000.0,1606.518


From the describe() we can see that there are some irregular data entries like 'Age' having -500 and someone having 1,798 Num_Bank_Accounts which shows that there are outliers which must be solved 

In [21]:
#Dealing with training data Outliers
numerical_cols = ['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Num_Credit_Inquiries', 'Delay_from_due_date', 'Changed_Credit_Limit']

for x in list(numerical_cols):
    q75,q25 = np.percentile(df.loc[:,x],[75,25])
    intr_qr = q75-q25
 
    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
 
    df.loc[df[x] < min,x] = np.nan
    df.loc[df[x] > max,x] = np.nan

In [22]:
#Dealing with test data Outliers
numerical_cols = ['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Num_Credit_Inquiries', 'Delay_from_due_date', 'Changed_Credit_Limit']

for x in list(numerical_cols):
    q75,q25 = np.percentile(df_test.loc[:,x],[75,25])
    intr_qr = q75-q25
 
    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
 
    df_test.loc[df_test[x] < min,x] = np.nan
    df_test.loc[df_test[x] > max,x] = np.nan

In [23]:
df.isnull().sum()

Month                           0
Age                          2781
Occupation                   7062
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts            1315
Num_Credit_Card              2271
Interest_Rate                2034
Num_of_Loan                  4348
Type_of_Loan                11408
Delay_from_due_date          4002
Num_of_Delayed_Payment       7002
Changed_Credit_Limit         2091
Num_Credit_Inquiries         1965
Credit_Mix                  20195
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Monthly_Balance              2868
Credit_Score                    0
dtype: int64

In [24]:
df_test.isnull().sum()

Month                          0
Age                         1403
Occupation                  3438
Annual_Income                  0
Monthly_Inhand_Salary       7498
Num_Bank_Accounts            635
Num_Credit_Card             1179
Interest_Rate                966
Num_of_Loan                 2232
Type_of_Loan                5704
Delay_from_due_date         1996
Num_of_Delayed_Payment      3498
Changed_Credit_Limit        1059
Num_Credit_Inquiries        1035
Credit_Mix                  9805
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     2271
Monthly_Balance              562
dtype: int64

In [25]:
# Finding the mean value of the column - Monthly_Inhand_Salary in the dataset using Credit_Score

salary_good_mean = np.mean(df[df['Credit_Score'] == 'Good']['Monthly_Inhand_Salary'])
salary_poor_mean = np.mean(df[df['Credit_Score'] == 'Poor']['Monthly_Inhand_Salary'])
salary_standard_mean = np.mean(df[df['Credit_Score'] == 'Standard']['Monthly_Inhand_Salary'])

(salary_good_mean, salary_poor_mean, salary_standard_mean)


(5389.246647164289, 3375.3116741854924, 4242.03705120616)

In [26]:
# Replacing the missing values in the column Monthly_Inhand_Salary using the decision logic

index_values = list(df['Monthly_Inhand_Salary'].isnull())


for index in range(len(df)):
    if index_values[index]:
        if df['Credit_Score'][index] == 'Good':
            df['Monthly_Inhand_Salary'][index] = salary_good_mean
        elif df['Credit_Score'][index] == 'Poor':
            df['Monthly_Inhand_Salary'][index] = salary_poor_mean
        else:
            df['Monthly_Inhand_Salary'][index] = salary_standard_mean

In [27]:
# Using mean to fill the missing values in training data

df['Amount_invested_monthly'].fillna(df['Amount_invested_monthly'].mean(), inplace = True)
df['Monthly_Balance'].fillna(df['Monthly_Balance'].mean(), inplace = True)

In [28]:
# Using mean to fill the missing values in test data

df_test['Monthly_Inhand_Salary'].fillna(df_test['Monthly_Inhand_Salary'].mean(), inplace = True)
df_test['Amount_invested_monthly'].fillna(df_test['Amount_invested_monthly'].mean(), inplace = True)
df_test['Monthly_Balance'].fillna(df_test['Monthly_Balance'].mean(), inplace = True)

In [29]:
# Using median to fill the missing values in training data

df['Age'].fillna(df['Age'].median(), inplace = True) 
df['Num_Bank_Accounts'].fillna(df['Num_Bank_Accounts'].median(), inplace = True)
df['Num_Credit_Card'].fillna(df['Num_Credit_Card'].median(), inplace = True)
df['Interest_Rate'].fillna(df['Interest_Rate'].median(), inplace = True) 
df['Num_of_Loan'].fillna(df['Num_of_Loan'].median(), inplace = True)
df['Delay_from_due_date'].fillna(df['Delay_from_due_date'].median(), inplace = True)
df['Num_of_Delayed_Payment'].fillna(df['Num_of_Delayed_Payment'].median(), inplace = True) 
df['Changed_Credit_Limit'].fillna(df['Changed_Credit_Limit'].median(), inplace = True)
df['Num_Credit_Inquiries'].fillna(df['Num_Credit_Inquiries'].median(), inplace = True)

In [30]:
# Using median to fill the missing values in test data

df_test['Age'].fillna(df_test['Age'].median(), inplace = True) 
df_test['Num_Bank_Accounts'].fillna(df_test['Num_Bank_Accounts'].median(), inplace = True)
df_test['Num_Credit_Card'].fillna(df_test['Num_Credit_Card'].median(), inplace = True)
df_test['Interest_Rate'].fillna(df_test['Interest_Rate'].median(), inplace = True) 
df_test['Num_of_Loan'].fillna(df_test['Num_of_Loan'].median(), inplace = True)
df_test['Delay_from_due_date'].fillna(df_test['Delay_from_due_date'].median(), inplace = True)
df_test['Num_of_Delayed_Payment'].fillna(df_test['Num_of_Delayed_Payment'].median(), inplace = True) 
df_test['Changed_Credit_Limit'].fillna(df_test['Changed_Credit_Limit'].median(), inplace = True)
df_test['Num_Credit_Inquiries'].fillna(df_test['Num_Credit_Inquiries'].median(), inplace = True)

In [31]:
df['Credit_Mix'].fillna(df['Credit_Mix'].value_counts().idxmax(),inplace=True)
df_test['Credit_Mix'].fillna(df_test['Credit_Mix'].value_counts().idxmax(),inplace=True)

### Changing object datatype to Category for both training and test data

In [32]:
df['Occupation'] = pd.Categorical(df['Occupation'])
Occupation_categories = df['Occupation'].cat.categories
df['Occupation'] = df['Occupation'].cat.codes+1

In [33]:
df_test['Occupation'] = pd.Categorical(df_test['Occupation'])
Occupation_categories = df_test['Occupation'].cat.categories
df_test['Occupation'] = df_test['Occupation'].cat.codes+1

In [34]:
df['Credit_Mix'] = pd.Categorical(df['Credit_Mix'])
Credit_Mix_categories = df['Credit_Mix'].cat.categories
df['Credit_Mix'] = df['Credit_Mix'].cat.codes+1

In [35]:
df_test['Credit_Mix'] = pd.Categorical(df_test['Credit_Mix'])
Credit_Mix_categories = df_test['Credit_Mix'].cat.categories
df_test['Credit_Mix'] = df_test['Credit_Mix'].cat.codes+1

In [36]:
df['Payment_of_Min_Amount'] = pd.Categorical(df['Payment_of_Min_Amount'])
Payment_of_Min_Amount_categories = df['Payment_of_Min_Amount'].cat.categories
df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].cat.codes+1

In [37]:
df_test['Payment_of_Min_Amount'] = pd.Categorical(df_test['Payment_of_Min_Amount'])
Payment_of_Min_Amount_categories = df_test['Payment_of_Min_Amount'].cat.categories
df_test['Payment_of_Min_Amount'] = df_test['Payment_of_Min_Amount'].cat.codes+1

In [38]:
df['Month'] = pd.Categorical(df['Month'])
Month_categories = df['Month'].cat.categories
df['Month'] = df['Month'].cat.codes+1

In [39]:
df_test['Month'] = pd.Categorical(df_test['Month'])
Month_categories = df_test['Month'].cat.categories
df_test['Month'] = df_test['Month'].cat.codes+1

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     100000 non-null  int8   
 1   Age                       100000 non-null  float64
 2   Occupation                100000 non-null  int8   
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     100000 non-null  float64
 5   Num_Bank_Accounts         100000 non-null  float64
 6   Num_Credit_Card           100000 non-null  float64
 7   Interest_Rate             100000 non-null  float64
 8   Num_of_Loan               100000 non-null  float64
 9   Type_of_Loan              88592 non-null   object 
 10  Delay_from_due_date       100000 non-null  float64
 11  Num_of_Delayed_Payment    100000 non-null  float64
 12  Changed_Credit_Limit      100000 non-null  float64
 13  Num_Credit_Inquiries      100000 non-null  fl

In [41]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     50000 non-null  int8   
 1   Age                       50000 non-null  float64
 2   Occupation                50000 non-null  int8   
 3   Annual_Income             50000 non-null  float64
 4   Monthly_Inhand_Salary     50000 non-null  float64
 5   Num_Bank_Accounts         50000 non-null  float64
 6   Num_Credit_Card           50000 non-null  float64
 7   Interest_Rate             50000 non-null  float64
 8   Num_of_Loan               50000 non-null  float64
 9   Type_of_Loan              44296 non-null  object 
 10  Delay_from_due_date       50000 non-null  float64
 11  Num_of_Delayed_Payment    50000 non-null  float64
 12  Changed_Credit_Limit      50000 non-null  float64
 13  Num_Credit_Inquiries      50000 non-null  float64
 14  Credit

In [42]:
# Fetching the not null data of the column for training data - Type of Loan

index_values = ~df['Type_of_Loan'].isnull().values
loan_type_data = list(df['Type_of_Loan'][index_values])

In [43]:
# Create a dictionary to store the counts of all the various loan types for training data

loan_type_dict = dict()
for value in loan_type_data:
    values = value.split(',')
    for each_value in values:
        loan_type = each_value.strip(' ')
        if 'and' in loan_type:
            loan_type = loan_type[4 : ]
        if loan_type in loan_type_dict:
            loan_type_dict[loan_type] += 1
        else:
            loan_type_dict[loan_type] = 1

loan_type_dict

{'Auto Loan': 37992,
 'Credit-Builder Loan': 40440,
 'Personal Loan': 38888,
 'Home Equity Loan': 39104,
 'Not Specified': 39616,
 'Mortgage Loan': 38936,
 'Student Loan': 38968,
 'Debt Consolidation Loan': 38776,
 'Payday Loan': 40568}

In [44]:
# Creating 8 different lists for each loan type in training dataset

auto_loan = [0] * (len(df))
credit_builder_loan = [0] * (len(df))
personal_loan = [0] * (len(df))
home_equity_loan = [0] * (len(df))
mortgage_loan = [0] * (len(df))
student_loan = [0] * (len(df))
debt_consolidation_loan = [0] * (len(df))
payday_loan = [0] * (len(df))

In [45]:
# Using 0's and 1's if a customer has a particular loan

for index in range(len(loan_type_data)):
    # For Auto Loan
    if 'Auto' in loan_type_data[index]:
        auto_loan[index] = 1
    
    # For Credit Builder Loan
    if 'Credit-Builder' in loan_type_data[index]:
        credit_builder_loan[index] = 1
        
    # For Personal Loan
    if 'Personal' in loan_type_data[index]:
        personal_loan[index] = 1
    
    # For Home Equity Loan
    if 'Home' in loan_type_data[index]:
        home_equity_loan[index] = 1
    
    # For Mortgage Loan
    if 'Mortgage' in loan_type_data[index]:
        mortgage_loan[index] = 1
    
    # For Student Loan
    if 'Student' in loan_type_data[index]:
        student_loan[index] = 1
        
    # For Debt Consolidation loan
    if 'Debt' in loan_type_data[index]:
        debt_consolidation_loan[index] = 1
    
    # For Payday loan
    if 'Payday' in loan_type_data[index]:
        payday_loan[index] = 1

In [46]:
# Adding the new columns to the training dataset

df['Auto_Loan'] = auto_loan
df['Credit_Builder_Loan'] = credit_builder_loan
df['Personal_Loan'] = personal_loan
df['Home_Enquity_Loan'] = home_equity_loan
df['Mortgage_Loan'] = mortgage_loan
df['Student_Loan'] = student_loan
df['Debt_Consolidation_Loan'] = debt_consolidation_loan
df['Payday_Loan'] = payday_loan

#### For Test data

In [47]:
# Fetching the not null data of the column for test data - Type of Data 

index_values_test = ~df_test['Type_of_Loan'].isnull().values
loan_type_test = list(df_test['Type_of_Loan'][index_values_test])

In [48]:
# Create a dictionary to store the counts of all the various loan types for test data

loan_type_test_dict = dict()
for value in loan_type_test:
    values = value.split(',')
    for each_values in values:
        loan_types = each_values.strip(' ')
        if 'and' in loan_types:
            loan_types = loan_types[4 : ]
        if loan_types in loan_type_test_dict:
            loan_type_test_dict[loan_types] += 1
        else:
            loan_type_test_dict[loan_types] = 1

loan_type_test_dict

{'Auto Loan': 18996,
 'Credit-Builder Loan': 20220,
 'Personal Loan': 19444,
 'Home Equity Loan': 19552,
 'Not Specified': 19808,
 'Mortgage Loan': 19468,
 'Student Loan': 19484,
 'Debt Consolidation Loan': 19388,
 'Payday Loan': 20284}

In [49]:
# Creating 8 different lists for each loan type in test data

auto_loan = [0] * (len(df_test))
credit_builder_loan = [0] * (len(df_test))
personal_loan = [0] * (len(df_test))
home_equity_loan = [0] * (len(df_test))
mortgage_loan = [0] * (len(df_test))
student_loan = [0] * (len(df_test))
debt_consolidation_loan = [0] * (len(df_test))
payday_loan = [0] * (len(df_test))

In [50]:
# Using 0's and 1's if a customer has a particular loan

for index in range(len(loan_type_test)):
    # For Auto Loan
    if 'Auto' in loan_type_test[index]:
        auto_loan[index] = 1
    
    # For Credit Builder Loan
    if 'Credit-Builder' in loan_type_test[index]:
        credit_builder_loan[index] = 1
        
    # For Personal Loan
    if 'Personal' in loan_type_test[index]:
        personal_loan[index] = 1
    
    # For Home Equity Loan
    if 'Home' in loan_type_test[index]:
        home_equity_loan[index] = 1
    
    # For Mortgage Loan
    if 'Mortgage' in loan_type_test[index]:
        mortgage_loan[index] = 1
    
    # For Student Loan
    if 'Student' in loan_type_test[index]:
        student_loan[index] = 1
        
    # For Debt Consolidation loan
    if 'Debt' in loan_type_test[index]:
        debt_consolidation_loan[index] = 1
    
    # For Payday loan
    if 'Payday' in loan_type_test[index]:
        payday_loan[index] = 1

In [51]:
# Adding the new columns to the test dataset

df_test['Auto_Loan'] = auto_loan
df_test['Credit_Builder_Loan'] = credit_builder_loan
df_test['Personal_Loan'] = personal_loan
df_test['Home_Enquity_Loan'] = home_equity_loan
df_test['Mortgage_Loan'] = mortgage_loan
df_test['Student_Loan'] = student_loan
df_test['Debt_Consolidation_Loan'] = debt_consolidation_loan
df_test['Payday_Loan'] = payday_loan

In [52]:
# Removing the columns in both training and test data - Type_of_loan

df.drop(['Type_of_Loan'], axis = 1, inplace = True)
df_test.drop(['Type_of_Loan'], axis = 1, inplace = True)

In [53]:
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Monthly_Balance,Credit_Score,Auto_Loan,Credit_Builder_Loan,Personal_Loan,Home_Enquity_Loan,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan
0,4,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,312.494089,Good,1,1,1,1,0,0,0,0
1,3,23.0,13,19114.12,5389.246647,3.0,4.0,3.0,4.0,-1.0,...,284.629162,Good,1,1,1,1,0,0,0,0
2,7,33.0,13,19114.12,5389.246647,3.0,4.0,3.0,4.0,3.0,...,331.209863,Good,1,1,1,1,0,0,0,0
3,1,23.0,13,19114.12,5389.246647,3.0,4.0,3.0,4.0,5.0,...,223.45131,Good,1,1,1,1,0,0,0,0
4,8,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,...,341.489231,Good,1,1,1,1,0,0,0,0


In [54]:
df_test.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Amount_invested_monthly,Monthly_Balance,Auto_Loan,Credit_Builder_Loan,Personal_Loan,Home_Enquity_Loan,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan
0,4,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,236.642682,186.266702,1,1,1,1,0,0,0,0
1,3,24.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,21.46538,361.444004,1,1,1,1,0,0,0,0
2,2,24.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,-1.0,...,148.233938,264.675446,1,1,1,1,0,0,0,0
3,1,24.0,13,19114.12,4182.004291,3.0,4.0,3.0,4.0,4.0,...,39.082511,343.826873,1,1,1,1,0,0,0,0
4,4,28.0,0,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,...,39.684018,485.298434,0,1,0,0,0,0,0,0


In [55]:
df.describe()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Amount_invested_monthly,Monthly_Balance,Auto_Loan,Credit_Builder_Loan,Personal_Loan,Home_Enquity_Loan,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,4.5,33.31111,7.39346,176415.7,4195.228553,5.36279,5.52121,14.50045,3.51055,19.37588,...,637.412998,-3.08858e+22,0.3056,0.31728,0.31104,0.314,0.3136,0.3104,0.3104,0.31944
std,2.291299,10.61888,4.63016,1429618.0,2946.870586,2.577068,2.045632,8.653109,2.395985,12.644939,...,1997.034517,3.162147e+24,0.460663,0.46542,0.462921,0.464119,0.463958,0.46266,0.46266,0.466262
min,1.0,14.0,0.0,7005.93,303.645417,-1.0,0.0,1.0,0.0,-5.0,...,0.0,-3.333333e+26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.75,25.0,3.0,19457.5,1792.084167,3.0,4.0,8.0,2.0,10.0,...,77.017414,264.5521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.5,33.0,7.0,37578.61,3375.311674,5.0,5.0,13.0,3.0,17.0,...,143.127915,331.9282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.25,41.0,11.0,72790.92,5389.246647,7.0,7.0,20.0,5.0,26.0,...,304.7656,463.5022,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,8.0,56.0,15.0,24198060.0,15204.633333,11.0,11.0,34.0,9.0,55.0,...,10000.0,1602.041,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
df_test.describe()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Amount_invested_monthly,Monthly_Balance,Auto_Loan,Credit_Builder_Loan,Personal_Loan,Home_Enquity_Loan,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2.5,33.8006,7.39722,166334.2,4182.004291,5.36276,5.52008,14.50196,3.51154,19.36524,...,641.654795,-4.045471e+22,0.3056,0.31728,0.31104,0.314,0.3136,0.3104,0.3104,0.31944
std,1.118045,10.617942,4.617101,1351965.0,2926.447876,2.57905,2.049507,8.661373,2.393521,12.653706,...,2006.708477,3.651299e+24,0.460666,0.465422,0.462924,0.464121,0.46396,0.462662,0.462662,0.466264
min,1.0,14.0,0.0,7005.93,303.645417,-1.0,0.0,1.0,0.0,-5.0,...,0.0,-3.333333e+26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.75,25.0,3.0,19453.33,1794.304167,3.0,4.0,8.0,2.0,10.0,...,77.031011,268.319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.5,33.0,7.0,37577.82,3848.6825,5.0,5.0,13.0,3.0,17.0,...,143.245476,335.1782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.25,42.0,11.0,72817.02,5338.9675,7.0,7.0,20.0,5.0,26.0,...,305.878898,468.5712,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,4.0,56.0,15.0,24137260.0,15204.633333,11.0,11.0,34.0,9.0,55.0,...,10000.0,1606.518,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
df.isnull().sum()

Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Monthly_Balance             0
Credit_Score                0
Auto_Loan                   0
Credit_Builder_Loan         0
Personal_Loan               0
Home_Enquity_Loan           0
Mortgage_Loan               0
Student_Loan                0
Debt_Consolidation_Loan     0
Payday_Loan                 0
dtype: int64

In [58]:
df_test.isnull().sum()

Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Monthly_Balance             0
Auto_Loan                   0
Credit_Builder_Loan         0
Personal_Loan               0
Home_Enquity_Loan           0
Mortgage_Loan               0
Student_Loan                0
Debt_Consolidation_Loan     0
Payday_Loan                 0
dtype: int64

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     100000 non-null  int8   
 1   Age                       100000 non-null  float64
 2   Occupation                100000 non-null  int8   
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     100000 non-null  float64
 5   Num_Bank_Accounts         100000 non-null  float64
 6   Num_Credit_Card           100000 non-null  float64
 7   Interest_Rate             100000 non-null  float64
 8   Num_of_Loan               100000 non-null  float64
 9   Delay_from_due_date       100000 non-null  float64
 10  Num_of_Delayed_Payment    100000 non-null  float64
 11  Changed_Credit_Limit      100000 non-null  float64
 12  Num_Credit_Inquiries      100000 non-null  float64
 13  Credit_Mix                100000 non-null  in

In [60]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     50000 non-null  int8   
 1   Age                       50000 non-null  float64
 2   Occupation                50000 non-null  int8   
 3   Annual_Income             50000 non-null  float64
 4   Monthly_Inhand_Salary     50000 non-null  float64
 5   Num_Bank_Accounts         50000 non-null  float64
 6   Num_Credit_Card           50000 non-null  float64
 7   Interest_Rate             50000 non-null  float64
 8   Num_of_Loan               50000 non-null  float64
 9   Delay_from_due_date       50000 non-null  float64
 10  Num_of_Delayed_Payment    50000 non-null  float64
 11  Changed_Credit_Limit      50000 non-null  float64
 12  Num_Credit_Inquiries      50000 non-null  float64
 13  Credit_Mix                50000 non-null  int8   
 14  Outsta

In [61]:
encode=LabelEncoder()
df['Credit_Score'] = encode.fit_transform(df['Credit_Score'])

## Modeling

In [62]:
x = df.drop('Credit_Score', axis = 1)
y = df['Credit_Score']

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

In [64]:
# fit scaler on training data
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

# transform the testing data column
x_test = sc.transform(x_test)


In [65]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_lr = lr.predict(x_test)

In [66]:
lr.score(x_test, y_test)

0.63805

In [67]:
cm = confusion_matrix(y_test, y_lr)
cm

array([[1723,   51, 1818],
       [ 257, 2818, 2718],
       [ 970, 1425, 8220]], dtype=int64)

In [68]:
print(classification_report(y_test, y_lr))

              precision    recall  f1-score   support

           0       0.58      0.48      0.53      3592
           1       0.66      0.49      0.56      5793
           2       0.64      0.77      0.70     10615

    accuracy                           0.64     20000
   macro avg       0.63      0.58      0.60     20000
weighted avg       0.64      0.64      0.63     20000



In [69]:
rand = RandomForestClassifier(n_estimators=300)
rand.fit(x_train, y_train)
y_rand = rand.predict(x_test)

In [70]:
rand.score(x_test, y_test)

0.82045

In [71]:
cm = confusion_matrix(y_test, y_rand)
cm

array([[2678,   15,  899],
       [  37, 4843,  913],
       [ 654, 1073, 8888]], dtype=int64)

In [72]:
print(classification_report(y_test, y_rand))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77      3592
           1       0.82      0.84      0.83      5793
           2       0.83      0.84      0.83     10615

    accuracy                           0.82     20000
   macro avg       0.81      0.81      0.81     20000
weighted avg       0.82      0.82      0.82     20000



In [73]:
dtree = DecisionTreeClassifier(criterion = 'entropy')
dtree.fit(x_train, y_train)
y_dtree = dtree.predict(x_test)

In [74]:
dtree.score(x_test, y_test)

0.73175

In [75]:
cm = confusion_matrix(y_test, y_dtree)
cm

array([[2364,  144, 1084],
       [ 154, 4186, 1453],
       [1055, 1475, 8085]], dtype=int64)

In [76]:
print(classification_report(y_test, y_dtree))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      3592
           1       0.72      0.72      0.72      5793
           2       0.76      0.76      0.76     10615

    accuracy                           0.73     20000
   macro avg       0.71      0.71      0.71     20000
weighted avg       0.73      0.73      0.73     20000



In [78]:
xgb  = XGBClassifier(n_estimators=700)
xgb.fit(x_train, y_train)
y_xgb = xgb.predict(x_test)

In [79]:
xgb.score(x_test, y_test)

0.8242

In [80]:
cm = confusion_matrix(y_test, y_xgb)
cm

array([[2740,   14,  838],
       [  34, 4805,  954],
       [ 646, 1030, 8939]], dtype=int64)

In [81]:
print(classification_report(y_test, y_xgb))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78      3592
           1       0.82      0.83      0.83      5793
           2       0.83      0.84      0.84     10615

    accuracy                           0.82     20000
   macro avg       0.82      0.81      0.81     20000
weighted avg       0.82      0.82      0.82     20000



In [82]:
knn=KNeighborsClassifier(n_neighbors=11, algorithm = 'auto', p = 2, metric = 'minkowski')
knn.fit(x_train, y_train)
knn_prediction=knn.predict(x_test)

In [83]:
knn.score(x_test, y_test)

0.7098

In [84]:
cm = confusion_matrix(y_test, knn_prediction)
cm

array([[2149,   77, 1366],
       [ 286, 3778, 1729],
       [1027, 1319, 8269]], dtype=int64)

In [85]:
print(classification_report(y_test, knn_prediction))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61      3592
           1       0.73      0.65      0.69      5793
           2       0.73      0.78      0.75     10615

    accuracy                           0.71     20000
   macro avg       0.69      0.68      0.68     20000
weighted avg       0.71      0.71      0.71     20000



### Using XGBoost which is the best model to predict Credit score for test data

In [86]:
X_test = df_test

In [87]:
test_prediction = xgb.predict(X_test)

In [88]:
df_test['Credit_Score'] = test_prediction
df_test.head(20)

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Monthly_Balance,Auto_Loan,Credit_Builder_Loan,Personal_Loan,Home_Enquity_Loan,Mortgage_Loan,Student_Loan,Debt_Consolidation_Loan,Payday_Loan,Credit_Score
0,4,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,186.266702,1,1,1,1,0,0,0,0,2
1,3,24.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,361.444004,1,1,1,1,0,0,0,0,2
2,2,24.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,-1.0,...,264.675446,1,1,1,1,0,0,0,0,2
3,1,24.0,13,19114.12,4182.004291,3.0,4.0,3.0,4.0,4.0,...,343.826873,1,1,1,1,0,0,0,0,2
4,4,28.0,0,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,...,485.298434,0,1,0,0,0,0,0,0,2
5,3,28.0,14,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,...,303.355083,0,1,0,0,0,0,0,0,2
6,2,28.0,14,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,...,452.302307,0,1,0,0,0,0,0,0,2
7,1,28.0,14,34847.84,3037.986667,2.0,4.0,6.0,1.0,3.0,...,421.447964,0,1,0,0,0,0,0,0,2
8,4,35.0,5,143162.64,4182.004291,1.0,5.0,8.0,3.0,8.0,...,854.226027,1,0,0,0,0,0,0,0,2
9,3,35.0,5,143162.64,12187.22,1.0,5.0,8.0,3.0,6.0,...,788.11455,1,0,0,0,0,0,0,0,2
