In [504]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
import pickle

In [505]:
df=pd.read_csv('test.csv')

In [506]:
df.isna().sum()

ID                             0
Customer_ID                    0
Month                          0
Name                        5015
Age                            0
SSN                            0
Occupation                     0
Annual_Income                  0
Monthly_Inhand_Salary       7498
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                5704
Delay_from_due_date            0
Num_of_Delayed_Payment      3498
Changed_Credit_Limit           0
Num_Credit_Inquiries        1035
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age          4470
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     2271
Payment_Behaviour              0
Monthly_Balance              562
dtype: int64

In [507]:
df.dropna(inplace=True)

In [508]:
df['Credit_Mix'].value_counts()

Credit_Mix
Standard    9757
Good        5802
Bad         5654
_           5246
Name: count, dtype: int64

In [509]:
df.drop(columns=['ID','Customer_ID','Name','SSN','Type_of_Loan'],inplace=True) # removing useless columns

In [510]:
df['Age']=df['Age'].str.strip('_') # stripping '-' character

In [511]:
df['Age']=df['Age'].astype(float) # converting data type float

In [512]:
df=df[df['Occupation']!='_______'] # removing the values '_______' from the occupation column

In [513]:
df['Annual_Income']=df['Annual_Income'].str.strip('_') # stripping '-' character

In [514]:
df['Annual_Income']=df['Annual_Income'].astype(float) # converting datatype

In [515]:
df['Monthly_Inhand_Salary']=df['Monthly_Inhand_Salary'].astype(float) # converting datatype

In [516]:
df['Num_of_Loan']=df['Num_of_Loan'].str.strip('_') # stripping '-' character

In [517]:
df['Num_of_Loan']=df['Num_of_Loan'].astype(float) # converting datatype

In [518]:
df['Num_of_Delayed_Payment']=df['Num_of_Delayed_Payment'].str.strip('_') # stripping '-' character

In [519]:
df['Num_of_Delayed_Payment']=df['Num_of_Delayed_Payment'].astype(float) # converting datatype

In [520]:
df['Changed_Credit_Limit']=df['Changed_Credit_Limit'].str.strip('_') # stripping '-' character

In [521]:
df=df[df['Changed_Credit_Limit']!=''] # removing blank space from the column 

In [522]:
df['Changed_Credit_Limit']=df['Changed_Credit_Limit'].astype(float) # converting datatype

In [523]:
df=df[df['Credit_Mix']!='_'] # removing '-' character from the columns

In [524]:
df['Outstanding_Debt']=df['Outstanding_Debt'].str.strip('_') # stripping '-' character

In [525]:
df['Outstanding_Debt']=df['Outstanding_Debt'].astype(float) # converting datatype

In [526]:
df=df[df['Payment_of_Min_Amount']!='NM'] # removing irrelavant character from the columns since it is a boolean column

In [527]:
df['Amount_invested_monthly']=df['Amount_invested_monthly'].str.strip('_') # stripping '-' character

In [528]:
df['Amount_invested_monthly']=df['Amount_invested_monthly'].astype(float) # converting datatype

In [529]:
df['Monthly_Balance']=df['Monthly_Balance'].str.strip('_') # stripping '-' character

In [530]:
df['Monthly_Balance']=df['Monthly_Balance'].astype(float) # converting datatype

IQR Method

In [531]:
iqr_dict={}
for i in df:
  if df[i].dtype in [int,float]:
    q1=df[i].quantile(0.25)
    q3=df[i].quantile(0.75)
    iqr=q3-q1
    min_range=q1-(1.5*iqr)
    max_range=q3+(1.5*iqr)
    iqr_dict[i]=(min_range,max_range)
iqr_dict

{'Age': (np.float64(-0.5), np.float64(67.5)),
 'Annual_Income': (np.float64(-57754.87750000001), np.float64(146458.9425)),
 'Monthly_Inhand_Salary': (np.float64(-4620.501249999998),
  np.float64(11893.535416666664)),
 'Num_Bank_Accounts': (np.float64(-2.0), np.float64(14.0)),
 'Num_Credit_Card': (np.float64(-0.5), np.float64(11.5)),
 'Interest_Rate': (np.float64(-13.0), np.float64(43.0)),
 'Num_of_Loan': (np.float64(-4.0), np.float64(12.0)),
 'Delay_from_due_date': (np.float64(-16.0), np.float64(56.0)),
 'Num_of_Delayed_Payment': (np.float64(-6.0), np.float64(34.0)),
 'Changed_Credit_Limit': (np.float64(-9.28), np.float64(30.08)),
 'Num_Credit_Inquiries': (np.float64(-4.0), np.float64(20.0)),
 'Outstanding_Debt': (np.float64(-1669.1175000000003), np.float64(4394.5825)),
 'Credit_Utilization_Ratio': (np.float64(15.468079372165995),
  np.float64(48.89557197052333)),
 'Total_EMI_per_month': (np.float64(-165.3502450625721),
  np.float64(390.0444805438806)),
 'Amount_invested_monthly': (np.

In [532]:
df=df[(df['Age']>0) & (df['Age']<100)] # there are ages which are negative and greater than 100 

In [533]:
df=df[(df['Num_Bank_Accounts']>0) & (df['Num_Bank_Accounts']<11)] # filtering out the relevant values 

In [534]:
df=df[(df['Num_Credit_Card']>0) & (df['Num_Credit_Card']<11)] # filtering out the relevant values

In [535]:
df=df[(df['Interest_Rate']>0) & (df['Interest_Rate']<=iqr_dict['Interest_Rate'][1])] # filtering out the relevant values using iqr method

In [536]:
df=df[(df['Num_of_Loan']>0) & (df['Num_of_Loan']<=12)] # filtering out the relevant values

In [537]:
df=df[df['Delay_from_due_date']>0] # filtering out the relevant values

In [538]:
df=df[(df['Num_of_Delayed_Payment']>=0) & (df['Num_of_Delayed_Payment']<=34)] # filtering out the relevant values

In [539]:
df=df[(df['Num_Credit_Inquiries']>0) & (df['Num_Credit_Inquiries']<=20)] # filtering out the relevant values

In [540]:
# df['Outstanding_Debt'] is in US dollars
df=df[df['Outstanding_Debt']>12] # filtering out the relevant values

In [541]:
df['Credit_History_Age'].unique()

array(['22 Years and 10 Months', '27 Years and 4 Months',
       '18 Years and 6 Months', '17 Years and 11 Months',
       '18 Years and 1 Months', '18 Years and 2 Months',
       '27 Years and 1 Months', '27 Years and 2 Months',
       '19 Years and 10 Months', '27 Years and 6 Months',
       '27 Years and 7 Months', '18 Years and 11 Months',
       '19 Years and 1 Months', '18 Years and 10 Months',
       '19 Years and 0 Months', '10 Years and 9 Months',
       '10 Years and 10 Months', '11 Years and 0 Months',
       '12 Years and 11 Months', '18 Years and 8 Months',
       '18 Years and 9 Months', '14 Years and 4 Months',
       '14 Years and 6 Months', '14 Years and 7 Months',
       '20 Years and 0 Months', '20 Years and 1 Months',
       '30 Years and 11 Months', '31 Years and 2 Months',
       '19 Years and 7 Months', '9 Years and 4 Months',
       '9 Years and 5 Months', '13 Years and 9 Months',
       '13 Years and 11 Months', '22 Years and 2 Months',
       '28 Years and 10 

In [None]:
df['Credit_History_Age'] = df['Credit_History_Age'].apply(lambda x: (int(re.search(r'(\d+)\s*[yY]ears?', x).group(1)) * 12) + int(re.search(r'(\d+)\s*[nM]onths?',x).group(1)))

In [543]:
df.rename({'Credit_History_Age':'Credit_History_Age(in months)'},inplace=True) # for better understanding renaming the column name

In [544]:
df['Total_EMI_per_month'] # in US Dollars

1         49.574949
5         18.816215
9        246.992319
12        16.415452
14        16.415452
            ...    
49966     28.182033
49976    362.072453
49992     60.964772
49993     60.964772
49997     35.104023
Name: Total_EMI_per_month, Length: 13318, dtype: float64

In [545]:
df=df[df['Total_EMI_per_month']>12]  # filtering out the relevant values

In [546]:
df=df[df['Payment_Behaviour']!='!@9#%8'] # removing the irrelevant values

In [547]:
df=df[df['Monthly_Balance']>0] # removing negative values

Ordinal Encoding for month

In [548]:
month=OrdinalEncoder(categories=[['September', 'October', 'November', 'December']])
df['Month']=month.fit_transform(df[['Month']])

Ordinal encoding for occupation

In [549]:
occupation=LabelEncoder()
df['Occupation']=occupation.fit_transform(df['Occupation'])

Ordinal Encoding in credit_mix

In [550]:
df['Credit_Mix'].unique()

array(['Good', 'Standard', 'Bad'], dtype=object)

In [551]:
credit_mix=OrdinalEncoder(categories=[['Bad','Standard','Good']])
df['Credit_Mix']=credit_mix.fit_transform(df[['Credit_Mix']])

Ordinal encoding in Payment_Behaviour

In [552]:
df['Payment_Behaviour'].unique()

array(['High_spent_Medium_value_payments',
       'Low_spent_Large_value_payments',
       'Low_spent_Medium_value_payments',
       'Low_spent_Small_value_payments',
       'High_spent_Large_value_payments',
       'High_spent_Small_value_payments'], dtype=object)

In [553]:
behavior=OrdinalEncoder(categories=[['Low_spent_Small_value_payments','Low_spent_Medium_value_payments','Low_spent_Large_value_payments','High_spent_Small_value_payments','High_spent_Medium_value_payments','High_spent_Large_value_payments']])
df['Payment_Behaviour']=behavior.fit_transform(df[['Payment_Behaviour']])

In [554]:
Payment_of_Min_Amount=LabelEncoder()
df['Payment_of_Min_Amount']=Payment_of_Min_Amount.fit_transform(df['Payment_of_Min_Amount'])

In [555]:
df['Interest_Rate'].dtype

dtype('int64')

Checking for negative or any outliers

In [556]:
for i in df:
  if df[i].dtype in [int,float]:
    print(f"Minimum value of {i}:",df[i].min())
    print(f'Maximum_value of {i}:',df[i].max())
    print()

Minimum value of Month: 0.0
Maximum_value of Month: 3.0

Minimum value of Age: 14.0
Maximum_value of Age: 95.0

Minimum value of Occupation: 0
Maximum_value of Occupation: 14

Minimum value of Annual_Income: 7005.93
Maximum_value of Annual_Income: 23994243.0

Minimum value of Monthly_Inhand_Salary: 303.6454166666666
Maximum_value of Monthly_Inhand_Salary: 15167.18

Minimum value of Num_Bank_Accounts: 1
Maximum_value of Num_Bank_Accounts: 10

Minimum value of Num_Credit_Card: 1
Maximum_value of Num_Credit_Card: 10

Minimum value of Interest_Rate: 1
Maximum_value of Interest_Rate: 34

Minimum value of Num_of_Loan: 1.0
Maximum_value of Num_of_Loan: 9.0

Minimum value of Delay_from_due_date: 1
Maximum_value of Delay_from_due_date: 67

Minimum value of Num_of_Delayed_Payment: 0.0
Maximum_value of Num_of_Delayed_Payment: 28.0

Minimum value of Changed_Credit_Limit: -6.4
Maximum_value of Changed_Credit_Limit: 35.81

Minimum value of Num_Credit_Inquiries: 1.0
Maximum_value of Num_Credit_Inquir

In [557]:
df.drop(index=df[df['Age']==95].index,inplace=True) 

Modeling

In [559]:
x=df.drop(columns=['Credit_Mix'])
y=df['Credit_Mix']

In [560]:
minmax=MinMaxScaler()
x_scaled=minmax.fit_transform(x)

In [561]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.3,random_state=42)

In [562]:
models=[KNeighborsClassifier(),SVC(),MultinomialNB(),DecisionTreeClassifier(random_state=42),RandomForestClassifier(random_state=42),GradientBoostingClassifier(random_state=42),AdaBoostClassifier(random_state=42),XGBClassifier(random_state=42)]
for model in models:
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(f'Train data-{str(model).split('(')[0]}:',model.score(x_train,y_train)*100)
  print(f'Test data-{str(model).split('(')[0]}:',accuracy_score(y_test,y_pred)*100)
  print()

Train data-KNeighborsClassifier: 93.44941956882255
Test data-KNeighborsClassifier: 90.74074074074075

Train data-SVC: 93.98246860933428
Test data-SVC: 92.92426755113323

Train data-MultinomialNB: 80.6562425965411
Test data-MultinomialNB: 81.53676064123825

Train data-DecisionTreeClassifier: 100.0
Test data-DecisionTreeClassifier: 93.06246545052515

Train data-RandomForestClassifier: 100.0
Test data-RandomForestClassifier: 95.6882255389718

Train data-GradientBoostingClassifier: 96.12651030561479
Test data-GradientBoostingClassifier: 94.47208402432284

Train data-AdaBoostClassifier: 84.9680170575693
Test data-AdaBoostClassifier: 84.5771144278607

Train data-XGBClassifier: 100.0
Test data-XGBClassifier: 96.04754007739082



feature selection using wrapper selection method

In [563]:
x=df.drop(columns='Credit_Mix')
y=df['Credit_Mix']

In [564]:
model=RandomForestClassifier(random_state=42)
rfe=RFE(model,n_features_to_select=11)
rfe.fit(x,y)

In [565]:
x.columns[rfe.get_support()]

Index(['Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Outstanding_Debt',
       'Credit_History_Age', 'Payment_of_Min_Amount'],
      dtype='object')

In [566]:
x=df[x.columns[rfe.get_support()]]

In [None]:
# bad -0
# standard -1
# good -2

# no -0
# yes -1

In [567]:
z=x.copy()
z['credit_mix']=y


In [593]:
z[(z['credit_mix']==2)]

Unnamed: 0,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Outstanding_Debt,Credit_History_Age,Payment_of_Min_Amount,credit_mix
1,19114.12,3,4,3,4.0,3,9.0,13.27,809.98,274,0,2.0
5,34847.84,2,4,6,1.0,3,3.0,5.42,605.03,328,0,2.0
9,143162.64,1,5,8,3.0,6,3.0,2.10,1303.01,222,0,2.0
12,30689.89,2,5,4,1.0,5,6.0,1.99,632.46,215,0,2.0
14,30689.89,2,5,4,1.0,5,6.0,1.99,632.46,217,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49951,41329.56,2,3,7,1.0,2,10.0,13.66,1245.01,290,0,2.0
49956,20059.98,8,5,12,4.0,25,11.0,2.97,909.01,203,0,2.0
49957,20059.98,8,5,12,4.0,25,14.0,2.97,909.01,204,0,2.0
49976,38321.39,4,4,3,4.0,11,7.0,1.59,678.57,211,0,2.0


In [568]:
minmax=MinMaxScaler()
x_scaled=minmax.fit_transform(x)

In [569]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.3,random_state=42)

In [570]:
models=[KNeighborsClassifier(),SVC(),MultinomialNB(),DecisionTreeClassifier(random_state=42),RandomForestClassifier(random_state=42),GradientBoostingClassifier(random_state=42),AdaBoostClassifier(random_state=42),XGBClassifier(random_state=42)]
for model in models:
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(f'Train data-{str(model).split('(')[0]}:',model.score(x_train,y_train)*100)
  print(f'Test data-{str(model).split('(')[0]}:',accuracy_score(y_test,y_pred)*100)
  print()

Train data-KNeighborsClassifier: 95.72376214167258
Test data-KNeighborsClassifier: 93.20066334991708

Train data-SVC: 93.63894811656006
Test data-SVC: 93.36650082918739

Train data-MultinomialNB: 69.08315565031982
Test data-MultinomialNB: 70.39800995024875

Train data-DecisionTreeClassifier: 100.0
Test data-DecisionTreeClassifier: 93.80873410724156

Train data-RandomForestClassifier: 100.0
Test data-RandomForestClassifier: 96.37921503593145

Train data-GradientBoostingClassifier: 96.00805496327884
Test data-GradientBoostingClassifier: 94.7208402432283

Train data-AdaBoostClassifier: 84.9680170575693
Test data-AdaBoostClassifier: 84.5771144278607

Train data-XGBClassifier: 100.0
Test data-XGBClassifier: 96.18573797678275



Hyper parameter tuning in RandomForestClassifier

In [571]:
forest=RandomForestClassifier(random_state=42)
forest.fit(x_train,y_train)
y_pred=forest.predict(x_test)
accuracy_score(y_test,y_pred)*100

96.37921503593145

In [572]:
forest.score(x_train,y_train)*100

100.0

In [573]:
# params={'n_estimators':[50,100,150],'criterion':['gini','entropy','log_loss'],'max_depth':[7,8,9],'max_features':['sqrt','log2'],'random_state':[23,42]}
# clf=GridSearchCV(forest,params,cv=9,scoring='accuracy')
# clf.fit(x_train,y_train)

In [574]:
# y_pred=clf.predict(x_test)
# print("Accuracy_score:",accuracy_score(y_test,y_pred)*100)

pickling

In [575]:
dict1={'model':forest,'output':credit_mix,'Payment_of_Min_Amount':Payment_of_Min_Amount,'scaler':minmax}
with open('credit.pkl','wb') as obj1:
  pickle.dump(dict1,obj1)

In [576]:
y

1        2.0
5        2.0
9        2.0
12       2.0
14       2.0
        ... 
49966    0.0
49976    2.0
49992    0.0
49993    0.0
49997    2.0
Name: Credit_Mix, Length: 12060, dtype: float64

In [577]:
y

1        2.0
5        2.0
9        2.0
12       2.0
14       2.0
        ... 
49966    0.0
49976    2.0
49992    0.0
49993    0.0
49997    2.0
Name: Credit_Mix, Length: 12060, dtype: float64

In [578]:
x

Unnamed: 0,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Outstanding_Debt,Credit_History_Age,Payment_of_Min_Amount
1,19114.12,3,4,3,4.0,3,9.0,13.27,809.98,274,0
5,34847.84,2,4,6,1.0,3,3.0,5.42,605.03,328,0
9,143162.64,1,5,8,3.0,6,3.0,2.10,1303.01,222,0
12,30689.89,2,5,4,1.0,5,6.0,1.99,632.46,215,0
14,30689.89,2,5,4,1.0,5,6.0,1.99,632.46,217,0
...,...,...,...,...,...,...,...,...,...,...,...
49966,15319.65,6,7,15,4.0,53,16.0,17.13,1453.61,139,1
49976,38321.39,4,4,3,4.0,11,7.0,1.59,678.57,211,0
49992,20002.88,10,8,29,5.0,33,25.0,18.31,3571.70,76,1
49993,20002.88,10,8,29,5.0,33,25.0,18.31,3571.70,77,1


In [579]:
y

1        2.0
5        2.0
9        2.0
12       2.0
14       2.0
        ... 
49966    0.0
49976    2.0
49992    0.0
49993    0.0
49997    2.0
Name: Credit_Mix, Length: 12060, dtype: float64