In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data preprocessing - training data

In [34]:
df = pd.read_csv("cs-training.csv",index_col = False)#even after passing index_col = False it doesn't remove index

In [35]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [36]:
df.columns

Index(['Unnamed: 0', 'SeriousDlqin2yrs',
       'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [37]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [38]:
df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [39]:
# SeriousDlqin2yrs - Person experienced 90 days past due delinquency or worse (DL_90+)
# RevolvingUtilizationOfUnsecuredLines - Total balance on credit cards and personal lines of credit except 
#                                        real estate and no installment debt like car loans divided by the 
#                                        sum of credit limits(creditbalance/limits)
# age - Age of borrower in years(age)
# NumberOfTime30-59DaysPastDueNotWorse - Number of times borrower has been 30-59 days past due but no worse in the 
#                                        last 2 years (no.pastdue_30-59)
# DebtRatio - Monthly debt payments, alimony,living costs divided by monthy gross income (DebtRatio)
# MonthlyIncome - Monthly income (MonthlyIncome)
# NumberOfOpenCreditLinesAndLoans - Number of Open loans (installment like car loan or mortgage) 
#                                   and Lines of credit (e.g. credit cards) (no.of openloans)
# NumberOfTimes90DaysLate - Number of times borrower has been 90 days or more past due. (no.pasdue_90+)
# NumberRealEstateLoansOrLines - Number of mortgage and real estate loans including home equity lines of credit
#                                (no.of mortgage)
# NumberOfTime60-89DaysPastDueNotWorse - Number of times borrower has been 60-89 days past due but 
#                                        no worse in the last 2 years. (no.pasdue_60-89)
# NumberOfDependents - Number of dependents in family excluding themselves (spouse, children etc.) (no.dependents)

In [40]:
df.rename(columns = {'SeriousDlqin2yrs':'DL_90', 'RevolvingUtilizationOfUnsecuredLines':'creditbalance/limits',
       'NumberOfTime30-59DaysPastDueNotWorse':'no.pastdue_30-59', 
       'NumberOfOpenCreditLinesAndLoans':'no.of openloans', 'NumberOfTimes90DaysLate':'no.pasdue_90+',
       'NumberRealEstateLoansOrLines':'no.of mortgage', 'NumberOfTime60-89DaysPastDueNotWorse':'no.pasdue_60-89',
       'NumberOfDependents':'no.dependents'}, inplace = True)

In [41]:
df.head()

Unnamed: 0,DL_90,creditbalance/limits,age,no.pastdue_30-59,DebtRatio,MonthlyIncome,no.of openloans,no.pasdue_90+,no.of mortgage,no.pasdue_60-89,no.dependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [42]:
#it is a classification problem. Using all these features we have to predict the probability that
#somebody will experience financial distress in the next two years to decide whether or not a loan should be granted.
# will experience and won't experience are the two classes

In [43]:
df.shape

(150000, 11)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   DL_90                 150000 non-null  int64  
 1   creditbalance/limits  150000 non-null  float64
 2   age                   150000 non-null  int64  
 3   no.pastdue_30-59      150000 non-null  int64  
 4   DebtRatio             150000 non-null  float64
 5   MonthlyIncome         120269 non-null  float64
 6   no.of openloans       150000 non-null  int64  
 7   no.pasdue_90+         150000 non-null  int64  
 8   no.of mortgage        150000 non-null  int64  
 9   no.pasdue_60-89       150000 non-null  int64  
 10  no.dependents         146076 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 12.6 MB


## Taking care of missing values

In [45]:
df.isnull().sum()

DL_90                       0
creditbalance/limits        0
age                         0
no.pastdue_30-59            0
DebtRatio                   0
MonthlyIncome           29731
no.of openloans             0
no.pasdue_90+               0
no.of mortgage              0
no.pasdue_60-89             0
no.dependents            3924
dtype: int64

In [46]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy="mean")
imputer.fit(df.loc[:,['MonthlyIncome','no.dependents']])
df.loc[:,['MonthlyIncome','no.dependents']] = imputer.transform(df.loc[:,['MonthlyIncome','no.dependents']])

In [47]:
df.isnull().sum()

DL_90                   0
creditbalance/limits    0
age                     0
no.pastdue_30-59        0
DebtRatio               0
MonthlyIncome           0
no.of openloans         0
no.pasdue_90+           0
no.of mortgage          0
no.pasdue_60-89         0
no.dependents           0
dtype: int64

In [48]:
x_train = df.iloc[:,1:11]

In [49]:
x_train.head()

Unnamed: 0,creditbalance/limits,age,no.pastdue_30-59,DebtRatio,MonthlyIncome,no.of openloans,no.pasdue_90+,no.of mortgage,no.pasdue_60-89,no.dependents
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [50]:
y_train = df.iloc[:,0]

In [51]:
y_train.head()

0    1
1    0
2    0
3    0
4    0
Name: DL_90, dtype: int64

In [52]:
df.corr()

Unnamed: 0,DL_90,creditbalance/limits,age,no.pastdue_30-59,DebtRatio,MonthlyIncome,no.of openloans,no.pasdue_90+,no.of mortgage,no.pasdue_60-89,no.dependents
DL_90,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.018002,-0.029669,0.117175,-0.007038,0.102261,0.045621
creditbalance/limits,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.006565,-0.011281,-0.001061,0.006235,-0.001048,0.001539
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.032984,0.147705,-0.061005,0.03315,-0.057159,-0.208102
no.pastdue_30-59,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.007636,-0.055312,0.983603,-0.030565,0.987005,-0.002525
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.005355,0.049565,-0.00832,0.120046,-0.007533,-0.038287
MonthlyIncome,-0.018002,0.006565,0.032984,-0.007636,-0.005355,1.0,0.082319,-0.009484,0.113823,-0.008259,0.058542
no.of openloans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.082319,1.0,-0.079984,0.433959,-0.071077,0.064507
no.pasdue_90+,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.009484,-0.079984,1.0,-0.045205,0.992796,-0.009579
no.of mortgage,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.113823,0.433959,-0.045205,1.0,-0.039722,0.12337
no.pasdue_60-89,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.008259,-0.071077,0.992796,-0.039722,1.0,-0.010277


## Data preprocessing - test data

In [21]:
df_2 = pd.read_csv("cs-test.csv",index_col = False)

In [22]:
df_2.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


In [23]:
df_2.columns

Index(['Unnamed: 0', 'SeriousDlqin2yrs',
       'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [24]:
df_2 = df_2.drop(['Unnamed: 0'], axis = 1)

In [25]:
df_2.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [25]:
df_2.rename(columns = {'SeriousDlqin2yrs':'DL_90+', 'RevolvingUtilizationOfUnsecuredLines':'creditbalance/limits',
       'NumberOfTime30-59DaysPastDueNotWorse':'no.pastdue_30-59', 
       'NumberOfOpenCreditLinesAndLoans':'no.of openloans', 'NumberOfTimes90DaysLate':'no.pasdue_90+',
       'NumberRealEstateLoansOrLines':'no.of mortgage', 'NumberOfTime60-89DaysPastDueNotWorse':'no.pasdue_60-89',
       'NumberOfDependents':'no.dependents'}, inplace = True)

In [26]:
df_2.head()

Unnamed: 0,DL_90+,creditbalance/limits,age,no.pastdue_30-59,DebtRatio,MonthlyIncome,no.of openloans,no.pasdue_90+,no.of mortgage,no.pasdue_60-89,no.dependents
0,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


In [27]:
df_2.shape

(101503, 11)

## Taking care of missing values

In [28]:
df_2.isnull().sum()

DL_90+                  101503
creditbalance/limits         0
age                          0
no.pastdue_30-59             0
DebtRatio                    0
MonthlyIncome            20103
no.of openloans              0
no.pasdue_90+                0
no.of mortgage               0
no.pasdue_60-89              0
no.dependents             2626
dtype: int64

In [29]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy="mean")
imputer.fit(df_2.loc[:,['MonthlyIncome','no.dependents']])
df_2.loc[:,['MonthlyIncome','no.dependents']] = imputer.transform(df_2.loc[:,['MonthlyIncome','no.dependents']])

In [30]:
df_2.shape

(101503, 11)

In [31]:
df_2.isnull().sum()

DL_90+                  101503
creditbalance/limits         0
age                          0
no.pastdue_30-59             0
DebtRatio                    0
MonthlyIncome                0
no.of openloans              0
no.pasdue_90+                0
no.of mortgage               0
no.pasdue_60-89              0
no.dependents                0
dtype: int64

In [32]:
x_test = df_2.iloc[:,1:11]

In [33]:
x_test.head()

Unnamed: 0,creditbalance/limits,age,no.pastdue_30-59,DebtRatio,MonthlyIncome,no.of openloans,no.pasdue_90+,no.of mortgage,no.pasdue_60-89,no.dependents
0,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


In [34]:
y_test = df_2.iloc[:,0]

In [35]:
y_test.head()

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: DL_90+, dtype: float64

## Scaling the values

In [36]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [37]:
print(x_train)

[[-0.02115001 -0.49385982  0.37659296 ...  4.40954554 -0.05785249
   1.12938692]
 [-0.02038516 -0.83234222 -0.10041896 ... -0.90128301 -0.05785249
   0.22062674]
 [-0.02158222 -0.96773518  0.138087   ... -0.90128301 -0.05785249
  -0.68813345]
 ...
 [-0.02323239  0.38619443 -0.10041896 ... -0.01614492 -0.05785249
  -0.68813345]
 [-0.02421753 -1.50930703 -0.10041896 ... -0.90128301 -0.05785249
  -0.68813345]
 [-0.02081306  0.79237332 -0.10041896 ...  0.86899317 -0.05785249
  -0.68813345]]


In [38]:
print(x_test)

[[-0.02067197 -0.62925278 -0.10041896 ... -0.90128301 -0.05785249
  -0.68813345]
 [-0.02236253  0.31849795 -0.10041896 ...  2.63926936 -0.05785249
   1.12938692]
 [-0.02404426  0.45389091 -0.10041896 ... -0.01614492 -0.05785249
   1.12938692]
 ...
 [-0.02389082  1.1985522  -0.10041896 ... -0.90128301 -0.05785249
   0.01074529]
 [-0.02287439  0.25080147 -0.10041896 ...  0.86899317  0.18281181
   2.03814711]
 [-0.02244843 -1.57700351 -0.10041896 ... -0.90128301 -0.05785249
  -0.68813345]]


## Training the Random Forest Classification model on the Training set

In [39]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

## Predicting new result

In [40]:
#df.iloc[1:2,:]
print(classifier.predict(sc.transform([[0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0]])))

[0]


  "X does not have valid feature names, but"


## Predicting the test set result

In [41]:
y_pred = classifier.predict(x_test)

In [42]:
y_pred.reshape(len(y_pred),1)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [43]:
y_pred = pd.DataFrame(y_pred)

In [44]:
x_test = pd.DataFrame(x_test)

In [45]:
result = pd.concat([y_pred,x_test],axis = 1)

In [46]:
result.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,9
0,0,-0.020672,-0.629253,-0.100419,-0.17314,-0.075325,-0.865297,-0.063793,-0.901283,-0.057852,-0.688133
1,0,-0.022363,0.318498,-0.100419,-0.172969,0.191825,1.272313,-0.063793,2.639269,-0.057852,1.129387
2,0,-0.024044,0.453891,-0.100419,-0.17289,-0.123228,0.689329,-0.063793,-0.016145,-0.057852,1.129387
3,0,-0.023095,-0.967735,0.138087,-0.172773,-0.269419,-0.282312,-0.063793,0.868993,-0.057852,-0.688133
4,0,-0.020214,-1.712396,-0.100419,-0.173218,-0.21779,-0.865297,-0.063793,-0.901283,-0.057852,0.220627
