In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv(r'C:\Users\abida\Documents\GitHub\Credit_Risk--Probability_of_default\cs-training.csv').drop(['Unnamed: 0'],axis=1)
test = pd.read_csv(r'C:\Users\abida\Documents\GitHub\Credit_Risk--Probability_of_default\cs-test.csv').drop(['Unnamed: 0'],axis=1)

# 1- Data Exploration:

In [3]:
train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
# .duplicated returns True/False if the entire row is duplicated
train.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
149995    False
149996    False
149997    False
149998    False
149999    False
Length: 150000, dtype: bool

In [5]:
#to see how many duplicates we will simply use value_counts().
train.duplicated().value_counts()

False    149391
True        609
Name: count, dtype: int64

we have 609 records that are duplicates. i will just delete them as initially i'm looking for my model to train on unique cases.

In [6]:
train_redup = train.drop_duplicates()

In [7]:
#as you can see we no longer have duplicates
train_redup.duplicated().value_counts()

False    149391
Name: count, dtype: int64

I will first take a look on what does the training data look like

In [8]:
train_redup.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


let's take a look on the shape of the training data as well as the test data

In [9]:
train_redup.shape


(149391, 11)

In [10]:
test.shape

(101503, 11)

let's see if we have null values.

In [11]:
train_redup.isnull().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29221
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3828
dtype: int64

In [12]:
test.isnull().sum()

SeriousDlqin2yrs                        101503
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
NumberOfTime30-59DaysPastDueNotWorse         0
DebtRatio                                    0
MonthlyIncome                            20103
NumberOfOpenCreditLinesAndLoans              0
NumberOfTimes90DaysLate                      0
NumberRealEstateLoansOrLines                 0
NumberOfTime60-89DaysPastDueNotWorse         0
NumberOfDependents                        2626
dtype: int64

first we will look into the null values of number of Dependents and see what we can do about it.

In [13]:
train_redup[train_redup.NumberOfDependents.isnull()]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
8,0,0.116951,27,0,46.0,,2,0,0,0,
96,0,0.542243,48,2,10.0,,2,0,0,0,
109,0,0.041258,61,0,4739.0,,11,0,4,0,
159,0,0.000000,63,0,2.0,,4,0,0,0,
238,0,1.000000,28,0,0.0,,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
149826,0,0.027319,71,0,1419.0,,13,0,1,0,
149854,0,0.056589,75,0,8.0,,3,0,0,0,
149894,0,0.017904,55,0,1058.0,,8,0,1,1,
149948,0,0.055530,27,0,10.0,,1,0,0,0,


In [14]:
train_redup[train_redup.NumberOfDependents==0]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
6,0,0.305682,57,0,5710.000000,,8,0,3,0,0.0
7,0,0.754464,39,0,0.209940,3500.0,8,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149994,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149997,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


As you can see my first instinct was to see if that just meant that the borrower had no dependents that was wrong in that case we wouldn't have had any records where the number of dependents is equal to 0. so in this case i will try to find the best way to replace the missing values to get the best results.

In [16]:
train_redup[train_redup.NumberOfDependents.isnull()].describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,3828.0,3828.0,3828.0,3828.0,3828.0,0.0,3828.0,3828.0,3828.0,3828.0,0.0
mean,0.046499,11.003369,59.741641,0.572623,1110.713689,,5.708986,0.497126,0.605799,0.474138,
std,0.210592,240.656436,18.345175,6.52732,4235.410634,,4.086337,6.523199,0.92099,6.520343,
min,0.0,0.0,21.0,0.0,0.0,,0.0,0.0,0.0,0.0,
25%,0.0,0.009228,48.0,0.0,25.0,,3.0,0.0,0.0,0.0,
50%,0.0,0.04767,61.0,0.0,398.0,,5.0,0.0,0.0,0.0,
75%,0.0,0.259028,74.0,0.0,1587.0,,8.0,0.0,1.0,0.0,
max,1.0,10821.0,109.0,98.0,220516.0,,30.0,98.0,15.0,98.0,


In [17]:
train_redup.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,149391.0,149391.0,149391.0,149391.0,149391.0,120170.0,149391.0,149391.0,149391.0,149391.0,145563.0
mean,0.066999,6.071087,52.306237,0.393886,354.43674,6675.098,8.480892,0.23812,1.022391,0.212503,0.759863
std,0.250021,250.263672,14.725962,3.852953,2041.843455,14389.58,5.136515,3.826165,1.130196,3.810523,1.116141
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.030132,41.0,0.0,0.177441,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154235,52.0,0.0,0.368234,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.556494,63.0,0.0,0.875279,8250.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


as you can see records with missing values of Number of dependents have less people who experienced 90 days past due delinquency or worse.

In [20]:
train_redup["NumberOfDependents"].agg(['mode'])

Unnamed: 0,mode
0,0.0


In [21]:
train_redup.groupby(['NumberOfDependents']).size()

NumberOfDependents
0.0     86392
1.0     26314
2.0     19521
3.0      9483
4.0      2862
5.0       746
6.0       158
7.0        51
8.0        24
9.0         5
10.0        5
13.0        1
20.0        1
dtype: int64

most of the past customers do not have any dependents.

so we will fill the missing values of this variable(NumberOfDependents) with 0 and see what's that gonna look like.

In [26]:
fam_miss = train_redup[train_redup.NumberOfDependents.isnull()]
fam_nmiss = train_redup[train_redup.NumberOfDependents.notnull()]

In [27]:
fam_miss['NumberOfDependents'] = fam_miss['NumberOfDependents'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fam_miss['NumberOfDependents'] = fam_miss['NumberOfDependents'].fillna(0)


In [30]:
fam_miss.isnull().sum()

SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                           3828
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                         0
dtype: int64

----------------------------

Same with the monthly income.


In [23]:
train_redup[train_redup.MonthlyIncome.isnull()]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
6,0,0.305682,57,0,5710.0,,8,0,3,0,0.0
8,0,0.116951,27,0,46.0,,2,0,0,0,
16,0,0.061086,78,0,2058.0,,10,0,2,0,0.0
32,0,0.083418,62,0,977.0,,6,0,1,0,0.0
41,0,0.072898,81,0,75.0,,7,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149976,0,0.000627,76,0,60.0,,5,0,0,0,0.0
149977,0,0.236450,29,0,349.0,,3,0,0,0,0.0
149984,0,0.037548,84,0,25.0,,5,0,0,0,0.0
149992,0,0.871976,50,0,4132.0,,11,0,1,0,3.0


In [24]:
train_redup[train_redup.MonthlyIncome==0]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
14,0,0.019657,76,0,477.0,0.0,6,0,1,0,0.0
50,0,0.818978,73,0,3095.0,0.0,9,0,1,1,0.0
73,0,0.059669,31,0,3162.0,0.0,11,0,2,0,1.0
90,0,0.039388,51,0,15466.0,0.0,7,0,0,0,0.0
298,0,0.085152,25,0,1005.0,0.0,5,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149383,0,0.239302,63,0,3153.0,0.0,29,0,1,0,0.0
149562,0,0.230799,72,0,430.0,0.0,7,0,0,0,0.0
149646,0,0.080084,69,0,1248.0,0.0,8,0,1,0,1.0
149673,0,0.132362,82,0,4123.0,0.0,14,0,2,0,0.0


In [25]:
train_redup[train_redup.MonthlyIncome==0].describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0,1616.0
mean,0.040223,4.336369,48.620668,0.421411,1591.094678,0.0,7.131188,0.309406,0.724629,0.287129,0.738861
std,0.196542,147.433057,16.558826,4.896579,2828.748394,0.0,4.831283,4.890459,0.905593,4.881175,1.190728
min,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.020273,36.0,0.0,110.5,0.0,3.0,0.0,0.0,0.0,0.0
50%,0.0,0.107736,47.0,0.0,946.0,0.0,6.0,0.0,0.0,0.0,0.0
75%,0.0,0.472699,62.0,0.0,2204.0,0.0,10.0,0.0,1.0,0.0,1.0
max,1.0,5893.0,97.0,98.0,60212.0,0.0,31.0,98.0,9.0,98.0,10.0


In [32]:
fam_miss[fam_miss.MonthlyIncome.isnull()].describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,3828.0,3828.0,3828.0,3828.0,3828.0,0.0,3828.0,3828.0,3828.0,3828.0,3828.0
mean,0.046499,11.003369,59.741641,0.572623,1110.713689,,5.708986,0.497126,0.605799,0.474138,0.0
std,0.210592,240.656436,18.345175,6.52732,4235.410634,,4.086337,6.523199,0.92099,6.520343,0.0
min,0.0,0.0,21.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.009228,48.0,0.0,25.0,,3.0,0.0,0.0,0.0,0.0
50%,0.0,0.04767,61.0,0.0,398.0,,5.0,0.0,0.0,0.0,0.0
75%,0.0,0.259028,74.0,0.0,1587.0,,8.0,0.0,1.0,0.0,0.0
max,1.0,10821.0,109.0,98.0,220516.0,,30.0,98.0,15.0,98.0,0.0


In [34]:
fam_miss['MonthlyIncome'].agg(['mean','median','min','max'])

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


mean     NaN
median   NaN
min      NaN
max      NaN
Name: MonthlyIncome, dtype: float64

### _Hypothesis 1:_
Let's assume the worst case possible for people who have missing values of monthly income is because they want to hide the fact that it's 0 (for a better deal maybe? idk)