### Notebook for Handling Missing Values ###

In [1]:
import numpy as np
import pandas as pd

# Read the raw CSV: the first file column becomes the row index and the first
# file line supplies the column names (X1 ... X23, Y).
data = pd.read_csv(
    "../Default_of_Credit_Card_Clients.csv",
    header=0,
    index_col=0,
)
# Discard the first row below the header.
# NOTE(review): presumably that row holds a second, descriptive header line
# rather than a real observation -- confirm against the CSV.
data = data.iloc[1:]
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [2]:
# Convert every column to an integer dtype so the value codes can be compared
# and aggregated numerically.
data = data.astype(dtype=int)
# Print per-column dtypes and non-null counts to verify the conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
X1     30000 non-null int64
X2     30000 non-null int64
X3     30000 non-null int64
X4     30000 non-null int64
X5     30000 non-null int64
X6     30000 non-null int64
X7     30000 non-null int64
X8     30000 non-null int64
X9     30000 non-null int64
X10    30000 non-null int64
X11    30000 non-null int64
X12    30000 non-null int64
X13    30000 non-null int64
X14    30000 non-null int64
X15    30000 non-null int64
X16    30000 non-null int64
X17    30000 non-null int64
X18    30000 non-null int64
X19    30000 non-null int64
X20    30000 non-null int64
X21    30000 non-null int64
X22    30000 non-null int64
X23    30000 non-null int64
Y      30000 non-null int64
dtypes: int64(24)
memory usage: 5.7+ MB


In [3]:
### Finding unique values in columns
# Print the distinct codes of each inspected column, one array per line,
# in the order the columns are listed here.
for column in ["X2", "X3", "X4", "X6", "X7", "X8", "X9", "X10", "X11"]:
    print(data[column].unique())

[2 1]
[2 1 3 5 4 6 0]
[1 2 3 0]
[ 2 -1  0 -2  1  3  4  8  7  5  6]
[ 2  0 -1 -2  3  5  7  4  1  6  8]
[-1  0  2 -2  3  4  6  7  1  5  8]
[-1  0 -2  2  3  4  5  7  6  1  8]
[-2  0 -1  2  3  5  4  7  8  6]
[-2  2  0 -1  3  6  4  7  8  5]


- __It seems that some columns contain the values `0` and `-2`, and that both `-2` and `0` represent missing values. `0` appears a large number of times in variables `X6` through `X11`.__

In [4]:
### Imputing missing values: `0` and `-2` (plus `5` and `6` in `X3`)
# Codes treated as missing, listed per column in the order they are replaced.
codes_to_impute = {
    "X3": [0, 5, 6],
    "X4": [0],
    "X6": [0, -2],
    "X7": [0, -2],
    "X8": [0, -2],
    "X9": [0, -2],
    "X10": [0, -2],
    "X11": [0, -2],
}

# Replace each code with the median of ITS OWN column. Driving this from the
# table above fixes a copy-paste slip where the `-2` entries of X9 were filled
# with the median of X8.
# The median is recomputed before every replacement, so a later code in the
# list sees the column as already modified by the earlier ones.
# NOTE(review): the median is taken over the column as it stands, codes
# included. The preview printed after this cell still shows 0 in X6-X9, which
# suggests the median of those columns is itself 0 and that replacing `0` is a
# no-op there -- confirm whether the median should use valid codes only.
for column, codes in codes_to_impute.items():
    for code in codes:
        data[column] = data[column].replace(code, data[column].median())

In [5]:
# Rechecking the data: preview the first five rows after the imputation step
# so the replaced codes can be compared with the preview shown right after loading.
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
1,20000,2,2,1,24,2,2,-1,-1,0,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [18]:
## Exporting the dataframe after imputing the missing values

# Write the frame (index included) as comma-separated text to "Good.csv" in
# the current working directory.
# `sep` is passed by keyword: pandas 2.x deprecates positional arguments to
# `to_csv` other than the path, and they become keyword-only in pandas 3.0.
data.to_csv("Good.csv", sep=",")