Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import os
%matplotlib inline 

from os import chdir
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Introduction

Data Import

In [None]:
# Folder that holds the downloaded data set. Edit this assignment if the CSV
# lives somewhere other than the current working directory.
path = os.getcwd()

# Work from that folder so the relative file name below resolves against it.
chdir(path)

# header=1: the second line of the file supplies the column names.
data = pd.read_csv('CreditCard_train.csv', header=1)

Data Transformation and Exploration

In [None]:
# checking we have the correct data columns and glancing over the data:
# describe() returns summary statistics per column as the cell output
data.describe()

In [None]:
# Let's do a little data visualisation.
# Collect the non-object feature columns, leaving out the row identifier and
# the target. The target is excluded by name rather than by deleting the last
# list entry, so the result does not depend on the target being the final column.
target_col = 'default payment next month'
cols = [c for c in data.columns
        if data.dtypes[c] != "object" and c not in ('ID', target_col)]
print(cols)

# Distribution plot of every feature: one panel per feature, one hue per target class.
f = pd.melt(data, id_vars=target_col, value_vars=cols)
g = sns.FacetGrid(f, hue=target_col, col="variable", col_wrap=5, sharex=False, sharey=False)
g = g.map(sns.distplot, "value", kde=True).add_legend()

In [None]:
# Pairwise correlation between the numeric columns, drawn as a heatmap.
# Selecting numeric dtypes explicitly keeps corr() from failing on non-numeric
# columns under pandas versions that no longer drop them silently.
corr = data.select_dtypes(include="number").corr()

# seaborn is already imported as sns in the setup cell, so no extra import is needed here.
sns.heatmap(corr)

plt.show()

In [None]:
# Separate the monetary amount columns (log-transformed below) from the
# remaining encoded columns.
amount_cols = [prefix + str(ii) for prefix in ("PAY_AMT", "BILL_AMT") for ii in range(1, 7)]
dropped = {"LIMIT_BAL", "AGE", *amount_cols}

# qual_Enc is deliberately the SAME list object as cols: the countplot cell
# further down slices cols and relies on these columns having been removed.
# Filtering in place (instead of calling list.remove per column) keeps that
# behaviour and makes the cell safe to run more than once.
cols[:] = [c for c in cols if c not in dropped]
qual_Enc = cols

logged = []
for name in amount_cols:
    amounts = data[name]
    # log1p of the strictly positive amounts; every other value
    # (zero, negative, missing) is mapped to 0 before the transform, giving 0.
    data["log_" + name] = np.log1p(amounts.where(amounts > 0, 0))
    logged.append("log_" + name)

# Distribution of each log-transformed amount: one panel per column, one hue per target class.
f = pd.melt(data, id_vars='default payment next month', value_vars=logged)
g = sns.FacetGrid(f, hue='default payment next month', col="variable", col_wrap=3, sharex=False, sharey=False)
g = g.map(sns.distplot, "value", kde=True).add_legend()

In [None]:
# Class balance of the target: number of rows per target value.
target_counts = data["default payment next month"].value_counts()
df = pd.DataFrame({'default payment next month': target_counts.index,
                   'values': target_counts.values})

plt.figure(figsize=(6, 6))
# NOTE(review): the legend follows the UCI "default of credit card clients"
# coding (1 = default, 0 = no default); the previous title had it inverted.
# Confirm against the data dictionary shipped with the CSV.
plt.title('Default Credit Card Clients - target value - data unbalance\n (Not Default = 0, Default = 1)')
sns.set_color_codes("pastel")
sns.barplot(x='default payment next month', y="values", data=df)
plt.show()

In [None]:
# Same class balance, counted directly from the raw rows.
plt.figure(figsize=(6, 6))
# NOTE(review): the legend follows the UCI "default of credit card clients"
# coding (1 = default, 0 = no default); the previous title had it inverted.
# Confirm against the data dictionary shipped with the CSV.
plt.title('Default Credit Card Clients \n (Not Default = 0, Default = 1)')
ax = sns.countplot(x="default payment next month", data=data)

In [None]:
# Countplots for every column of cols from the fourth entry onward, on a 3x3 grid.
# NOTE: cols is trimmed in place by the log-transform cell above (qual_Enc is
# the same list object), so that cell has to run first; with the untrimmed
# list, cols[3:] would likely hold more panels than the 3x3 grid allows.
plt.figure(figsize=(15, 10))
for i, c in enumerate(cols[3:], start=1):
    plt.subplot(3, 3, i)
    ax = sns.countplot(x=c, data=data)

# Adjust the layout once, after all panels exist, instead of after every panel.
plt.tight_layout()

From the countplots we can observe that the repayment status columns contain two undocumented values: -2 and 0. Strictly speaking, we should treat these values as "NA" (Not Applicable). There are several ways to deal with this situation:

1. Remove observations
We could delete these observations and ignore them, but given how many of them there are, the loss of data would be too great.
2. Replace NAs with other values

For now I think the best idea is to keep those observations as "NAs" and see how this affects the predicted output.


In [None]:
# Social status (SEX, EDUCATION, MARRIAGE): one countplot per column.

# Set the style once, before the axes are created, so it applies to this figure.
sns.set(style="darkgrid")

figs, axs = plt.subplots(1, 3, figsize=(15, 10))
axs = axs.ravel()

# Columns are selected by name rather than by position in the frame.
for panel, col in zip(axs, ["SEX", "EDUCATION", "MARRIAGE"]):
    sns.countplot(x=col, data=data, ax=panel)
plt.show()


We summarise sex, marriage and education as the social status factors. Again we notice that there are observations with several undocumented values, for example 0, 5 and 6 for education. As stated before, we could classify those as NA and keep them as they are for now.

In [None]:
# Age count: number of rows for each age value.
plt.figure(figsize=(12, 8))
# The column is referred to by name, as in the scatter plot cell below.
col = "AGE"
sns.set(style="darkgrid")
ax = sns.countplot(x=col, data=data)
plt.show()

In [None]:
# Age scatter: AGE plotted against ID, with each point coloured by its AGE value.
ax2 = data.plot(kind='scatter', x='ID', y='AGE', c='AGE', cmap='viridis')
plt.show()

Looking at the scatterplot, the strange thing is that for indices above 15000 the data seems to follow some kind of periodic pattern; therefore we have drawn further histograms to analyse the structure of the data.

In [None]:
# Comparing 2 groups: age histograms for the rows before and after position 15000.
SPLIT_ROW = 15000  # row position at which the frame is cut into the two groups

# col is also read by the boxplot cells that follow this one.
col = "AGE"
# One bin per distinct value found in the whole column, shared by both panels.
distinct = len(data[col].unique())

figs, axs = plt.subplots(1, 2, figsize=(12, 10))
groups = [
    ("Before 15000 ", data[col][:SPLIT_ROW]),
    ("After 15000 ", data[col][SPLIT_ROW:]),
]
for panel, (title, values) in zip(axs, groups):
    panel.hist(values, bins=distinct, alpha=0.7)
    panel.grid(axis='y', alpha=0.75)
    panel.set_title(title)

plt.show()

In [None]:
# Boxplot of the first 15000 rows of the column held in col (set in the histogram cell above).
# The original line was missing the "." before set_title and did not parse.
sns.boxplot(x=data[col][:15000]).set_title("Before 15000 ")


In [None]:
# Boxplot of the rows from position 15000 onward, for the column held in col
# (set in the histogram cell above).
later_rows = data[col][15000:]
sns.boxplot(x=later_rows).set_title("After 15000 ")

 Both parts of the data have the same structure and density, therefore the second part of the data is valid.

In [None]:
# Countplots for the six columns at positions 11-16 of the frame, on a 3x2 grid.
# NOTE(review): the columns are picked purely by position; confirm that
# positions 11-16 are the intended columns (and that countplot suits them),
# since the range may be off by one relative to a single column family.
sns.set(style="darkgrid")

figs, axs = plt.subplots(3, 2, figsize=(12, 10))
axs = axs.ravel()

for panel, col in zip(axs, data.columns[11:17]):
    sns.countplot(x=str(col), data=data, ax=panel)
plt.show()

Methodology Overview

Model training/validation

Results

Final predictions on test set