## Lab 1 Exploring datasets

In this lab, you will explore a dataset and visualize it with several different plotting methods.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option("display.max_columns",100)
import warnings
warnings.simplefilter('ignore')

## Data overview

This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. 

- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
- SEX: Gender (1=male, 2=female)
- EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others)
- MARRIAGE: Marital status (1=married, 2=single, 3=others)
- AGE: Age in years
- PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- default.payment.next.month: Whether the client defaulted on the payment in the following month (1=yes, 0=no)

### Load data

In [None]:
# Location of the Excel workbook, relative to the notebook.
filepath = './default_of_credit_card_clients.xls'

# header=1: take the column names from the second row of the sheet.
data = pd.read_excel(io=filepath, header=1)

# Lower-case every column name so later cells can use names such as 'bill_amt6'.
lowercase_names = data.columns.str.lower()
data.columns = lowercase_names

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Display pandas' descriptive summary statistics for the columns of `data`.
data.describe()

In [None]:
# Shorten the target column name to 'default'; every later cell refers to it by that name.
target_rename = {'default payment next month': 'default'}
data = data.rename(columns=target_rename)

# Column dtypes and non-null counts.
data.info()

<div class='alert alert-block alert-warning' style='font-size:150%'>
    
### Bar plot

</div>

In [None]:
# Number of clients in each default class, as a two-column frame: 'default', 'count'.
g_default = data.groupby('default').size().reset_index(name='count')
print(g_default)

# One bar per class, drawn with plain matplotlib.
fig, ax = plt.subplots()
ax.bar(
    g_default['default'],
    g_default['count'],
    width=0.7,
    color=['g', 'b'],
    alpha=0.5,
)
ax.set(
    title='Distribution of default payment',
    xlabel='Default payment',
    ylabel='count',
)
ax.set_xticks([0, 1])

# Write each count halfway up its bar, nudged left so the text is roughly centred.
for position, count in enumerate(g_default['count']):
    ax.text(position - 0.08, count / 2, str(count), color='black', fontweight='bold')

In [None]:
# Same bar chart drawn by seaborn; the labels reuse the counts in `g_default`
# computed in the matplotlib bar-plot cell.
ax = sns.countplot(data=data, x='default', alpha=0.5)
for position, count in enumerate(g_default['count']):
    ax.text(position - 0.07, count / 2, str(count), color='black', fontweight='bold')

In [None]:
# Client counts per sex, with one bar per education level inside each sex group.
ax = sns.countplot(data=data, x='sex', hue='education')

<div class='alert alert-block alert-success' style="font-weight:bolder">

### Task 1

### 1a - Bar plot count of each education level per class. 
    
### 1b - Bar plot count of each marriage level per class.

</div>

In [None]:
# please implement task 1 here

# Task 1a: count of each education level within each default class.
fig, ax = plt.subplots()
sns.countplot(x='default', hue='education', data=data, ax=ax)
ax.set_title('Education level per default class')

# Task 1b: count of each marital status within each default class.
# Drawn on its own figure so it does not overwrite the education plot.
fig, ax = plt.subplots()
sns.countplot(x='default', hue='marriage', data=data, ax=ax)
ax.set_title('Marital status per default class')


<div class='alert alert-block alert-warning' style='font-size:150%'>
    
### Histogram

</div>

### Histogram of age with matplotlib and seaborn

In [None]:
# Histogram of age with matplotlib

# Normalised (density) histogram of age: 30 bins, white bars with blue outlines.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(
    data['age'],
    bins=30,
    density=True,
    facecolor='w',
    edgecolor='b',
    alpha=0.7,
)

In [None]:
# Histogram of age with seaborn

# Bar styling forwarded to the underlying matplotlib histogram.
# NOTE(review): distplot is deprecated in recent seaborn releases - consider
# histplot/displot once the course environment is confirmed to support them.
bar_style = {"edgecolor": 'b', "facecolor": 'w'}
sns.distplot(a=data['age'], bins=30, hist_kws=bar_style)

In [None]:
# Histogram of age for each class on same figure

fig, ax = plt.subplots()

# Overlay both classes on the current axes: yellow for class 0, red for class 1.
for class_label, bar_colour in ((0, 'y'), (1, 'r')):
    ages = data.loc[data['default'] == class_label, 'age']
    sns.distplot(ages, bins=30, hist_kws={"facecolor": bar_colour})

In [None]:
# Histogram of age for each class on separate axes

fig, axs = plt.subplots(nrows=1, ncols=2)

# Left panel: class 0 in yellow; right panel: class 1 in red.
for panel, (class_label, bar_colour) in zip(axs, ((0, 'y'), (1, 'r'))):
    ages = data.loc[data['default'] == class_label, 'age']
    sns.distplot(ages, bins=30, hist_kws={"facecolor": bar_colour}, ax=panel)

<div class='alert alert-block alert-success' style="font-weight:bolder">

### Task 2

### Plot the histogram of education for each class on separate axes

Hint: There are 7 categorical levels for education. Use subplots of size (2,4) and plot the histograms inside a for loop.

</div>

In [None]:
# please implement task 2 here

# One age histogram per education level, laid out on a 2x4 grid.
# Levels are read from the data instead of hard-coding range(7), so a level
# with no rows is never plotted.
education_levels = sorted(data['education'].unique())

fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
panels = axs.flat

for panel, level in zip(panels, education_levels):
    sns.distplot(
        data.loc[data['education'] == level, 'age'],
        bins=30,
        hist_kws={"facecolor": 'y'},
        ax=panel,
    )
    # Without a title the panels cannot be told apart.
    panel.set_title(f'education = {level}')

# Hide grid cells that have no level to show (the 8th cell for 7 levels).
for unused in axs.flat[len(education_levels):]:
    unused.set_visible(False)

fig.tight_layout()



<div class='alert alert-block alert-warning' style='font-size:150%'>

### Scatter Plot



</div>

### Scatter plot with matplotlib and seaborn

In [None]:
# Scatter plot of "bill_amt6" vs "pay_amt5" with Matplotlib

# Each point is one client; the colour encodes the default class (0/1).
fig, ax = plt.subplots()
ax.scatter(data['bill_amt6'], data['pay_amt5'], c=data['default'], alpha=0.5)
ax.set_xlabel('bill_amt6 (April)')
# The y axis was previously unlabeled; pay_amt5 is the May payment.
ax.set_ylabel('pay_amt5 (May)')
ax.set_title('bill_amt6 vs pay_amt5')
# Bill amounts are large numbers; rotate the tick labels so they do not overlap.
ax.tick_params(axis='x', rotation=45)

In [None]:
# Scatter plot of "bill_amt6" vs "pay_amt5 with Seaborn

# The default labels are passed as a plain list, which seaborn uses for colouring.
default_labels = data['default'].tolist()
sns.scatterplot(data=data, x='bill_amt6', y='pay_amt5', hue=default_labels)

<div class='alert alert-block alert-success' style="font-weight:bolder">

### Task 3
    
### Plot the scatter for bill_amt and pay_amt for other months, within a for loop


</div>

In [None]:
# please implement task 3 here

# The task does not say which bill month pairs with which payment month,
# so draw every (bill month, payment month) combination on a 6x6 grid:
# rows follow bill_amt1..6, columns follow pay_amt1..6.
fig, axs = plt.subplots(nrows=6, ncols=6, figsize=(36, 36))
default_labels = data.default.tolist()

for row in range(6):
    for col in range(6):
        sns.scatterplot(
            data=data,
            x=f'bill_amt{row + 1}',
            y=f'pay_amt{col + 1}',
            hue=default_labels,
            ax=axs[row][col],
        )

<div class='alert alert-block alert-warning' style='font-size:150%'>

### Pair Plot


</div>

### Using a pair plot to illustrate the relationships between the bill amounts for each default class.

In [None]:
# Monthly bill-statement columns (bill_amt1 ... bill_amt6).
# Match on the 'bill_amt' prefix rather than the substring 'bill', so the derived
# 'bill6_pay5' column added in the grouped-boxplot cell is not picked up when this
# cell is re-run later in the session.
bill_cols = [col for col in data.columns if col.startswith('bill_amt')]

In [None]:
# Pairwise scatter plots of the bill columns, coloured by default class.
pair_columns = [*bill_cols, 'default']
sns.pairplot(data=data[pair_columns], hue='default')

<div class='alert alert-block alert-warning' style='font-size:150%'>

### Box plot


</div>

In [None]:
# Box plot of age feature

# Single matplotlib box summarising the age column.
fig, ax = plt.subplots()
ax.boxplot(x=data['age'])
ax.set(title='Box plot for age feature')

In [None]:
### Box plot of limit_bal feature

# Single matplotlib box summarising the credit-limit column.
fig, ax = plt.subplots()
ax.boxplot(x=data['limit_bal'])
ax.set(title='Box plot for limit_bal feature')

### Grouped boxplot

In [None]:
# Derived feature: April bill minus May payment, stored as a new column on `data`.
data['bill6_pay5'] = data['bill_amt6'].sub(data['pay_amt5'])

# One box per education level; outliers are hidden so the boxes stay readable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x='education', y='bill6_pay5', showfliers=False, ax=ax)

In [None]:
# Same grouped box plot, with each education level split by default class.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=data,
    x='education',
    y='bill6_pay5',
    hue='default',
    showfliers=False,
    ax=ax,
)

<div class='alert alert-block alert-success' style="font-weight:bolder">

### Task 4

### 4a - Plot boxplot for 'default' and 'bill_amt' for all months

### 4b - Plot boxplot for 'default' and difference between 'bill_amt_n' and 'pay_amt_(n-1)' for all months

</div>

In [None]:
# please implement task 4 here

# Column names for the six months; index 0 is September, index 5 is April.
bill_amt = [f'bill_amt{i}' for i in range(1,7)]
pay_amt = [f'pay_amt{i}' for i in range(1,7)]
months = ['September','August','July','June','May','April']

# task 4a
# Reshape to long form (one row per client per month) so seaborn can group
# by month on the x axis and by default class through `hue`.
bill_long = data.melt(
    id_vars='default',
    value_vars=bill_amt,
    var_name='month',
    value_name='bill_amt',
)
# Replace 'bill_amtN' with the month name for readable tick labels.
bill_long['month'] = bill_long['month'].map(dict(zip(bill_amt, months)))

fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(
    x='month',
    y='bill_amt',
    hue='default',
    data=bill_long,
    order=months,
    showfliers=False,
    ax=ax,
)
ax.set_title('Bill amount per month by default class')

# task 4b
# Difference bill_amt{n} - pay_amt{n-1} for n = 2..6, following the
# bill6_pay5 = bill_amt6 - pay_amt5 convention used in the grouped-boxplot cell.
# The differences go into a separate frame so `data` is not modified.
diff_cols = {}
for n, (bill, pay) in enumerate(zip(bill_amt[1:], pay_amt[:-1]), start=2):
    diff_cols[f'bill{n}_pay{n-1}'] = data[bill] - data[pay]

diff_long = (
    pd.DataFrame(diff_cols)
    .assign(default=data['default'])
    .melt(id_vars='default', var_name='pair', value_name='bill_minus_pay')
)

fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(
    x='pair',
    y='bill_minus_pay',
    hue='default',
    data=diff_long,
    showfliers=False,
    ax=ax,
)
ax.set_title('bill_amt(n) - pay_amt(n-1) by default class')

<div class='alert alert-block alert-warning' style='font-size:150%'>

### Violin plot


</div>

In [None]:
# Seaborn: Violinplot of 'age' feature

# A single horizontal violin showing the distribution of age.
sns.violinplot(data=data, x='age')

In [None]:
# Split violins of bill6_pay5 per education level: one half of each violin per
# default class. Uses the 'bill6_pay5' column created in the grouped-boxplot cell.
fig, ax = plt.subplots(figsize=(10,5))
# `showextrema` was dropped: it is a parameter of matplotlib's Axes.violinplot,
# not of seaborn.violinplot, so it never had any effect here.
sns.violinplot(
    x='education',
    y='bill6_pay5',
    hue='default',
    data=data,
    scale='width',
    split=True,
    ax=ax,
)
# Clip the y range so the long tails do not squash the violins.
ax.set_ylim(-100000,300000)

<div class='alert alert-block alert-warning' style='font-size:150%'>

### 3D scatter plot


</div>

In [None]:
# The import is kept from the original cell; Axes3D itself is not referenced below.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of age, April bill and May payment, coloured by default class.
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    data['age'],
    data['bill_amt6'],
    data['pay_amt5'],
    c=data['default'],
)
# Set the camera to 30 degrees elevation and 0 degrees azimuth.
ax.view_init(elev=30, azim=0)
ax.set(xlabel='age', ylabel='bill_amt6', zlabel='pay_amt5')

<div class='alert alert-block alert-success' style="font-weight:bolder">

### Task 5

### Check 3d scatter plot for interesting patterns with different combination of features

</div>

In [None]:
# please implement task 5 here

