In [10]:
"""
Import needed packages
"""
import numpy as np 
import pandas as pd 
import os 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import TruncatedSVD 

In [11]:
"""
Specify the directories for data. These are universal for our repository-
this is the only thing you should not change.
"""

CLEAN_DATA_DIR = './clean_data'

CLEAN_DATA_FILE = '/clean_credit_data.csv'

# CLEAN_DATA_FILE carries its own leading '/', so plain concatenation gives
# './clean_data/clean_credit_data.csv'.
CLEAN_DATA_PATH = CLEAN_DATA_DIR + CLEAN_DATA_FILE

# The sample dataset summarizes the usage behavior of about
# 9000 active credit card holders during the last 6 months.

# Fail fast, and name the offending path, if the cleaned data file is missing.
if os.path.isfile(CLEAN_DATA_PATH):
    print('CLEAN_DATA_PATH is a valid path')
else:
    raise ValueError('CLEAN_DATA_PATH is not a valid path: ' + CLEAN_DATA_PATH)

CLEAN_DATA_PATH is a valid path


In [12]:
# TODO: the correct customer ids should become the index. Save them to the csv
# together with the cleaned data, then read them back in as the index here.
data_clean = pd.read_csv(
    CLEAN_DATA_PATH,
    index_col=False,  # do not use the first csv column as the index
)

In [13]:

"""
Rescale the data so that each feature has range [0,1]. 
This is called 'min-max scaling' (different from standardization, which rescales each feature to mean zero and variance 1). 
"""

# Min-max scale every column: subtract the column minimum and divide by the
# column range (max - min).
col_min = data_clean.min()
col_range = data_clean.max() - col_min
# A constant column has zero range; dividing by it would fill the column with
# NaN. Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)
data_clean = (data_clean - col_min) / col_range
print(data_clean.head(10))

    BALANCE  PURCHASES  ONEOFF_PURCHASES  INSTALLMENTS_PURCHASES  \
0  0.002148   0.001945          0.000000                0.004240   
1  0.168169   0.000000          0.000000                0.000000   
2  0.131026   0.015766          0.018968                0.000000   
3  0.042940   0.000326          0.000393                0.000000   
4  0.095038   0.027188          0.000000                0.059257   
5  0.032939   0.144598          0.157076                0.030595   
6  0.095764   0.008895          0.000000                0.019387   
7  0.053296   0.017567          0.016228                0.008889   
8  0.007994   0.026134          0.031442                0.000000   
9  0.067905   0.018763          0.000000                0.040894   

   CASH_ADVANCE  CREDIT_LIMIT  PAYMENTS  MINIMUM_PAYMENTS  PRC_FULL_PAYMENT  
0      0.000000      0.031720  0.003978          0.001826          0.000000  
1      0.136685      0.232053  0.080892          0.014034          0.222222  
2      0.000000  

In [25]:
""" 
Run SVD on dataset to obtain dimension reduced data. 
Check the percentage variance explained by components and use only those needed
to explain a desired quantity of variance 
"""
# Rank of the initial approximation, and the cumulative variance threshold
# (in percent) used to choose the final number of components.
start_rank = 7
var_threshold = 95

# The KMeans cell attaches a 'cluster' label column to data_clean. Exclude it
# here so that re-running this cell never feeds cluster labels into the SVD.
svd_input = data_clean.drop(columns=['cluster'], errors='ignore')

# Make an instance of the SVD class from scikit-learn. random_state is fixed,
# as in the KMeans and GMM cells, so repeated runs give the same components.
SVD = TruncatedSVD(n_components=start_rank, random_state=0)
# Run the decomposition
SVD.fit(svd_input)

# Cumulative percentage of variance explained by the first k components
var = SVD.explained_variance_ratio_
total_var = 100 * np.cumsum(var)
print('% Var Explained In First '+str(start_rank)+' Components',total_var)

# Smallest number of components explaining at least var_threshold percent.
reached = np.flatnonzero(total_var >= var_threshold)
if reached.size == 0:
    print('No quantity of components leq to '+str(start_rank)+' can explain '+str(var_threshold)+'% variance')
    # Threshold not reached: keep every computed component rather than
    # truncating to zero columns, which would leave nothing to cluster on.
    rank = len(total_var)
else:
    rank = int(reached[0]) + 1
    print(str(total_var[rank-1])+'% variance '+'explained by '+str(rank)+' components')

# Calculate the projection up to start_rank, then truncate to only the
# components needed to explain the variance. X must be assigned before it is
# printed, otherwise the cell fails on a fresh kernel.
X = SVD.transform(svd_input)
print(X)
Xred = X[:, :rank]

# Convert projected data to DataFrame format from np.array
Xred_pd = pd.DataFrame(Xred, columns=['Singular Component '+str(i+1) for i in range(rank)])
Xred_pd.head(10)


% Var Explained In First 7 Components [62.73747636 88.54310923 92.77752916 96.24256865 98.02367675 98.93873753
 99.54676156]
96.24256864817026% variance explained by 4 components
[[ 1.09656143e-02  2.48441818e-02  1.62593615e-02 ...  1.11540701e-04
   3.55875431e-03  2.03083867e-03]
 [ 3.07588770e-01  2.19634848e-01 -5.46089399e-02 ...  9.74957877e-02
   5.99131860e-03 -7.82885556e-03]
 [ 9.38009606e-02  2.55603961e-01  4.05479525e-02 ... -2.89636202e-02
  -2.11265627e-02  9.30294807e-04]
 ...
 [ 2.44571775e-01 -5.80476121e-02 -1.55340069e-02 ... -3.73604828e-03
   9.36112789e-04 -5.14269382e-04]
 [ 2.38672283e-01 -7.08264264e-02 -2.54027991e-02 ... -9.53922328e-04
  -3.86309068e-03  1.80786140e-04]
 [ 1.67295967e-02  4.28246693e-02  1.30442129e-02 ... -1.20752395e-02
  -1.73837123e-02 -1.37045552e-03]]


Unnamed: 0,Singular Component 1,Singular Component 2,Singular Component 3,Singular Component 4
0,0.010966,0.024844,0.016259,-0.005681
1,0.307589,0.219635,-0.054609,-0.024173
2,0.093801,0.255604,0.040548,-0.052467
3,0.017333,0.054126,-0.01117,-0.002788
4,0.034871,0.108622,-0.032547,0.034559
5,1.108746,0.054539,0.144473,0.059051
6,0.03541,0.112478,-0.028551,-0.002098
7,0.081934,0.199959,0.091647,-0.04718
8,0.121626,0.270934,0.201903,-0.07947
9,0.023948,0.075755,-0.025987,0.026185


In [33]:
"""
Run a single kmeans clustering- define all items desired before iterating to obtain optimal clustering 
"""

# Number of clusters for this single trial run.
K = 10
kmeans = KMeans(n_clusters=K, random_state=0)
kmeans.fit(Xred)
centers = kmeans.cluster_centers_

# Get the predicted cluster label for every row of the reduced data.
Xred_labels = kmeans.predict(Xred)

# Attach the labels to the reduced data and push them back to the original
# data. Only the last expression of a cell is displayed, so a single preview
# is kept at the end.
Xred_pd['cluster'] = Xred_labels
data_clean['cluster'] = Xred_labels
data_clean.head(10)


Unnamed: 0,BALANCE,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,cluster
0,0.002148,0.001945,0.0,0.00424,0.0,0.03172,0.003978,0.001826,0.0,5
1,0.168169,0.0,0.0,0.0,0.136685,0.232053,0.080892,0.014034,0.222222,3
2,0.131026,0.015766,0.018968,0.0,0.0,0.248748,0.012263,0.00821,0.0,7
3,0.04294,0.000326,0.000393,0.0,0.0,0.038397,0.013373,0.003204,0.0,5
4,0.095038,0.027188,0.0,0.059257,0.0,0.058431,0.027602,0.031506,0.0,5
5,0.032939,0.144598,0.157076,0.030595,0.0,0.449082,0.125278,0.002592,1.0,6
6,0.095764,0.008895,0.0,0.019387,0.0,0.075125,0.013387,0.006963,0.0,5
7,0.053296,0.017567,0.016228,0.008889,0.0,0.232053,0.013569,0.004083,0.0,7
8,0.007994,0.026134,0.031442,0.0,0.0,0.365609,0.022963,0.001312,0.0,7
9,0.067905,0.018763,0.0,0.040894,0.0,0.038397,0.021357,0.028436,0.0,5


In [34]:
"""
Run a single GMM model before iterating to obtain optimal model 
"""

# Number of mixture components for this single trial run.
M = 10
# NOTE(review): this fits on X (every component returned by SVD.transform),
# whereas the KMeans cell fits on the truncated Xred. Confirm which input is
# intended.
gm = GaussianMixture(n_components=M, random_state=0)
gm.fit(X)
gm_centers = gm.means_
print(gm_centers)


[[ 4.47939501e-01 -7.82650513e-02 -7.99154421e-03 -2.66855126e-02
  -4.97579970e-03  3.75521204e-03 -8.00045762e-06]
 [ 6.72535459e-02  1.92147748e-01  1.90264911e-03 -2.04745185e-02
  -7.44498339e-03 -2.22115999e-03 -2.31072508e-03]
 [ 5.92827453e-01 -3.33323032e-02  3.72411261e-02 -3.47013196e-02
  -6.78173421e-03  3.18001907e-03  1.68866572e-03]
 [ 1.74447678e-01  9.50911131e-02  1.61238328e-02  2.47391299e-02
   1.10982367e-02  4.23024487e-03 -4.36692342e-04]
 [ 3.01507047e-02  9.28443483e-02 -1.52630087e-02 -1.26789282e-02
  -7.40644899e-04 -3.17156399e-03 -1.56798916e-03]
 [ 1.54006583e-01  4.30564149e-02 -1.76275437e-02  3.74651130e-03
   3.97283639e-02  1.41902395e-03  1.47654785e-03]
 [ 1.05857437e-01  3.10868245e-01 -2.77171976e-02  1.92778855e-02
  -2.43608335e-02  1.20397660e-02  1.56457166e-02]
 [ 4.28815810e-01  2.85247676e-01  3.99175461e-02  2.06797598e-01
   3.23069370e-02  2.08175931e-02  8.19236319e-03]
 [ 2.96211069e-01  2.34508046e-01 -1.67894844e-02  8.60018444e-0