In [25]:
"""
Import needed packages
"""

import numpy as np 
import pandas as pd 
import os 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import TruncatedSVD 


In [14]:
"""
Specify the directories for data. These are universal for our repository-
this is the only thing you should not change.
"""

CLEAN_DATA_DIR = './clean_data'

CLEAN_DATA_FILE = '/clean_credit_data.csv'

CLEAN_DATA_PATH = CLEAN_DATA_DIR + CLEAN_DATA_FILE

"""
The sample Dataset summarizes the usage behavior of about
9000 active credit card holders during the last 6 months
"""
os.listdir(CLEAN_DATA_DIR)

if os.path.isfile(CLEAN_DATA_PATH): 
    print('CLEAN_DATA_PATH is a valid path')
else:
    raise ValueError('DATA_PATH is not valid path')

CLEAN_DATA_PATH is a valid path


In [19]:
data_clean = pd.read_csv(CLEAN_DATA_PATH,index_col = False)
#Must set the correct customer indices as the index, and save to csv with these indices. Then read this in to the csv 
#as the index 

In [26]:

"""
Rescale the data so that each feature has range [0,1]. 
This is called 'standardization' (different from normalization, which is mean zero variance 1). 
"""

data_clean = (data_clean - data_clean.min())/(data_clean.max()-data_clean.min())
print(data_clean.head(10))

    BALANCE  PURCHASES  ONEOFF_PURCHASES  INSTALLMENTS_PURCHASES  \
0  0.002148   0.001945          0.000000                0.004240   
1  0.168169   0.000000          0.000000                0.000000   
2  0.131026   0.015766          0.018968                0.000000   
3  0.042940   0.000326          0.000393                0.000000   
4  0.095038   0.027188          0.000000                0.059257   
5  0.032939   0.144598          0.157076                0.030595   
6  0.095764   0.008895          0.000000                0.019387   
7  0.053296   0.017567          0.016228                0.008889   
8  0.007994   0.026134          0.031442                0.000000   
9  0.067905   0.018763          0.000000                0.040894   

   CASH_ADVANCE  CREDIT_LIMIT  PAYMENTS  MINIMUM_PAYMENTS  PRC_FULL_PAYMENT  
0      0.000000      0.031720  0.003978          0.001826          0.000000  
1      0.136685      0.232053  0.080892          0.014034          0.222222  
2      0.000000  

In [62]:
""" 
Run SVD on Dataset to Obtain Output. Check the percentage variance explained by components before using. 
"""
#Decide rank of initial approximation desired, and the variance threshold used to choose the 
#final number of components 
start_rank = 5
var_threshold = 95 


#Make instance of SVD class from ScikitLearn
SVD = TruncatedSVD(n_components = start_rank)
#Runt the SVD 
SVD.fit(data_clean)
var = SVD.explained_variance_ratio_
total_var = np.array([100*np.sum(var[0:k+1]) for k in range(len(var))])
print('% Var Explained In First '+str(start_rank)+' Components',total_var)

rank = 0
#Get rank which exceeds threshold 
for i in range(len(total_var)):
    if total_var[i] >= var_threshold:
        rank = i+1 
    else:
        pass 
if rank == 0: 
    print('No quantity of components leq to '+str(start_rank)+' can explain '+str(var_threshold)+'% variance')
else:
    print(str(total_var[rank-1])+'% variance '+'explained by '+str(rank)+' components')
    
data_s = SVD.transform(data_clean)


% Var Explained In First 5 Components [62.73747636 88.54310923 92.77752916 96.24256865 98.02367675]
98.02367675381996% variance explained by 5 components
