In [1]:
import numpy as np
import pandas as pd
import os, time, re
import pickle, gzip

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import matplotlib as mpl

%matplotlib inline

In [2]:
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import fastcluster
from scipy.cluster.hierarchy import dendrogram, cophenet, fcluster
from scipy.spatial.distance import pdist

In [3]:
file = 'D:\\Python\\Machine Learning\\Unsupervised Learning using python Book 4\\datasets\\LoanStats3a.csv'
data = pd.read_csv(file)

columnsToKeep = ['loan_amnt','funded_amnt','funded_amnt_inv','term', \
                 'int_rate','installment','grade','sub_grade', \
                 'emp_length','home_ownership','annual_inc', \
                 'verification_status','pymnt_plan','purpose', \
                 'addr_state','dti','delinq_2yrs','earliest_cr_line', \
                 'mths_since_last_delinq','mths_since_last_record', \
                 'open_acc','pub_rec','revol_bal','revol_util', \
                 'total_acc','initial_list_status','out_prncp', \
                 'out_prncp_inv','total_pymnt','total_pymnt_inv', \
                 'total_rec_prncp','total_rec_int','total_rec_late_fee', \
                 'recoveries','collection_recovery_fee','last_pymnt_d', \
                 'last_pymnt_amnt']

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data = data.loc[:, columnsToKeep]
data.shape

(42542, 37)

In [5]:
data.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt
0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,10+ years,RENT,...,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,Jan-15,171.62
1,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,< 1 year,RENT,...,0.0,1014.53,1014.53,456.46,435.17,0.0,122.9,1.11,Apr-13,119.66
2,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,10+ years,RENT,...,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,Jun-14,649.91
3,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,10+ years,RENT,...,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,Jan-15,357.48
4,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,1 year,RENT,...,0.0,4066.908161,4066.91,3000.0,1066.91,0.0,0.0,0.0,Jan-17,67.3


In [6]:
data[['term', 'int_rate', 'emp_length', 'revol_util']].head()

Unnamed: 0,term,int_rate,emp_length,revol_util
0,36 months,10.65%,10+ years,83.70%
1,60 months,15.27%,< 1 year,9.40%
2,36 months,15.96%,10+ years,98.50%
3,36 months,13.49%,10+ years,21%
4,60 months,12.69%,1 year,53.90%


In [7]:
for i in ['term', 'int_rate', 'emp_length', 'revol_util']:
    data.loc[:, i] = data.loc[:, i].apply(lambda x: re.sub("[^0-9]", "", str(x)))
    data.loc[:, i] = pd.to_numeric(data.loc[:, i])

In [8]:
data[['term', 'int_rate', 'emp_length', 'revol_util']].head()

Unnamed: 0,term,int_rate,emp_length,revol_util
0,36.0,1065.0,10.0,8370.0
1,60.0,1527.0,1.0,940.0
2,36.0,1596.0,10.0,9850.0
3,36.0,1349.0,10.0,21.0
4,60.0,1269.0,1.0,5390.0


In [9]:
numericalFeats = [x for x in data.columns if data[x].dtype != 'object']

nanCounter = np.isnan(data.loc[:, numericalFeats]).sum()
nanCounter

loan_amnt                      7
funded_amnt                    7
funded_amnt_inv                7
term                           7
int_rate                       7
installment                    7
emp_length                  1119
annual_inc                    11
dti                            7
delinq_2yrs                   36
mths_since_last_delinq     26933
mths_since_last_record     38891
open_acc                      36
pub_rec                       36
revol_bal                      7
revol_util                    97
total_acc                     36
out_prncp                      7
out_prncp_inv                  7
total_pymnt                    7
total_pymnt_inv                7
total_rec_prncp                7
total_rec_int                  7
total_rec_late_fee             7
recoveries                     7
collection_recovery_fee        7
last_pymnt_amnt                7
dtype: int64

In [10]:
fillWithMean = ['loan_amnt','funded_amnt','funded_amnt_inv','term', \
                'int_rate','installment','emp_length','annual_inc',\
                'dti','open_acc','revol_bal','revol_util','total_acc',\
                'out_prncp','out_prncp_inv','total_pymnt', \
                'total_pymnt_inv','total_rec_prncp','total_rec_int', \
                'last_pymnt_amnt']

fillWithZero = ['delinq_2yrs','mths_since_last_delinq', \
                'mths_since_last_record','pub_rec','total_rec_late_fee', \
                'recoveries','collection_recovery_fee']

from sklearn.impute import SimpleImputer

im = SimpleImputer(missing_values=np.nan, strategy='mean')
data.loc[:, fillWithMean] = im.fit_transform(data[fillWithMean])
data.loc[:, fillWithZero] = data.loc[:, fillWithZero].fillna(value=0, axis=1)

In [11]:
numericalFeats = [x for x in data.columns if data[x].dtype != 'object']

nanCounter = np.isnan(data.loc[:, numericalFeats]).sum()
nanCounter

loan_amnt                  0
funded_amnt                0
funded_amnt_inv            0
term                       0
int_rate                   0
installment                0
emp_length                 0
annual_inc                 0
dti                        0
delinq_2yrs                0
mths_since_last_delinq     0
mths_since_last_record     0
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util                 0
total_acc                  0
out_prncp                  0
out_prncp_inv              0
total_pymnt                0
total_pymnt_inv            0
total_rec_prncp            0
total_rec_int              0
total_rec_late_fee         0
recoveries                 0
collection_recovery_fee    0
last_pymnt_amnt            0
dtype: int64

In [12]:
data['installmentOverLoadAmnt'] = data.installment/data.loan_amnt
data['loanAmntOverIncome'] = data.loan_amnt/data.annual_inc
data['revol_balOverIncome'] = data.revol_bal/data.annual_inc
data['totalPymntOverIncome'] = data.total_pymnt/data.annual_inc
data['totalPymntInOverIncome'] = data.total_pymnt_inv/data.annual_inc
data['totalRecPrncpOverIncome'] = data.total_rec_prncp/data.annual_inc
data['totalRecIncOverIncome'] = data.total_rec_int/data.annual_inc

newFeats = ['installmentOverLoadAmnt', 'loanAmntOverIncome', 'revol_balOverIncome', 'totalPymntOverIncome', 
           'totalPymntInOverIncome', 'totalRecPrncpOverIncome', 'totalRecIncOverIncome']

In [13]:
numericalPlusNewFeats = numericalFeats + newFeats
X_train = data.loc[:, numericalPlusNewFeats]

sX = pp.StandardScaler()
X_train.loc[:, :] = sX.fit_transform(X_train)

In [19]:
labels = data.grade
labels.unique()

array(['B', 'C', 'A', 'E', 'F', 'D', 'G', nan], dtype=object)

In [15]:
labels = labels.fillna(value='Z')

lbl = pp.LabelEncoder()
lbl.fit(list(labels.values))
labels = pd.Series(data=lbl.transform(labels.values), name='grade')

In [16]:
y_train = labels

labelsOriginalVSNew = pd.concat([labels, data.grade], axis=1)
labelsOriginalVSNew

Unnamed: 0,grade,grade.1
0,1,B
1,2,C
2,2,C
3,2,C
4,1,B
...,...,...
42537,0,A
42538,7,
42539,7,
42540,7,


In [17]:
interestAndGrade = pd.DataFrame(data=[data.int_rate, labels])
interestAndGrade = interestAndGrade.T

interestAndGrade.groupby('grade').mean()

Unnamed: 0_level_0,int_rate
grade,Unnamed: 1_level_1
0.0,734.270844
1.0,1101.420857
2.0,1349.988902
3.0,1557.714927
4.0,1737.676783
5.0,1926.530361
6.0,2045.125
7.0,1216.501563


In [18]:
interestAndGrade = pd.DataFrame(data=[data.int_rate, labels])
interestAndGrade

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42532,42533,42534,42535,42536,42537,42538,42539,42540,42541
int_rate,1065.0,1527.0,1596.0,1349.0,1269.0,790.0,1596.0,1864.0,2128.0,1269.0,...,964.0,1028.0,964.0,933.0,838.0,775.0,1216.501563,1216.501563,1216.501563,1216.501563
grade,1.0,2.0,2.0,2.0,1.0,0.0,2.0,4.0,5.0,1.0,...,1.0,2.0,1.0,1.0,0.0,0.0,7.0,7.0,7.0,7.0


In [21]:
def analyzeCluster(clusterDF, labelsDF):
    countByCluster = pd.DataFrame(data=clusterDF['cluster'].value_counts())
    countByCluster.reset_index(inplace=True, drop=False)
    countByCluster.columns = ['cluster', 'clusterCount']
    
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
    
    countByLabel = pd.DataFrame(data=preds.groupby('trueLabel').count())
    
    countMostFreq = pd.DataFrame(data=preds.groupby('cluster').agg(lambda x: x.value_counts().iloc[0]))
    countMostFreq.reset_index(inplace=True, drop=False)
    countMostFreq.columns = ['cluster', 'countMostFrequent']
    
    accuracyDF = countMostFreq.merge(countByCluster, left_on='cluster', right_on='cluster')
    
    overallAccuracy = accuracyDF.countMostFrequent.sum() / accuracyDF.clusterCount.sum()
    
    accuracyByLabel = accuracyDF.countMostFrequent / accuracyDF.clusterCount
    
    return countByCluster, countByLabel, countMostFreq, accuracyDF, overallAccuracy, accuracyByLabel

## k-Means

In [None]:
from sklearn.cluster import KMeans

n_clusters = 10
n_init = 10
max_iter = 300
tol = 0.0001
random_state = 2018
n_jobs = 2

kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter, tol=tol, random_state=random_state, n_jobs=n_jobs)

kMeans_inertia = pd.DataFrame(data=[], index=range(10, 31), columns=['inertia'])
overallAccuracy_kMeansDF = pd.DataFrame(data=[], index=range(10, 31), columns=['overallAccuracy'])

for n_clusters in range(10, 31):
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter, tol=tol, random_state=random_state, n_jobs=n_jobs)

    kmeans.fit(X_train)
    kMeans_inertia.loc[n_clusters] = kmeans.inertia_
    X_train_kmeansClustered = kmeans.predict(X_train)
    X_train_kmeansClustered = pd.DataFrame(data=X_train_kmeansClustered, index=X_train.index, columns=['cluster'])

    countByCluster_kMeans, countByLabel_kMeans, countMostFreq_kMeans, accuracyDF_kMeans, oerallAccuracy_kMeans, \
    accuracyByLabel_kMeans = analyzeCluster(X_train_kmeansClustered, y_train)

    overallAccuracy_kMeansDF.loc[n_clusters] = overallAccuracy_kMeans

overallAccuracy_kMeansDF.plot()

## Hierarchical Clustering

In [None]:
import fastcluster
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.distance import pdist

z = fastcluster.linkage_vector(X_train, method='ward', metric='euclidean')

z_dataFrame = pd.DataFrame(data=z, columns=['clusterOne', 'clusterTwo', 'distance', 'newClusterSize'])


In [None]:
from scipy.cluster.hierarchy import fcluster

distance_threshold = 100
clusters = fcluster(z, distance_threhold, criterion='distance')
X_train_hierClustered = pd.DataFrame(data=clusters, index=X_train_PCA.index, columns=['cluster'])

print('Number of distinct clusters: ', len(X_train_hierClustered['cluster'].unique()))