In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics

# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

## Question 1

In [None]:
# Load the Question 1 dataset from a CSV file located next to the notebook.
df_1 = pd.read_csv('./HW3_1_data.csv')


In [None]:
# Convert the whole frame to a numpy array of features. Every column is used;
# clustering is unsupervised, so no target column is split off.
X_1 = df_1.values


### KMeans

In [None]:
# Elbow method: fit one seeded KMeans model per candidate k (1..24) and keep
# every fitted model so later cells can reuse them.
k_values = range(1, 25)
kmeans_models = [KMeans(n_clusters=n, random_state=0).fit(X_1) for n in k_values]
innertia = [fitted.inertia_ for fitted in kmeans_models]

# Plot inertia against the number of clusters to locate the elbow.
plt.plot(k_values, innertia)
plt.title('Elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('Innertia')
plt.show()


In [None]:
# The elbow falls somewhere between k = 4 and k = 7, so compare those four
# fitted models by silhouette score (list indices 3..6 hold k = 4..7).
elbow_candidates = kmeans_models[3:7]
silhoutte_scores = [
    metrics.silhouette_score(X_1, candidate.labels_)
    for candidate in elbow_candidates
]
model_clusters = [candidate.n_clusters for candidate in elbow_candidates]

plt.plot(model_clusters, silhoutte_scores, "bo-")
plt.xticks(model_clusters)
plt.title('Silhoutte scores vs Number of clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Silhoutte score')
plt.show()

In [None]:
# Per the silhouette plot above, n_clusters = 4 was chosen as the best k.
# random_state=0 matches the seed used for the models compared in the elbow and
# silhouette cells, so this fit is reproducible across runs and is the same
# clustering whose score motivated the choice.
kmeans = KMeans(n_clusters=4, random_state=0)
y_pred = kmeans.fit_predict(X_1)
s_score = metrics.silhouette_score(X_1, kmeans.labels_)

# Scatter the first two feature columns, colored by cluster assignment.
plt.scatter(X_1[:, 0], X_1[:, 1], c=y_pred)
plt.title("n_clusters: %d, silhouette_score: %.4f"%(kmeans.n_clusters, s_score) )
plt.show()

In [None]:
# Report the Calinski-Harabasz and silhouette scores of the fitted KMeans model.
# Both metrics read the labels from the model itself rather than from the
# global `y_pred`, which other cells reassign; the cluster count is taken from
# the model so the message cannot drift from what was actually fitted.
c_score = metrics.calinski_harabasz_score(X_1, kmeans.labels_)
s_score = metrics.silhouette_score(X_1, kmeans.labels_)
print("kmeans \n n_clusters: %d\n calinski_harabasz_score: %.4f\n silhouette_score: %.4f"
      %(kmeans.n_clusters,c_score,s_score))

### DBSCAN

In [None]:
# Grid-search DBSCAN over eps (1.0 to 2.9 in steps of 0.1) and min_samples,
# keeping the fitted model with the highest silhouette score.
model = None
best_socre = -1
eps_list = np.arange(1,3,0.1)
sample_list = [1,5,10]
n_samples = len(X_1)
for _eps in eps_list:
    for _min_sample in sample_list:
        dbscan = DBSCAN(eps=_eps, min_samples=_min_sample)
        y_pred = dbscan.fit_predict(X_1)
        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
        # and raises ValueError otherwise (e.g. everything merged into a single
        # cluster, or every point its own cluster). Skip such candidates
        # instead of aborting the whole search.
        n_labels = len(set(dbscan.labels_))
        if n_labels < 2 or n_labels > n_samples - 1:
            continue
        # NOTE: DBSCAN labels noise points as -1; they are scored here as if
        # they formed one more cluster, as in the original analysis.
        s_score = metrics.silhouette_score(X_1, dbscan.labels_)
        if s_score > best_socre:
            best_socre = s_score
            model = dbscan
# Fail with a clear message rather than an AttributeError on None below.
if model is None:
    raise ValueError("No DBSCAN parameter combination produced a valid clustering "
                     "(need between 2 and n_samples - 1 distinct labels)")
# Show the parameter and score of best model
print("eps: %.4f, min_samples: %d,\n silhouette_score: %.4f"%
      (model.eps, model.min_samples, best_socre) )


In [None]:
# Visualize the selected DBSCAN model: refit it on X_1 and scatter the first two
# feature columns, colored by the resulting labels.
y_pred = model.fit_predict(X_1)
plt.scatter(X_1[:,0], X_1[:,1], c=y_pred)
s_score = metrics.silhouette_score(X_1, model.labels_)
plot_title = "eps: %.4f, min_samples: %d,\n silhouette_score: %.4f" % (
    model.eps,
    model.min_samples,
    s_score,
)
plt.title(plot_title)
plt.show()


In [None]:
# Report the Calinski-Harabasz and silhouette scores of the selected DBSCAN model.
# Both metrics use model.labels_: the global `y_pred` is overwritten on every
# iteration of the grid search, so unless the plotting cell was run in between
# it holds the labels of the last candidate tried, not of the best model.
c_score = metrics.calinski_harabasz_score(X_1, model.labels_)
s_score = metrics.silhouette_score(X_1, model.labels_)
print("DBSCAN \n eps: %.4f, min_samples: %d\n calinski_harabasz_score: %.4f\n silhouette_score: %.4f"
      %(model.eps, model.min_samples,c_score,s_score))

## Question 2

### Data Cleaning

In [None]:
# Load the Question 2 dataset from a CSV file located next to the notebook and
# preview its first rows.
df_2 = pd.read_csv('./HW3_2_data.csv')
df_2.head()

In [None]:
# Show summary statistics for the columns of the loaded data.
df_2.describe()

In [None]:
# Print the frame's structural summary (column dtypes and non-null counts), which
# reveals the columns that contain missing values.
df_2.info()

In [None]:
# CUST_ID appears to be a unique identifier per customer, so it carries no
# information for clustering; drop the column.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df_2 = df_2.drop(columns=['CUST_ID'], errors='ignore')

In [None]:
# Show the number of NaN values in each column (a count, not a proportion).
df_2.isna().sum()

In [None]:
# Replace the NaN values in CREDIT_LIMIT and MINIMUM_PAYMENTS with each column's
# mean (NaNs are skipped when the mean is computed).
# The result is assigned back instead of calling
# `df_2[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# selected from the frame, which emits a chained-assignment warning on recent
# pandas and leaves df_2 unchanged once Copy-on-Write is enabled.
df_2 = df_2.fillna({
    'MINIMUM_PAYMENTS': df_2['MINIMUM_PAYMENTS'].mean(skipna=True),
    'CREDIT_LIMIT': df_2['CREDIT_LIMIT'].mean(skipna=True),
})

In [None]:
# Standardize the features with StandardScaler so that no single column
# dominates the distance computations. A copy of the frame is passed in, and
# the result is a numpy array.
scaler = StandardScaler()
X_2 = scaler.fit_transform(df_2.copy())
X_2.shape


### Train model

In [None]:
# Elbow method: fit one seeded KMeans model for every k in 1..clusters-1 on the
# scaled data and plot each model's final inertia against k.
clusters = 25  # exclusive upper bound of the candidate k values
cost = []  # not used by the cells visible here; kept in case a later cell reads it
k_values = range(1, clusters)
kmeans_models = [KMeans(n_clusters=k, random_state=23).fit(X_2) for k in k_values]
innertia = [model.inertia_ for model in kmeans_models]

plt.plot(k_values, innertia)
plt.title('Elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('Innertia')
plt.show()

In [None]:
# The elbow falls somewhere between k = 2 and k = 7, so compare those six
# fitted models by silhouette score (list indices 1..6 hold k = 2..7).
elbow_candidates = kmeans_models[1:7]
silhoutte_scores = [
    metrics.silhouette_score(X_2, candidate.labels_)
    for candidate in elbow_candidates
]
model_clusters = [candidate.n_clusters for candidate in elbow_candidates]

plt.plot(model_clusters, silhoutte_scores, "bo-")
plt.xticks(model_clusters)
plt.title('Silhoutte scores vs Number of clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Silhoutte score')
plt.show()

In [None]:
# Per the silhouette plot above, n_clusters = 3 was chosen as the best k.
# Select that model by its n_clusters rather than by list position, so the
# choice stays correct if the range of k values tried is ever changed.
k_means = next(m for m in kmeans_models if m.n_clusters == 3)
# The model was already fitted on X_2 with a fixed random_state, so its labels
# are reused directly; refitting would only repeat the same computation.
labels = k_means.labels_
y_pred = labels

In [None]:
# Report the Calinski-Harabasz and silhouette scores of the selected KMeans model.
# Both metrics read the labels from the model itself rather than from the
# global `y_pred`, which several earlier cells also assign.
c_score = metrics.calinski_harabasz_score(X_2, k_means.labels_)
s_score = metrics.silhouette_score(X_2, k_means.labels_)
print("kmeans \n n_clusters: %d\n calinski_harabasz_score: %.4f\n silhouette_score: %.4f"
      %(k_means.n_clusters,c_score,s_score))