### Data Load

In [108]:
import pandas as pd

# Load the comma-separated accounts file into a DataFrame
df = pd.read_csv('accounts.csv', sep=',')

### 1

In [None]:
%config InlineBackend.figure_formats = ['svg']

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Settings shared by every k-means run in this cell
max_iter = 500
random_state = 42
k = list(range(2, 9))  # candidate numbers of clusters: 2 through 8

# Build the feature matrix: keep only the first 8 columns of df, then remove
# duplicated rows and rows that contain missing values
X = df.drop(columns=df.columns[8:]).drop_duplicates().dropna()

# Min-max scale 'age' and 'balance', then one-hot encode the frame
# (drop_first=True drops the first level of each encoded column)
scaled_cols = ['age', 'balance']
X[scaled_cols] = MinMaxScaler().fit_transform(X[scaled_cols])
X = pd.get_dummies(X, drop_first=True)

# Fit one k-means model per candidate k and record its inertia (the SSE)
SSE = []
for n_clusters in k:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='random',
        max_iter=max_iter,
        random_state=random_state,
    ).fit(X)
    SSE.append(kmeans.inertia_)

# Draw SSE against k
fig, ax = plt.subplots()
ax.plot(k, SSE)
ax.set(xlabel='k clusters', ylabel='SSE', title='k-means SSE')
plt.show()

### 2

#### a)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Prepare the data: keep the first 8 columns of df, drop duplicated rows and
# rows with missing values, standardise 'age' and 'balance', then one-hot
# encode the frame (drop_first=True drops the first level of each encoded column)
X = df.drop(columns=df.columns[8:]).drop_duplicates().dropna()
X[['age', 'balance']] = StandardScaler().fit_transform(X[['age', 'balance']])
X = pd.get_dummies(X, drop_first=True)

# Apply PCA. The seed is pinned because scikit-learn's automatic solver choice
# may select a randomized SVD, which would make the projection vary between runs.
random_state = 42
pca = PCA(n_components=2, random_state=random_state)
X_new = pca.fit_transform(X)
explained_var = pca.explained_variance_ratio_

# Report the two ratios and their sum. Every term is formatted to 3 decimals so
# that adding the two rounded floats cannot print binary floating-point noise
# (e.g. 0.30000000000000004) and the displayed equation stays consistent.
first, second = explained_var[0].round(3), explained_var[1].round(3)
print(f'Variability explained by the top 2 components: {first:.3f} + {second:.3f} = {first + second:.3f}')

#### b)

In [None]:
# Version with data scaled

import seaborn as sns

# Clustering settings for this cell
k = 3
random_state = 42

# Coordinates of each row on the first and second columns of X_new
# (the PCA projection computed in an earlier cell)
component1, component2 = X_new[:, 0], X_new[:, 1]

# Cluster the current feature matrix X into k groups
kmeans = KMeans(n_clusters=k, random_state=random_state)
clusters = kmeans.fit_predict(X)

# Scatter the projected points, coloured by their assigned cluster
plot = sns.scatterplot(x=component1, y=component2, hue=clusters)
plot.legend(title='Cluster')
plot.set(
    title='Scatterplot of 3 clusters based on the top 2 components',
    xlabel='Component 1',
    ylabel='Component 2',
)
plt.show()

In [None]:
# Variant of the previous plot where k-means runs on features that were NOT scaled
k = 3
random_state = 42

# The scatter coordinates are read from X_new as it exists when this cell runs;
# this cell does not recompute the PCA on the unscaled features.
# NOTE(review): confirm that reusing the earlier projection here is intended.
component1, component2 = X_new[:, 0], X_new[:, 1]

# Rebuild the feature matrix without any scaling step: first 8 columns of df,
# duplicated rows and rows with missing values removed, then one-hot encoded
X = pd.get_dummies(
    df.drop(columns=df.columns[8:]).drop_duplicates().dropna(),
    drop_first=True,
)

# Cluster the unscaled features into k groups
kmeans = KMeans(n_clusters=k, random_state=random_state)
clusters = kmeans.fit_predict(X)

# Scatter the projected points, coloured by their assigned cluster
plot = sns.scatterplot(x=component1, y=component2, hue=clusters)
plot.legend(title='Cluster')
plot.set(
    title='Scatterplot of 3 clusters based on the top 2 components',
    xlabel='Component 1',
    ylabel='Component 2',
)
plt.show()

#### c)

In [None]:
%config InlineBackend.figure_formats = ['svg']

import seaborn as sns

# Rebuild the un-encoded feature frame (first 8 columns of df, duplicated rows
# and rows with missing values removed) so the original category labels are
# available for plotting
X = df.drop(columns=df.columns[8:]).drop_duplicates().dropna()

# Attach the cluster labels. `clusters` is not computed in this cell: it holds
# whatever the most recently executed clustering cell produced.
# NOTE(review): confirm which clustering (scaled or unscaled) is meant here.
X['cluster'] = clusters

# Options shared by both distribution plots
displot_options = dict(
    hue='cluster',
    multiple='dodge',
    stat='density',
    shrink=0.8,
    common_norm=False,
)

# Per-cluster density of each job category
plot = sns.displot(data=X, y='job', **displot_options)
plt.show()

# Per-cluster density of each education level
sns.displot(data=X, y='education', **displot_options)
plt.show()