### Case 1

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

import plotly as py
import plotly.graph_objs as go

import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing 
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 

### (Data) 고객 구매 데이터 (출처: Kaggle)
- CustomerID : 고객 ID
- Gender : 성별
- Age : 나이
- Annual Income (k$) : 연간 수입 (단위: 천 달러)
- Spending Score (1-100) : 지출 지수 (1~100 사이의 점수)

In [None]:
# Read the mall-customer CSV from the local data directory and preview the first five rows.
df = pd.read_csv('./data/Mall_Customers.csv')
df.head(n=5)

In [None]:
# Count missing values per column (isna is the canonical spelling; isnull is its alias).
df.isna().sum()

In [None]:
# Show pandas' default descriptive statistics for df as a quick sanity check of the loaded data.
df.describe()

#### 일반 데이터 셋

In [None]:
# Encode Gender as integers: Male -> 0, Female -> 1.
# The result is assigned back rather than calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate object and, under
# pandas Copy-on-Write, leaves df itself unchanged.
# infer_objects() gives the column an integer dtype once every value has been mapped,
# also on pandas versions where replace() no longer downcasts automatically.
# Re-running this cell is harmless: values that are already 0/1 are left untouched.
df['Gender'] = df['Gender'].replace({'Male': 0, 'Female': 1}).infer_objects()
df.head()

- Encoding

In [None]:
# Label-encode Gender with scikit-learn: fit the encoder on the column, then transform it.
# LabelEncoder numbers the sorted unique values, so on the raw strings it would yield
# Female -> 0 / Male -> 1, which differs from a manual replace of Male -> 0 / Female -> 1.
# NOTE(review): if Gender was already replaced with 0/1 in an earlier cell, this step
# keeps those values as they are — confirm which encoding is intended.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])
df.head(n=5)

#### StandardScaler 실행 셋

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of df with StandardScaler and rebuild a DataFrame
# that carries the original column names.
# NOTE(review): df is passed whole, so the CustomerID column is scaled along with
# the real features — confirm that an identifier should be part of the clustering input.
standard_scaler = StandardScaler().fit(df)
scaled_df = pd.DataFrame(standard_scaler.transform(df), columns=df.columns)

In [None]:
scaled_df.head()

#### Case1. 기본 결과 (스케일링하지 않은 원본 데이터)

In [None]:
# Case 1: hierarchical clustering of the unscaled frame with Ward linkage
# (Euclidean metric, which is also linkage()'s default).
hierachical_result = sch.linkage(df, method='ward', metric='euclidean')

# Draw the merge tree on a wide figure so the individual leaves stay readable.
plt.figure(num=1, figsize=(16, 8))
dendrogram = sch.dendrogram(hierachical_result)

plt.gca().set(
    title='Dendrogram',
    xlabel='Customers',
    ylabel='Euclidean distances',
)
plt.show()

#### Case2. 정규화(표준화, StandardScaler 적용) 결과

In [None]:
# Case 2: hierarchical clustering of the standardized frame with average linkage
# and the Euclidean metric.
hierachical_result = sch.linkage(scaled_df, 'average', 'euclidean')

# Draw the merge tree on a wide figure so the individual leaves stay readable.
plt.figure(num=1, figsize=(16, 8))
dendrogram = sch.dendrogram(hierachical_result)

plt.gca().set(
    title='Dendrogram',
    xlabel='Customers',
    ylabel='Euclidean distances',
)
plt.show()

In [None]:
# Fit a 5-cluster agglomerative model with average linkage and the Euclidean metric.
# The model is fit on scaled_df, the same standardized data the Case 2 dendrogram was
# built from, so the cluster count read off that dendrogram matches the fitted model.
# Fitting on scaled_df also keeps this cell re-runnable: df later receives a 'cluster'
# column, which must not be fed back in as a feature.
hc = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='average')

y_hc = hc.fit_predict(scaled_df)
y_hc

In [None]:
# Store each row's cluster label next to the customer data.
# The label array is assigned directly so labels are placed by row position;
# wrapping it in a DataFrame first would align on index labels instead and
# produce NaN for any row whose index is not 0..n-1.
df['cluster'] = y_hc

In [None]:
# Display the cluster labels returned by hc.fit_predict.
y_hc

In [None]:
# Scatter the customers by the columns at positions 3 and 4 of df (plotted as
# annual income vs. spending score, per the axis labels), one color per cluster.
X = df.iloc[:, [3, 4]].values

# Cluster ids 0..4 map to these colors in order; legend entries are 1-based.
cluster_colors = ['red', 'blue', 'green', 'purple', 'orange']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

plt.title('Clusters of Customers (Hierarchical Clustering Model)')
plt.xlabel('Annual Income(k$)')
plt.ylabel('Spending Score(1-100)')
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()