In [8]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


<div dir=rtl style="direction: rtl;text-align: right;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>
    
|ستون|توضیحات|
|------|---|
|CustomerID|شناسه مشتری|
|Gender|جنسیت مشتری|
|Age|سن مشتری|
|Annual Income (k$)|درآمد سالانه مشتری بر حسب هزار دلار|
|Spending Score (1-100)|میزان تمایل مشتری به خرید. هر چه عدد بزرگتر باشد، مشتری تمایل بیشتری برای پرداخت پول دارد!|

</font>
</div>



In [9]:
# Load the mall-customers table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/mall_customers.csv')
# Bare expression on the last line so the notebook renders the frame.
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [10]:
class Preprocessor:
    """Prepare the customer frame for clustering.

    Three steps are offered, and ``transform`` runs them in order:
    fill missing values with 0, label-encode ``Gender``, and min-max scale
    every column.

    The instance works on its own copy of the frame given to the constructor,
    so the caller's DataFrame is not modified. The working copy lives in
    ``self.df``.
    """

    def __init__(self, df):
        """Store a private copy of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame. ``encoder`` requires a ``Gender`` column, and
            ``scale`` requires every column to be numeric by the time it runs.
        """
        self.df = df.copy()

    def handle_missing_values(self):
        """Replace every missing value in the working copy with 0."""
        # Rebind instead of using inplace=True; the result is the same frame content.
        self.df = self.df.fillna(0)

    def encoder(self):
        """Replace the ``Gender`` column with integer codes from ``LabelEncoder``."""
        gender = self.df["Gender"]
        if gender.dtype == object:
            # After fillna(0) an object column can hold the integer 0 next to the
            # string labels, and LabelEncoder refuses mixed str/int input.
            # Casting to str keeps the original labels unchanged and turns the
            # filler into its own "0" category.
            gender = gender.astype(str)
        self.df["Gender"] = LabelEncoder().fit_transform(gender)

    def scale(self):
        """Min-max scale every column of the working copy.

        MinMaxScaler rescales each column independently, so a single fit on
        the whole frame is equivalent to fitting one scaler per column.
        All columns come back as floats; index and column labels are kept.
        """
        scaled = MinMaxScaler().fit_transform(self.df)
        self.df = pd.DataFrame(scaled, index=self.df.index, columns=self.df.columns)

    def transform(self):
        """Run fill -> encode -> scale and return the processed frame.

        Returns
        -------
        pandas.DataFrame
            The working copy itself (``self.df``), not a new copy.
        """
        self.handle_missing_values()
        self.encoder()
        self.scale()
        return self.df

In [11]:
# transform() runs handle_missing_values -> encoder -> scale itself, so the
# individual steps are not called separately beforehand.
pre_pro = Preprocessor(df)
# Note: this rebinds `df` to the processed frame; reload the CSV to get the raw data back.
df = pre_pro.transform()
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,0.000000,1.0,0.019231,0.000000,0.387755
1,0.005025,1.0,0.057692,0.000000,0.816327
2,0.010050,0.0,0.038462,0.008197,0.051020
3,0.015075,0.0,0.096154,0.008197,0.775510
4,0.020101,0.0,0.250000,0.016393,0.397959
...,...,...,...,...,...
195,0.979899,0.0,0.326923,0.860656,0.795918
196,0.984925,0.0,0.519231,0.909836,0.275510
197,0.989950,1.0,0.269231,0.909836,0.744898
198,0.994975,1.0,0.269231,1.000000,0.173469


In [12]:
# modeling
# Fix the seed: the centroid initialisation is random, so without random_state
# the cluster labels (and any score computed from them) can change between runs.
# NOTE(review): df still contains the scaled CustomerID column, so the row
# identifier takes part in the distance computation; confirm this is intended.
model = KMeans(n_clusters=2, n_init='auto', random_state=42)
model.fit(df)
y_p = model.labels_

In [13]:
# evaluate model
# Mean silhouette coefficient of the labels, measured on the same frame the
# model was fitted on.
si = silhouette_score(X=df, labels=y_p)
# Silhouette lies in [-1, 1]; shift and halve it so the displayed value is in [0, 1].
(si + 1) / 2

0.7257851688528456