In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans, DBSCAN, MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from pyclustering.cluster.clarans import clarans
from pyclustering.cluster import cluster_visualizer_multidim
from pyclustering.utils import timedcall
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Print per-cluster summary statistics (count, max, median, min, mean) of the
# original median_house_value labels, grouped by the predicted cluster id.
def compareWithOriginalLabels(target_label, predict):
    """Print summary statistics of the target values for every predicted cluster.

    Parameters
    ----------
    target_label : pandas.Series
        Target values; must be named 'median_house_value' because that column
        name is looked up after concatenation.
    predict : pandas.DataFrame
        Frame holding the cluster assignment in a column named 'predict'.

    Notes
    -----
    The two inputs are joined column-wise with ``pd.concat(axis=1)``, i.e. rows
    are matched by index label. Nothing is returned; results are printed.
    """
    print('='*10,'Compare with original labels','='*10)

    # Group once and reuse the grouped view for every statistic.
    values_by_cluster = (
        pd.concat([target_label, predict], axis=1)
        .groupby('predict')['median_house_value']
    )

    for stat_name in ('count', 'max', 'median', 'min', 'mean'):
        print('===' + stat_name + '===')
        print(getattr(values_by_cluster, stat_name)())

In [None]:
def findBestParameter(n_clusters, df, method=None, metrics=('manhattan','euclidean')):
    """Scan k = 2..n_clusters and return the two k values with the best silhouette score.

    For every k the chosen model is fitted once and the resulting labels are
    scored with each requested silhouette metric, so all curves describe the
    same clustering. One line plot per metric is shown.

    Parameters
    ----------
    n_clusters : int
        Largest number of clusters to try (inclusive); must be >= 2.
    df : array-like or pandas.DataFrame
        Data to cluster (passed unchanged to the model and to silhouette_score).
    method : {'KMeans', 'GMM'} or None
        Clustering model to evaluate.
    metrics : sequence of str
        Distance metrics handed to ``silhouette_score``.

    Returns
    -------
    list of int or None
        Up to two distinct k values, best first. Ranking uses the 'euclidean'
        scores when that metric was requested, otherwise the first metric.
        ``None`` is returned (after printing an error) when ``method`` is None.

    Raises
    ------
    ValueError
        If ``method`` is not supported, ``metrics`` is empty, or
        ``n_clusters`` is smaller than 2.
    """
    # Exception handling
    if method is None:
        print('[Error]: No method specified')
        return

    if method not in ('KMeans', 'GMM'):
        raise ValueError("Unsupported method: {!r} (expected 'KMeans' or 'GMM')".format(method))

    # De-duplicate while keeping the caller's order.
    metrics = list(dict.fromkeys(metrics))
    if not metrics:
        raise ValueError('At least one silhouette metric is required')

    range_of_clusters = list(range(2, n_clusters + 1))
    if not range_of_clusters:
        raise ValueError('n_clusters must be >= 2, got {!r}'.format(n_clusters))

    silhouette_scores = {metric: [] for metric in metrics}

    print('Start calculating silhouette_score...( method =',method,')')
    for k in range_of_clusters:
        # Fit once per k; the same labels are scored under every metric.
        if method == 'KMeans':
            labels = KMeans(n_clusters = k).fit(df).labels_
        else:
            labels = GaussianMixture(n_components = k).fit_predict(df)

        for metric in metrics:
            silhouette_scores[metric].append(silhouette_score(df, labels, metric=metric))

    # One plot per requested metric.
    for metric in metrics:
        plt.clf()
        plt.plot(range_of_clusters, silhouette_scores[metric], 'bx-')
        plt.xlabel('Values of K')
        plt.ylabel('Silhouette score ({})'.format(metric))
        plt.title('Silhouette analysis For Optimal k')
        plt.show()

    # Return the 2 best k values. Sorting (instead of overwriting the winner
    # with 0) stays correct when scores are negative; the sort is stable, so
    # ties still resolve to the smaller k.
    ranking_metric = 'euclidean' if 'euclidean' in silhouette_scores else metrics[0]
    ranked = sorted(
        zip(range_of_clusters, silhouette_scores[ranking_metric]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [k for k, _ in ranked[:2]]


In [None]:
# Upper bound of k scanned by findBestParameter for KMeans / GMM.
_AUTOML_MAX_K = 12
# CLARANS is only run on the first rows of the scaled data.
_AUTOML_CLARANS_ROWS = 200


def _report_clustering(scaled_df, labels, target_label):
    """Pairplot the scaled data coloured by cluster and print per-cluster target stats."""
    # Keep the index of scaled_df so the column-wise concat lines up row by row.
    predict = pd.DataFrame({'predict': labels}, index=scaled_df.index)
    r = pd.concat([scaled_df, predict], axis=1)

    # Plotting with pairplot
    plt.clf()
    sns.pairplot(r, hue='predict')
    plt.show()

    # Compare predictions with original target_label
    compareWithOriginalLabels(target_label, predict)


def _run_kmeans(scaled_df, target_label, model_params):
    """Fit KMeans for the two best k values over the max_iter x algorithm grid."""
    # Find optimal parameter(k) using silhouette score
    bestK_s = findBestParameter(_AUTOML_MAX_K, scaled_df, 'KMeans', ['manhattan','euclidean'])
    print('best K_s =',bestK_s)

    for bestK in bestK_s:
        for max_iter in model_params.get('max_iter'):
            for algorithm in model_params.get('algorithm'):
                model = KMeans(n_clusters=bestK, max_iter=max_iter, algorithm=algorithm, random_state=12)
                labels = model.fit_predict(scaled_df)
                print('max_iter =',max_iter, '/ algorithm =',algorithm, '/ k =',bestK, "Done.")
                _report_clustering(scaled_df, labels, target_label)


def _run_gmm(scaled_df, target_label, model_params):
    """Fit GaussianMixture for the two best k values over the covariance_type x init_params grid."""
    # Find optimal parameter(k) using silhouette score
    bestK_s = findBestParameter(_AUTOML_MAX_K, scaled_df, 'GMM', ['manhattan','euclidean'])
    print('best K_s =',bestK_s)

    for bestK in bestK_s:
        for covariance_type in model_params.get('covariance_type'):
            for init_param in model_params.get('init_params'):
                model = GaussianMixture(n_components=bestK, covariance_type=covariance_type, init_params=init_param, random_state=12)
                labels = model.fit_predict(scaled_df)
                print('covariance_type =',covariance_type, '/ init_params =',init_param, '/ k =',bestK, "Done.")
                _report_clustering(scaled_df, labels, target_label)


def _run_clarans(scaled_df, target_label, model_params):
    """Run CLARANS on the first rows of the scaled data and visualise the clusters."""
    # Work on a local subset only: the caller's scaled_df must stay complete
    # for the models that run after CLARANS.
    sample_list = scaled_df.iloc[:_AUTOML_CLARANS_ROWS].values.tolist()

    for clusterNum in model_params.get('number_clusters'):
        for numlocal in model_params.get('numlocal'):
            for maxneighbor in model_params.get('maxneighbor'):
                print("CLARANS Processing...")
                model = clarans(sample_list, clusterNum, numlocal, maxneighbor)
                model.process()
                print("CLARANS Processed.")

                # Plotting
                vis = cluster_visualizer_multidim()
                vis.append_clusters(model.get_clusters(), sample_list, marker="*", markersize=2)
                vis.show()


def _run_dbscan(scaled_df, target_label, model_params):
    """Fit DBSCAN over the eps x min_samples x metric grid."""
    for eps in model_params.get('eps'):
        for min_sample in model_params.get('min_samples'):
            for metric in model_params.get('metric'):
                model = DBSCAN(eps=eps, min_samples=min_sample, metric=metric)
                labels = model.fit_predict(scaled_df)
                print('min_samples =',min_sample, '/ eps =',eps, '/ metric =',metric, "Done.")
                _report_clustering(scaled_df, labels, target_label)


def _run_mean_shift(scaled_df, target_label, model_params):
    """Fit MeanShift over the max_iter x bandwidth grid."""
    # The estimate is printed for reference only; the bandwidths actually used
    # come from model_params. A separate name keeps the loop below from
    # shadowing it.
    estimated_bandwidth = estimate_bandwidth(scaled_df)
    print('best bandwidth =',estimated_bandwidth)

    for max_iter in model_params.get('max_iter'):
        for bandwidth in model_params.get('bandwidth'):
            model = MeanShift(max_iter=max_iter, bandwidth=bandwidth)
            labels = model.fit_predict(scaled_df)
            print('max_iter =',max_iter, '/ bandwidth =',bandwidth, "Done.")
            _report_clustering(scaled_df, labels, target_label)


# Model name (key of model_list_and_params) -> function running that model's grid.
_AUTOML_MODEL_RUNNERS = {
    'KMeans': _run_kmeans,
    'GMM': _run_gmm,
    'CLARANS': _run_clarans,
    'DBSCAN': _run_dbscan,
    'Mean shift': _run_mean_shift,
}


def AutoML(dataset, target_label, encoder_list=None, scaler_list=None, model_list_and_params=None):
    """Run every encoder x scaler x model x hyper-parameter combination on a dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame. Must contain the categorical column 'ocean_proximity',
        which is the only column that gets encoded.
    target_label : pandas.Series
        Original target values, used only for the per-cluster comparison.
        It is joined to the predictions by index, so it should share the row
        index of ``dataset``.
    encoder_list : list, optional
        Encoder objects exposing ``fit_transform``.
    scaler_list : list, optional
        Scaler objects exposing ``fit_transform``.
    model_list_and_params : dict, optional
        Maps a model name ('KMeans', 'GMM', 'CLARANS', 'DBSCAN', 'Mean shift')
        to a dict of hyper-parameter lists. Unknown names are reported and
        skipped.

    Returns
    -------
    None
        All results are printed / plotted.
    """
    # AutoML Start
    print('='*15,'AutoML Start','='*15)

    encoder_list = [] if encoder_list is None else encoder_list
    scaler_list = [] if scaler_list is None else scaler_list
    # dict() also accepts the legacy empty-list value.
    model_grid = {} if model_list_and_params is None else dict(model_list_and_params)

    # Encoding ---------------------------------------
    for encoder in encoder_list:
        print('='*15,'Encoding (',encoder,')','='*15)

        # Encode the categorical feature and append it to the remaining columns.
        # Reusing the dataset index keeps the concat aligned row by row even
        # when the index is not a plain 0..n-1 range.
        encoded_array = encoder.fit_transform(dataset[['ocean_proximity']])
        encoded_column = pd.DataFrame(encoded_array, columns=['ocean_proximity'], index=dataset.index)
        encoded_df = pd.concat([dataset.drop(['ocean_proximity'], axis=1), encoded_column], axis=1)

        # Scaling ------------------------------------
        for scaler in scaler_list:
            print('='*15,'Scaling (',scaler,')','='*15)
            scaled_df = pd.DataFrame(
                scaler.fit_transform(encoded_df),
                columns=encoded_df.columns,
                index=encoded_df.index,
            )

            # Modeling -------------------------------
            for modelName, model_params in model_grid.items():
                run_model = _AUTOML_MODEL_RUNNERS.get(modelName)
                if run_model is None:
                    print('[Error]: Wrong modelName :',modelName)
                    continue

                print('='*15,'Model :',modelName,'='*15)
                run_model(scaled_df, target_label, model_params)


# Main

In [None]:
# load dataset
df = pd.read_csv('housing.csv')

# Data Exploration & Preprocessing -------------------
print('='*15,'<Original Dataset>','='*15)
# df.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()
print()
display(df)

# Drop rows with missing values BEFORE splitting off the target, so that
# target_label and the feature frame keep exactly the same rows and index.
# (Splitting first left target_label with the rows/index of the raw file while
# the features were filtered and re-indexed, misaligning every comparison.)
df = df.dropna()
df = df.reset_index(drop=True)

# Separate the median_house_value column (used only for comparison)
target_label = df['median_house_value']
df = df.drop(['median_house_value'], axis=1)

# Check modified dataset
print('='*15,'<Modified Dataset>','='*15)
df.info()
print()
display(df)
# ----------------------------------------------------

# Shared configurations ------------------------------
# Full search space
full_encoder_list = [
    preprocessing.LabelEncoder(),
    preprocessing.OrdinalEncoder()
]
full_scaler_list = [
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
    preprocessing.RobustScaler(),
    preprocessing.MaxAbsScaler(),
    preprocessing.Normalizer()
]
# NOTE(review): KMeans algorithm 'full' was renamed to 'lloyd' in newer
# scikit-learn releases — confirm against the installed version.
full_model_list_and_params = {
    'KMeans':{
        'max_iter':[100,300,500],
        'algorithm':['full','elkan']
    },
    'GMM':{
        'covariance_type':['full', 'tied', 'diag'],
        'init_params':['kmeans', 'random']
    },
    'CLARANS':{
        'number_clusters':[2],
        'numlocal':[5],
        'maxneighbor':[8]
    },
    'DBSCAN':{
        'eps':[0.3,1,1.5],
        'min_samples':[100,200],
        'metric':['euclidean', 'manhattan']
    },
    'Mean shift':{
        'max_iter':[100,300,500],
        'bandwidth':[2,3,4]
    }
}

# Reduced search space (single encoder / scaler)
reduced_encoder_list = [
    preprocessing.LabelEncoder(),
]
reduced_scaler_list = [
    preprocessing.StandardScaler(),
]
# Reduced model parameters for the full dataset
reduced_params_full_dataset = {
    'KMeans':{
        'max_iter':[300],
        'algorithm':['elkan']
    },
    'GMM':{
        'covariance_type':['tied'],
        'init_params':['random']
    },
    'CLARANS':{
        'number_clusters':[2],
        'numlocal':[5],
        'maxneighbor':[8]
    },
    'DBSCAN':{
        'eps':[1],
        'min_samples':[200],
        'metric':['euclidean']
    },
    'Mean shift':{
        'max_iter':[300],
        'bandwidth':[3]
    }
}
# Reduced model parameters for the reduced dataset (DBSCAN / Mean shift differ)
reduced_params_reduced_dataset = {
    'KMeans':{
        'max_iter':[300],
        'algorithm':['elkan']
    },
    'GMM':{
        'covariance_type':['tied'],
        'init_params':['random']
    },
    'CLARANS':{
        'number_clusters':[2],
        'numlocal':[5],
        'maxneighbor':[8]
    },
    'DBSCAN':{
        'eps':[0.5],
        'min_samples':[300],
        'metric':['euclidean']
    },
    'Mean shift':{
        'max_iter':[300],
        'bandwidth':[2]
    }
}
# ----------------------------------------------------

# 1. Full dataset, full parameters
encoder_list = full_encoder_list
scaler_list = full_scaler_list
model_list_and_params = full_model_list_and_params
AutoML(
    dataset=df,
    target_label=target_label,
    encoder_list=encoder_list,
    scaler_list=scaler_list,
    model_list_and_params = model_list_and_params,
)
# ----------------------------------------------------

# 2. Full dataset, reduced parameters
encoder_list = reduced_encoder_list
scaler_list = reduced_scaler_list
model_list_and_params = reduced_params_full_dataset
AutoML(
    dataset=df,
    target_label=target_label,
    encoder_list=encoder_list,
    scaler_list=scaler_list,
    model_list_and_params = model_list_and_params,
)

# Data Preprocessing ----------------------------
# Drop 'longitude','latitude','housing_median_age','median_income' columns
df = df.drop(['longitude','latitude','housing_median_age','median_income'], axis=1)

# Check modified dataset
print('='*15,'<Modified Dataset>','='*15)
df.info()
print()
display(df)

# 3. Reduced dataset, full parameters
encoder_list = full_encoder_list
scaler_list = full_scaler_list
model_list_and_params = full_model_list_and_params
AutoML(
    dataset=df,
    target_label=target_label,
    encoder_list=encoder_list,
    scaler_list=scaler_list,
    model_list_and_params = model_list_and_params,
)

# 4. Reduced dataset, reduced parameters
encoder_list = reduced_encoder_list
scaler_list = reduced_scaler_list
model_list_and_params = reduced_params_reduced_dataset
AutoML(
    dataset=df,
    target_label=target_label,
    encoder_list=encoder_list,
    scaler_list=scaler_list,
    model_list_and_params = model_list_and_params
)