# Best Clustering Technique
We take the aggregate sum of each parameter (evaluation metric) for each clustering technique and apply TOPSIS to find the best technique.

In [1]:
import pandas as pd

In [10]:
# Load the per-technique metric table from the working directory.
# Per the displayed output, the columns are: Clustering Techniques,
# Silhouette, Calinski-Harabasz, Davies-Bouldin.
df=pd.read_csv("best_clustering_technique.csv")
# Bare last expression: renders the loaded frame as the cell output.
df

Unnamed: 0,Clustering Techniques,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,Kmeans,8.5537,1411312.0,35.1797
1,Agglomerative,8.2,1262895.0,38.5362
2,Density-Based Spatial,-9.7184,260.3716,28.2396


In [11]:
def topsis(data,weights,impacts):
    """Rank alternatives with TOPSIS and return the scored table.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per alternative. The first column is treated as the
        alternative's label and is excluded from the calculation; every
        remaining column is one criterion.
    weights : sequence of numbers
        One weight per criterion column, in column order.
    impacts : sequence of str
        One symbol per criterion column, in column order: '+' when a larger
        value is better, '-' when a smaller value is better.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with two extra columns: 'Performance Score'
        (relative closeness to the ideal solution, between 0 and 1) and
        'Rank' (1 = best; ties share the average rank). The input frame is
        never modified.

    Raises
    ------
    ValueError
        If ``data`` has no rows, if ``weights`` or ``impacts`` do not have
        exactly one entry per criterion column, or if ``impacts`` contains
        anything other than '+' or '-'.
    """
    #Step 1
    #removing first column that's usually the name of models/criteria
    criteria=data.drop(data.columns[0],axis=1)
    n_criteria=criteria.shape[1]

    # Validate up front: a short list would otherwise fail deep inside the
    # computation, and an unknown impact symbol would be silently skipped.
    weights=list(weights)
    impacts=list(impacts)
    if len(weights) != n_criteria or len(impacts) != n_criteria:
        raise ValueError(
            "weights and impacts must each have one entry per criterion column "
            "(expected %d, got %d weights and %d impacts)"
            % (n_criteria, len(weights), len(impacts))
        )
    unknown=[symbol for symbol in impacts if symbol not in ('+','-')]
    if unknown:
        raise ValueError("impacts must be '+' or '-', got %r" % (unknown,))
    if criteria.shape[0] == 0:
        raise ValueError("data must contain at least one alternative (row)")

    #Step2
    #check for any categorical columns
    categorical_columns = criteria.select_dtypes(include=['object']).columns

    # if yes, Convert categorical columns to numeric using cat.codes
    criteria[categorical_columns] = criteria[categorical_columns].apply(lambda col: col.astype('category').cat.codes)

    # Work on a float array so integer criterion columns are not truncated or
    # re-typed cell by cell during normalisation.
    matrix=criteria.to_numpy(dtype=float)

    #Step 3
    #Calculate root of square of sum for each column
    rss=(matrix**2).sum(axis=0)**0.5
    # An all-zero column has norm 0; dividing by 1 instead keeps it at zero
    # (it cannot discriminate between alternatives) rather than producing NaN.
    rss[rss == 0]=1.0

    #Step 4
    #Calculate normalized performance values
    normalized=matrix/rss

    #Step 5
    #Calculate weighted normalised decision matrix
    # TOPSIS scales each normalised column BY its weight (v_ij = w_j * r_ij);
    # dividing would make heavier weights count for less.
    weight_vector=pd.Series(weights,dtype=float).to_numpy()
    weighted=normalized*weight_vector

    #Step 6
    #Extract ideal best and ideal worst for each column according to impact
    column_max=weighted.max(axis=0)
    column_min=weighted.min(axis=0)
    is_benefit=pd.Series(impacts,dtype=object).eq('+').to_numpy(dtype=bool)
    # Start from the '-' (cost) case, then overwrite the '+' (benefit) columns.
    ideal_best=column_min.copy()
    ideal_best[is_benefit]=column_max[is_benefit]
    ideal_worst=column_max.copy()
    ideal_worst[is_benefit]=column_min[is_benefit]

    #Step 7
    #Calculate euclidean distance for both ideal best and ideal worst value
    Sp=((weighted-ideal_best)**2).sum(axis=1)**0.5
    Sn=((weighted-ideal_worst)**2).sum(axis=1)**0.5

    #Step 8
    #Calculate performance score
    total=Sp+Sn
    # Both distances are zero only when the alternative coincides with the
    # ideal best and the ideal worst at once (a single row, or all rows equal).
    # Score it 0.5 (equally close to both) instead of dividing 0 by 0.
    degenerate=total == 0
    total[degenerate]=1.0
    score=Sn/total
    score[degenerate]=0.5

    #Step 9
    #Create DataFrame
    # Explicit copy: pd.DataFrame(data) may share storage with the caller's
    # frame, so adding columns to it could alter the input.
    df_new=data.copy()
    df_new['Performance Score']=score

    #Step 10
    #Calculate Rank based on performance score and add it to the dataframe
    df_new['Rank'] = df_new['Performance Score'].rank(ascending=False)

    return df_new

In [12]:
# One weight per criterion column, in column order
# (Silhouette, Calinski-Harabasz, Davies-Bouldin); all criteria count equally.
weights=[1,1,1]
# '+' = larger value is better, '-' = smaller value is better.
# Silhouette and Calinski-Harabasz are maximised; Davies-Bouldin is minimised.
impacts=['+','+','-']

In [15]:
# Score and rank the clustering techniques; topsis returns the input columns
# plus 'Performance Score' and 'Rank' (rank 1.0 = highest score).
result=topsis(df,weights,impacts)
# Bare last expression: renders the ranked table as the cell output.
result

Unnamed: 0,Clustering Techniques,Silhouette,Calinski-Harabasz,Davies-Bouldin,Performance Score,Rank
0,Kmeans,8.5537,1411312.0,35.1797,0.92325,1.0
1,Agglomerative,8.2,1262895.0,38.5362,0.875265,2.0
2,Density-Based Spatial,-9.7184,260.3716,28.2396,0.109873,3.0


So `K-means Clustering` is the best choice: it has the highest TOPSIS performance score (rank 1).

In [16]:
# Persist the ranked technique table. With pandas defaults, to_csv also writes
# the row index as an unnamed first column.
result.to_csv("Best_Clustering_technique_topsis_result.csv")

# Best Number of Clusters
We take the average of each parameter (evaluation metric) for each cluster count in every clustering technique and apply TOPSIS to determine the best number of clusters.

In [18]:
# Load the per-cluster-count metric table from the working directory.
# Per the displayed output, the columns are: Clusters, Silhouette,
# Calinski-Harabasz, Davies-Bouldin.
df2=pd.read_csv("best_no_of_cluster.csv")
# Bare last expression: renders the loaded frame as the cell output.
df2

Unnamed: 0,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,Three,0.703267,212447.2455,8.8226
1,Four,0.6345,228691.2564,8.526
2,Five,0.553133,225869.7633,8.350033
3,Six,0.4542,224480.7834,8.286533


In [19]:
# One weight per criterion column, in column order
# (Silhouette, Calinski-Harabasz, Davies-Bouldin); all criteria count equally.
weights=[1,1,1]
# '+' = larger value is better, '-' = smaller value is better.
# Silhouette and Calinski-Harabasz are maximised; Davies-Bouldin is minimised.
impacts=['+','+','-']

In [20]:
# Score and rank the candidate cluster counts; topsis returns the input columns
# plus 'Performance Score' and 'Rank' (rank 1.0 = highest score).
result2=topsis(df2,weights,impacts)
# Bare last expression: renders the ranked table as the cell output.
result2

Unnamed: 0,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin,Performance Score,Rank
0,Three,0.703267,212447.2455,8.8226,0.813226,1.0
1,Four,0.6345,228691.2564,8.526,0.724988,2.0
2,Five,0.553133,225869.7633,8.350033,0.422995,3.0
3,Six,0.4542,224480.7834,8.286533,0.16503,4.0


So `Three clusters` is the best choice: it has the highest TOPSIS performance score (rank 1).

In [21]:
# Persist the ranked cluster-count table. With pandas defaults, to_csv also
# writes the row index as an unnamed first column.
result2.to_csv("best_no_of_cluster_topsis_result.csv")