# **Step 1 : Importing Libraries**
> We need the following libraries for the workflow:
> 
> 1. umicelrepo : fetch data from uci
> 2. pandas: data manipulation
> 3. numpy: mathematical operations
> 4. pycaret: analyzing results of various models


In [27]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from pycaret.clustering import *
from pycaret.datasets import get_data

# **Step 2 : Importing the required Dataset**
Here we are importing student perforance datset that predicts student performance in secondary education (high school).

In [28]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
student_performance = fetch_ucirepo(id=320) 
  
# data (as pandas dataframes) 
X = student_performance.data.features 
y = student_performance.data.targets 

In [29]:
# metadata 
print(student_performance.metadata) 

{'uci_id': 320, 'name': 'Student Performance', 'repository_url': 'https://archive.ics.uci.edu/dataset/320/student+performance', 'data_url': 'https://archive.ics.uci.edu/static/public/320/data.csv', 'abstract': 'Predict student performance in secondary education (high school). ', 'area': 'Social Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 649, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': ['Sex', 'Age', 'Other', 'Education Level', 'Occupation'], 'target_col': ['G1', 'G2', 'G3'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2008, 'last_updated': 'Fri Jan 05 2024', 'dataset_doi': '10.24432/C5TG7T', 'creators': ['Paulo Cortez'], 'intro_paper': {'ID': 360, 'type': 'NATIVE', 'title': 'Using data mining to predict secondary school student performance', 'authors': 'P. Cortez, A. M. G. Silva', 'venue': 'Proceedings of 5th Annual Future Business Technolo

In [30]:
# variable information 
print(student_performance.variables) 

          name     role         type      demographic  \
0       school  Feature  Categorical             None   
1          sex  Feature       Binary              Sex   
2          age  Feature      Integer              Age   
3      address  Feature  Categorical             None   
4      famsize  Feature  Categorical            Other   
5      Pstatus  Feature  Categorical            Other   
6         Medu  Feature      Integer  Education Level   
7         Fedu  Feature      Integer  Education Level   
8         Mjob  Feature  Categorical       Occupation   
9         Fjob  Feature  Categorical       Occupation   
10      reason  Feature  Categorical             None   
11    guardian  Feature  Categorical             None   
12  traveltime  Feature      Integer             None   
13   studytime  Feature      Integer             None   
14    failures  Feature      Integer             None   
15   schoolsup  Feature       Binary             None   
16      famsup  Feature       B

In [31]:
# variable information 
print(student_performance.variables) 

          name     role         type      demographic  \
0       school  Feature  Categorical             None   
1          sex  Feature       Binary              Sex   
2          age  Feature      Integer              Age   
3      address  Feature  Categorical             None   
4      famsize  Feature  Categorical            Other   
5      Pstatus  Feature  Categorical            Other   
6         Medu  Feature      Integer  Education Level   
7         Fedu  Feature      Integer  Education Level   
8         Mjob  Feature  Categorical       Occupation   
9         Fjob  Feature  Categorical       Occupation   
10      reason  Feature  Categorical             None   
11    guardian  Feature  Categorical             None   
12  traveltime  Feature      Integer             None   
13   studytime  Feature      Integer             None   
14    failures  Feature      Integer             None   
15   schoolsup  Feature       Binary             None   
16      famsup  Feature       B

In [32]:

X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,4
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,2
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,6
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,0
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,0


In [33]:

y.head()

Unnamed: 0,G1,G2,G3
0,0,11,11
1,9,11,11
2,12,13,12
3,14,14,14
4,11,13,13


# **Step 3 : Converting Categorical Features to Numerical values**
The binary categorical columns are converted in the form of 0s and 1s


In [34]:
X["school"]=X['school'].apply(lambda x: {True:1,False:0}[x=="GP"])
X["sex"]=X["sex"].apply(lambda x: {True: 1, False:0}[x=="M"])
X["address"]=X["address"].apply(lambda x: {True: 1, False:0}[x=="U"])
X["famsize"]=X["famsize"].apply(lambda x: {True: 1, False:0}[x=="GT3"])
X["Pstatus"]=X["Pstatus"].apply(lambda x: {True: 1, False:0}[x=="A"])
X["schoolsup"]=X['schoolsup'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["famsup"]=X['famsup'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["paid"]=X['paid'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["activities"]=X['activities'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["nursery"]=X['nursery'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["higher"]=X['higher'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["internet"]=X['internet'].apply(lambda x: {True:1,False:0}[x=="yes"])
X["romantic"]=X['romantic'].apply(lambda x: {True:1,False:0}[x=="yes"])


The nominal categorical columns are processed using dummy encoding in order to convert them to numerical columns

In [35]:
import pandas as pd
from pandas.api.types import CategoricalDtype

reason_type = CategoricalDtype(categories=['other', 'home', 'reputation', 'course'], ordered=True)

# Convert the 'reason' column to the categorical type with the specified order
X['reason'] = X['reason'].astype(reason_type)

# Apply dummy encoding to 'reason' and drop the 'other' category
X = pd.get_dummies(X, columns=['reason'], drop_first=True, dtype=int)

# Print the updated dataset
print(X.head())

  school  sex  age  address  famsize  Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP    0   18        1        1        1     4     4  at_home   teacher   
1     GP    0   17        1        1        0     1     1  at_home     other   
2     GP    0   15        1        0        0     1     1  at_home     other   
3     GP    0   15        1        1        0     4     2   health  services   
4     GP    0   16        1        1        0     3     3    other     other   

   ... famrel  freetime  goout  Dalc  Walc  health  absences  reason_home  \
0  ...      4         3      4     1     1       3         4            0   
1  ...      5         3      3     1     1       3         2            0   
2  ...      4         3      2     2     3       3         6            0   
3  ...      3         2      2     1     1       5         0            1   
4  ...      4         3      2     1     2       5         0            1   

   reason_reputation  reason_course  
0                 

In [36]:
# Display the column names of the updated dataset X
print(X.columns)


Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'guardian', 'traveltime', 'studytime', 'failures',
       'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
       'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
       'health', 'absences', 'reason_home', 'reason_reputation',
       'reason_course'],
      dtype='object')


In [37]:
guardian_type = CategoricalDtype(categories=['other', 'mother', 'father'], ordered=True)

# Convert the 'guardian' column to the categorical type with the specified order
X['guardian'] = X['guardian'].astype(guardian_type)

# Apply dummy encoding to 'guardian' and drop the 'other' category
X = pd.get_dummies(X, columns=['guardian'], drop_first=True, dtype=int)



In [38]:
job_type = CategoricalDtype(categories=['other', 'teacher', 'health','services'], ordered=True)

# Convert the 'reason' column to the categorical type with the specified order
X['Mjob'] = X['Mjob'].astype(job_type)

# Apply dummy encoding to 'reason' and drop the 'other' category
X = pd.get_dummies(X, columns=['Mjob'], drop_first=True, dtype=int)



In [39]:

# Convert the 'reason' column to the categorical type with the specified order
X['Fjob'] = X['Fjob'].astype(job_type)

# Apply dummy encoding to 'reason' and drop the 'other' category
X = pd.get_dummies(X, columns=['Fjob'], drop_first=True, dtype=int)



In [40]:
print(X.columns)

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
       'reason_home', 'reason_reputation', 'reason_course', 'guardian_mother',
       'guardian_father', 'Mjob_teacher', 'Mjob_health', 'Mjob_services',
       'Fjob_teacher', 'Fjob_health', 'Fjob_services'],
      dtype='object')


# **Step 4: Using Pycaret**
Applying the different Clustering techniques with different preprocessing techniques on the dataset  to analyse the effectiveness of the models in different scenarios

In [45]:
# Initialize variables
rows = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
Type = ['No Data Preprocessing', 'Using Normalization', 'Using Transform', 'Using PCA', 'Using T+N', 'Using T+N+PCA']
models = ['kmeans', 'hclust', 'meanshift']

# List of dictionaries containing the different arguments for setup function
setup_args = [
    {'verbose': False},
    {'normalize': True, 'normalize_method': 'zscore', 'verbose': False},
    {'transformation': True, 'transformation_method': 'yeo-johnson', 'verbose': False},
    {'pca': True, 'pca_method': 'linear', 'verbose': False},
    {'transformation': True, 'transformation_method': 'yeo-johnson', 'normalize': True, 'normalize_method': 'zscore', 'verbose': False},
    {'pca': True, 'pca_method': 'linear', 'normalize': True, 'normalize_method': 'zscore', 'transformation': True, 'transformation_method': 'yeo-johnson', 'verbose': False}
]

# Loop over models
# Loop over models
for k in models:
    data = {}  # Initialize the data dictionary for storing results for this model
    for j, setup_arg in enumerate(setup_args):
        for i in range(3):
            print(f"{k} with {Type[j]} and {i+3} clusters")

            # Setup and model creation
            ModelParameters = setup(data=X, **setup_arg)  # Ensure 'X' is properly preprocessed
            Model = create_model(k, num_clusters=i+3)
            metrics = get_metrics()  # Ensure this function returns the correct metrics structure

            # Validate that metrics contain expected rows and columns
            print("Metrics available:", metrics.index)

            # Check if there are at least 2 unique clusters before calculating the silhouette score
            unique_labels = set(Model.labels_)
            if len(unique_labels) < 2:
                print(f"Skipping silhouette score calculation for {k} with {Type[j]} and {i+3} clusters (only 1 cluster found)")
                continue  # Skip this iteration if there are less than 2 unique clusters

            # Extracting silhouette score, Calinski-Harabasz score, Davies-Bouldin score
            silhouette_score_function = metrics.loc['silhouette', 'Score Function']
            silhouette_score = silhouette_score_function(X, Model.labels_)

            chs_score_function = metrics.loc['chs', 'Score Function']
            Calinski_Harabasz_score = chs_score_function(X, Model.labels_)

            db_score_function = metrics.loc['db', 'Score Function']
            Davies_Bouldin_score = db_score_function(X, Model.labels_)

            # Store the metrics in the data dictionary
            data[(Type[j], f'c={i+3}')] = [silhouette_score, Calinski_Harabasz_score, Davies_Bouldin_score]

    # Store results in appropriate DataFrame based on model
    if k == 'kmeans':
        kmeans_metrics = pd.DataFrame(data=data, index=rows)
    elif k == 'hclust':
        hclust_metrics = pd.DataFrame(data=data, index=rows)
    else:
        meanshift_metrics = pd.DataFrame(data=data, index=rows)

    # Optionally print out the metrics for each model
    print(f"{k} Metrics:")
    print(pd.DataFrame(data=data, index=rows))



kmeans with No Data Preprocessing and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2091,272.5678,1.5616,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with No Data Preprocessing and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1626,219.3775,1.7813,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with No Data Preprocessing and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1719,173.9047,1.8366,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Normalization and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0485,34.2371,3.6015,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Normalization and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0525,31.6841,3.4156,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Normalization and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0632,29.9035,3.1304,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Transform and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4268,134245.5534,0.8383,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Transform and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4988,144218.9198,0.7082,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using Transform and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5109,138704.7639,0.7008,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2091,272.5678,1.5616,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1626,219.3775,1.7813,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1106,182.4184,2.1252,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0467,33.7475,3.5866,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0475,31.2032,3.4356,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0395,26.3295,3.5117,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N+PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.064,33.1121,3.3433,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N+PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0477,31.4222,3.5349,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans with Using T+N+PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0542,28.3764,3.2991,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
kmeans Metrics:
                  No Data Preprocessing                          \
                                    c=3         c=4         c=5   
Silhouette                     0.209082    0.162554    0.171860   
Calinski-Harabasz            272.567827  219.377509  173.904653   
Davies-Bouldin                 1.561627    1.781254    1.836583   

                  Using Normalization                       Using Transform  \
                                  c=3        c=4        c=5             c=3   
Silhouette                   0.033764   0.005435  -0.035688       -0.010140   
Calinski-Harabasz           34.721304  15.899657  10.711338        8.435235   
Davies-Bouldin               3.672416   4.848572   5.250046        7.490364   

                                        Using PCA                          \
                        c=4       c=5         c=3         c=4         c=5  

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2253,249.8886,1.4329,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with No Data Preprocessing and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.118,193.366,1.9266,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with No Data Preprocessing and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0995,163.4075,2.1523,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Normalization and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0366,26.4166,4.3943,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Normalization and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0469,25.4194,3.6548,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Normalization and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0496,23.9035,3.3135,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Transform and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4666,164605.5586,0.6405,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Transform and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4988,144218.9198,0.7082,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using Transform and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5033,139781.3619,0.6978,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2962,233.407,1.2577,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1224,195.9503,2.0143,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1132,165.2781,2.0681,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0378,26.3996,4.1929,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0477,25.4067,3.5085,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.049,25.1058,3.3844,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N+PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0378,26.3996,4.1929,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N+PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0477,25.4067,3.5085,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust with Using T+N+PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.049,25.1058,3.3844,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
hclust Metrics:
                  No Data Preprocessing                          \
                                    c=3         c=4         c=5   
Silhouette                     0.225318    0.117983    0.099503   
Calinski-Harabasz            249.888575  193.366024  163.407477   
Davies-Bouldin                 1.432917    1.926597    2.152307   

                  Using Normalization                     Using Transform  \
                                  c=3       c=4       c=5             c=3   
Silhouette                  -0.014050 -0.037549 -0.043147        0.001166   
Calinski-Harabasz           12.130232  8.629229  6.971735       10.628826   
Davies-Bouldin               6.268182  6.051944  6.097963        6.016656   

                                        Using PCA                          \
                        c=4       c=5         c=3         c=4         c=5   
Silhouet

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with No Data Preprocessing and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with No Data Preprocessing and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Normalization and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.243,2.46,0.6274,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Normalization and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.243,2.46,0.6274,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Normalization and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.243,2.46,0.6274,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Transform and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.978,212544.9706,0.0324,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Transform and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.978,212544.9706,0.0324,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using Transform and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.978,212544.9706,0.0324,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4691,138.6204,0.7725,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
meanshift with Using T+N and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N and 3 clusters (only 1 cluster found)
meanshift with Using T+N and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N and 4 clusters (only 1 cluster found)
meanshift with Using T+N and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N and 5 clusters (only 1 cluster found)
meanshift with Using T+N+PCA and 3 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N+PCA and 3 clusters (only 1 cluster found)
meanshift with Using T+N+PCA and 4 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N+PCA and 4 clusters (only 1 cluster found)
meanshift with Using T+N+PCA and 5 clusters


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Metrics available: Index(['silhouette', 'chs', 'db', 'hs', 'ari', 'cs'], dtype='object', name='ID')
Skipping silhouette score calculation for meanshift with Using T+N+PCA and 5 clusters (only 1 cluster found)
meanshift Metrics:
                  No Data Preprocessing                          \
                                    c=3         c=4         c=5   
Silhouette                     0.469081    0.469081    0.469081   
Calinski-Harabasz            138.620436  138.620436  138.620436   
Davies-Bouldin                 0.772487    0.772487    0.772487   

                  Using Normalization                     Using Transform  \
                                  c=3       c=4       c=5             c=3   
Silhouette                   0.097380  0.097380  0.097380        0.120314   
Calinski-Harabasz            1.251107  1.251107  1.251107       13.465677   
Davies-Bouldin               0.811828  0.811828  0.811828        4.202556   

                                          Using PC

# **Step 5: Analysing the calculated metrics**

In [46]:
kmeans_metrics

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.209082,0.162554,0.17186,0.033764,0.005435,-0.035688,-0.01014,-0.010619,-0.022878,0.209082,0.162554,0.110563,0.032636,0.002127,-0.023383,0.025603,0.005163,-0.005683
Calinski-Harabasz,272.567827,219.377509,173.904653,34.721304,15.899657,10.711338,8.435235,8.091245,8.438423,272.567827,219.377509,182.418376,33.90499,18.574704,22.494553,13.935831,14.425908,12.086487
Davies-Bouldin,1.561627,1.781254,1.836583,3.672416,4.848572,5.250046,7.490364,6.973082,7.599781,1.561627,1.781254,2.125219,3.680329,4.335931,3.847651,5.392075,4.837525,5.655301


In [47]:
hclust_metrics

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA,Using T+N,Using T+N,Using T+N,Using T+N+PCA,Using T+N+PCA,Using T+N+PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.225318,0.117983,0.099503,-0.01405,-0.037549,-0.043147,0.001166,-0.010619,-0.016562,0.296192,0.122439,0.113228,-0.009469,-0.033155,-0.048531,-0.009469,-0.033155,-0.048531
Calinski-Harabasz,249.888575,193.366024,163.407477,12.130232,8.629229,6.971735,10.628826,8.091245,8.030579,233.406957,195.950298,165.278057,13.103972,9.271203,9.102733,13.103972,9.271203,9.102733
Davies-Bouldin,1.432917,1.926597,2.152307,6.268182,6.051944,6.097963,6.016656,6.973082,7.492652,1.257722,2.014328,2.068139,5.649897,5.568101,5.564344,5.649897,5.568101,5.564344


In [48]:
meanshift_metrics

Unnamed: 0_level_0,No Data Preprocessing,No Data Preprocessing,No Data Preprocessing,Using Normalization,Using Normalization,Using Normalization,Using Transform,Using Transform,Using Transform,Using PCA,Using PCA,Using PCA
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.469081,0.469081,0.469081,0.09738,0.09738,0.09738,0.120314,0.120314,0.120314,0.469081,0.469081,0.469081
Calinski-Harabasz,138.620436,138.620436,138.620436,1.251107,1.251107,1.251107,13.465677,13.465677,13.465677,138.620436,138.620436,138.620436
Davies-Bouldin,0.772487,0.772487,0.772487,0.811828,0.811828,0.811828,4.202556,4.202556,4.202556,0.772487,0.772487,0.772487


Hence, upon analysing the above metrics we can clearly see the performance of diffrent processing techniques in different cases on the same dataset along with the impact of varying number of clusters. 
After Analysing, we comprehend that Mean shift is the best algorithm amongst all.