## Prerequisites

In [33]:
import os

# data manipulation
import numpy as np
import pandas as pd

# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# machine learning models
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, AgglomerativeClustering, MeanShift, AffinityPropagation
from sklearn.mixture import GaussianMixture

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# evaluation metrics
from tabulate import tabulate
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

## Data Ingestion

In [34]:
# Load the tab-separated campaign data through a relative path rather than by
# hopping directories with os.chdir(): chdir mutates process-wide state, and a
# failed read between the hops would leave the kernel in the wrong directory
# for every later cell.
data_path = os.path.join('..', 'data', 'marketing_campaign.csv')
dataframe = pd.read_csv(data_path, sep='\t')

## Data Transformation

### Data Handling

#### Feature Separation

In [35]:
# Remove the 'ID' column (presumably a row identifier with no clustering
# signal -- confirm against the data dictionary). Rebinding the name with
# errors='ignore' keeps the cell safe to re-run: the in-place variant raised
# KeyError on a second execution because the column was already gone.
dataframe = dataframe.drop(columns='ID', errors='ignore')

#### Performing Feature Engineering on Dt_Customer

In [36]:
# Replace the 'Dt_Customer' date string (day-month-year) with three numeric
# columns holding its day, month and year components.
#
# The guard makes the cell idempotent: once the derived columns exist the work
# is already done, so a re-run is a no-op instead of failing on the dropped
# source column or on a duplicate insert. If neither the derived columns nor
# 'Dt_Customer' are present, the KeyError still surfaces.
derived_date_columns = ('Dt_Customer_D', 'Dt_Customer_M', 'Dt_Customer_Y')

if not set(derived_date_columns).issubset(dataframe.columns):
    # Bracket access instead of attribute access: attribute assignment only
    # works for columns that already exist and is easy to misread.
    dataframe['Dt_Customer'] = pd.to_datetime(dataframe['Dt_Customer'],
                                              format='%d-%m-%Y')

    day_component = dataframe['Dt_Customer'].dt.day
    month_component = dataframe['Dt_Customer'].dt.month
    year_component = dataframe['Dt_Customer'].dt.year

    # Insert at positions 8, 9 and 10 (unchanged from the original cell) so the
    # column order seen by later cells stays the same.
    components = (day_component, month_component, year_component)
    for position, (column_name, values) in enumerate(
            zip(derived_date_columns, components), start=8):
        dataframe.insert(loc=position, column=column_name, value=values)

    dataframe = dataframe.drop(columns='Dt_Customer')

In [37]:
# Partition the column labels by dtype: object-dtype columns are handled as
# categorical features, every other column as numeric.
C_columns = dataframe.select_dtypes(include=['O']).columns
N_columns = dataframe.select_dtypes(exclude=['O']).columns

#### Imputing NaN Values in the Income Column

In [38]:
# Fill missing numeric values with the per-column median. Every column listed
# in N_columns goes through the imputer; the result is written back over the
# same columns.
imputer = SimpleImputer(strategy='median')
imputer.fit(dataframe[N_columns])
dataframe[N_columns] = imputer.transform(dataframe[N_columns])

#### Transforming Separated Features

In [39]:
# One transformer per feature group, both with library defaults.
C_transformer = OneHotEncoder()   # applied to the categorical columns
N_transformer = StandardScaler()  # applied to the numeric columns

In [40]:
# Route each column group to its transformer. The output places the one-hot
# block first and the scaled numeric block second, following list order.
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', C_transformer, C_columns),
        ('StandardScaler', N_transformer, N_columns),
    ]
)

In [41]:
# Fit the preprocessing pipeline on the full dataset and apply it in one step.
# NOTE(review): despite the name, the result is whatever ColumnTransformer
# returns (an array / matrix by default), not a pandas DataFrame.
scaled_dataframe = preprocessor.fit_transform(X=dataframe)

## Model Selection

### Evaluation Metrics

In [49]:
def evaluate_model(model_name, scaled_dataframe, cluster_labels):
    """Score a clustering and render the scores as a text table.

    Parameters
    ----------
    model_name : str
        Display name of the model. Currently unused; kept so existing callers
        do not have to change.
    scaled_dataframe : array-like
        The preprocessed feature matrix the labels were produced from.
    cluster_labels : array-like
        One cluster label per row of ``scaled_dataframe``.

    Returns
    -------
    str
        A ``tabulate`` grid with two rows: the silhouette score (``SIL``) and
        the Calinski-Harabasz score (``CHS``). Both are reported as ``nan``
        when the labelling is degenerate (see below).
    """
    n_samples = len(cluster_labels)
    n_clusters = len(np.unique(cluster_labels))

    # Both metrics are only defined for 2 <= n_clusters <= n_samples - 1 and
    # raise ValueError otherwise. A model that collapses everything into one
    # cluster (or gives every sample its own) would abort the whole model
    # comparison, so report NaN for it instead.
    if 2 <= n_clusters <= n_samples - 1:
        SIL = silhouette_score(scaled_dataframe, cluster_labels)
        CHS = calinski_harabasz_score(scaled_dataframe, cluster_labels)
    else:
        SIL = CHS = float('nan')

    table = [['SIL', SIL],
             ['CHS', CHS]]

    evaluation = tabulate(table,
                          headers=['METRIC', 'SCORE'],
                          tablefmt='grid')
    return evaluation

In [50]:
def train_and_evaluate_model(model_object, model_name, scaled_dataframe):
    """Fit a clustering model, score its labelling and print the report.

    Parameters
    ----------
    model_object : estimator
        Object exposing ``fit_predict``; it is fitted on ``scaled_dataframe``.
    model_name : str
        Display name printed above the report and forwarded to
        ``evaluate_model``.
    scaled_dataframe : array-like
        Preprocessed feature matrix to cluster.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    labels = model_object.fit_predict(scaled_dataframe)
    report = evaluate_model(model_name, scaled_dataframe, labels)
    print('{}\n{}'.format(model_name, report))

In [51]:
# Fixed seed for the estimators that use randomness, so the printed scores are
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate clusterers keyed by the display name used in the report. Apart from
# the seed, every estimator keeps its library defaults.
# NOTE(review): DBSCAN and the Gaussian Mixture Model were disabled by the
# original author; the reason is not recorded here -- confirm before re-enabling.
clustering_models = {
    'KMeans': KMeans(random_state=RANDOM_STATE),
    'Agglomerative Clustering': AgglomerativeClustering(),
    # 'DBSCAN': DBSCAN(),
    'MeanShift': MeanShift(),
    'Affinity Propagation': AffinityPropagation(random_state=RANDOM_STATE),
    # 'Gaussian Mixture Model': GaussianMixture()
}

In [52]:
# Fit and score every candidate on the same preprocessed matrix; each call
# prints its own report.
for model_name, model_object in clustering_models.items():
    train_and_evaluate_model(model_object=model_object,
                             model_name=model_name,
                             scaled_dataframe=scaled_dataframe)

KMeans
+----------+------------+
| METRIC   |      SCORE |
| SIL      |   0.111105 |
+----------+------------+
| CHS      | 165.025    |
+----------+------------+
Agglomerative Clustering
+----------+------------+
| METRIC   |      SCORE |
| SIL      |   0.188186 |
+----------+------------+
| CHS      | 430.983    |
+----------+------------+
MeanShift
+----------+-----------+
| METRIC   |     SCORE |
| SIL      |  0.193383 |
+----------+-----------+
| CHS      | 27.723    |
+----------+-----------+
Affinity Propagation
+----------+------------+
| METRIC   |      SCORE |
| SIL      |  0.0814834 |
+----------+------------+
| CHS      | 31.4999    |
+----------+------------+




Currently, Agglomerative Clustering performs best: it has by far the highest CHS (430.98) and a SIL (0.188) that is close to the best observed value (MeanShift, 0.193).