# Basic Model

In [13]:
# sorting current working directory
import os
def working_directory():
    """fixed the issue

    Returns:
        str: the location of the ideal directory
    """
    return os.getcwd().replace("\\notebooks","")
os.chdir(working_directory())

## Libraries

In [2]:
#standard libraries
import os
import sys
import multiprocessing

#related third party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from pyclustertend import hopkins
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering
from sklearn.metrics import silhouette_score
import joblib

#Local application/library specific imports
from src.config import RAW_DATA_FILE, DATA_FOLDER, PROCESSED_DATA_FILE, MODEL_OUTPUT,PARAMETERS_OUTPUT

## Datasets

In [3]:
df = pd.read_csv(PROCESSED_DATA_FILE)
df

Unnamed: 0,gender,age,annual_income_(k$),spending_score_(1-100)
0,1,0.019231,0.000000,0.387755
1,1,0.057692,0.000000,0.816327
2,0,0.038462,0.008197,0.051020
3,0,0.096154,0.008197,0.775510
4,0,0.250000,0.016393,0.397959
...,...,...,...,...
195,0,0.326923,0.860656,0.795918
196,0,0.519231,0.909836,0.275510
197,1,0.269231,0.909836,0.744898
198,1,0.269231,1.000000,0.173469


## Data Exploration

#### What is the cluster tendency of the data

The method used is the **Hopkins test**, with the pyclustertend library implementation. The test has been used here to indicate the cluster tendency of the data by checking whether the data fits a uniformly random distribution.<!-- Might be good to go over if this is good way to check -->The results of the test give a score between 0 and 1. In this implementation of the Hopkins test, a score tending to zero indicates the data is not uniformly distributed, thus high cluster tendency; however a score that is too high can indicate no cluster tendency.

In [4]:
hopkins(df,df.shape[0])

0.2040219380454187

## Model functions 

In [7]:
class Models:
    def KMeansModel(X,n_clusters,centre :bool):
        """
        refactor later
        first edition: call model, predict model, save model, returns model prediction and optional model centres"""
        model = KMeans(n_clusters=n_clusters, init = 'k-means++')
        model.fit(X)
        kmeans_pred = model.predict(X)
        kmeans_centre = model.cluster_centers_
        joblib.dump(model,os.path.join(MODEL_OUTPUT,"kmeans_model.pkl"))
        if centre == True:
            return kmeans_pred, kmeans_centre
        if centre == False:
            return kmeans_pred

    def add_prediction_df(dataframe,cluster_labels,label: str):
        dataframe[label] = cluster_labels

    def agglomerativeClustering_model(data,n_clusters: int,distance_threshold = None,what_return : str ='predict'):
        model = AgglomerativeClustering(n_clusters=n_clusters, distance_threshold=distance_threshold)
        agg_clust_fit = model.fit(data)
        agg_clust_pred = model.fit_predict(data)
        if what_return == 'predict':
            return agg_clust_pred
        elif what_return == 'fit':
            return agg_clust_fit

    def affinity_prop_model(data):
        model = AffinityPropagation(damping=0.5,random_state=44)
        affinity_clust_pred = model.fit_predict(data)
        return affinity_clust_pred

    def spectral_clustering_model(data,n_clusters):
        model = SpectralClustering(n_clusters=n_clusters)
        spectral_clustering_pred = model.fit_predict(data)
        return spectral_clustering_pred


In [8]:
k_pred= Models.KMeansModel(df,4,centre=False)
ap_pred = Models.affinity_prop_model(df)
sc_pred = Models.spectral_clustering_model(data=df,n_clusters=4)
ac_pred = Models.agglomerativeClustering_model(data=df,n_clusters=4,distance_threshold=None,what_return='predict')

In [9]:
models = {
    'k_means' : k_pred,
    'affinity_prob' : ap_pred,
    'spectral_clustering' : sc_pred,
    'agglomerative_clustering' : ac_pred
}
df_2 = df.copy()
for model_name, model in models.items():
    Models.add_prediction_df(df_2,cluster_labels=model,label=model_name)


## Evaluation 

In [10]:
#number of clusters 
for column in df_2.iloc[:,4:]:
    print(f"{column}'s amount of unique values are: {df_2[column].nunique()}")

k_means's amount of unique values are: 4
affinity_prob's amount of unique values are: 9
spectral_clustering's amount of unique values are: 4
agglomerative_clustering's amount of unique values are: 4


In [11]:
# Average Within-cluster similarity score for all clusters
model_predictions = df_2.iloc[:,4:]
for _ in model_predictions:
    print(f"For {_}:\n the silhouette score is {silhouette_score(df_2,df_2[_].values)}")

For k_means:
 the silhouette score is 0.35705293413448674
For affinity_prob:
 the silhouette score is 0.7075817651279572
For spectral_clustering:
 the silhouette score is 0.2706135678828743
For agglomerative_clustering:
 the silhouette score is 0.36422432823566764


## Analysis of Clusters

In [12]:
for column in model_predictions:
    print(f"for prediction {column}\n")
    for x in range(df_2[column].nunique()):
        print(f"for group {x} there are {df_2[df_2[column] == x].shape[0]}")

for prediction k_means

for group 0 there are 57
for group 1 there are 48
for group 2 there are 55
for group 3 there are 40
for prediction affinity_prob

for group 0 there are 18
for group 1 there are 27
for group 2 there are 28
for group 3 there are 25
for group 4 there are 25
for group 5 there are 18
for group 6 there are 20
for group 7 there are 21
for group 8 there are 18
for prediction spectral_clustering

for group 0 there are 112
for group 1 there are 19
for group 2 there are 39
for group 3 there are 30
for prediction agglomerative_clustering

for group 0 there are 52
for group 1 there are 59
for group 2 there are 36
for group 3 there are 53
