In [20]:
import pandas as pd
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from sklearn.cluster import KMeans
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import DistanceMetric

In [21]:
# Read in Dataframe
survey_df = pd.read_excel("kawasaki_engine_naming_survey_08122020.xlsx"
                       ,sheet_name="bxmGLT")

In [56]:
# Define grouping columns 
# Easies way: Rearange the orders of the columns a
grouping_df=survey_df.iloc[:,1:17]
clustering_df=survey_df.iloc[:,18:-6]
clustering_df = clustering_df.fillna(0)
# If there's numerical columns. List in a list

In [9]:
# Complete all the data manipulation steps


In [14]:
def perform_kmeans(X, n_clust):
    """
    This function handles K means clustering. 
    """
    model = KMeans(n_clust)
    model.fit(X)
    cluster_labels = model.predict(X)
    cent = model.cluster_centers_
    return (cluster_labels, cent)


In [27]:
def plot_hist_with_cluster(df,cluster_num,targeting_cols,is_percent,cluster_col,width=875, height=250):
    hist = "percent" if is_percent else ""
    for i in targeting_cols:
        fig = px.histogram(df, x=i,facet_col=cluster_col,histnorm=hist,
           category_orders={cluster_col:[i for i in range(cluster_num+1)]},
                 width=width, height=height)
        fig.update_layout(
            font=dict(
                family="Courier New, monospace",
                size=7,
                color="RebeccaPurple"
            )
        )
        fig.show()


In [26]:
def calculate_WSS(points, kmax):
    """
    Calculate the Within-Cluster-Sum of Squared Errors (WSS) 
    for different values of k, and choose the k for which WSS 
    becomes first starts to diminish. In the plot of WSS-versus-k
    , this is visible as an elbow.
    """
    sse = []
    x_axis = []
    for k in range(2, kmax+1):
        x_axis.append(k)
        kmeans = KMeans(n_clusters = k).fit(points)
        centroids = kmeans.cluster_centers_
        pred_clusters = kmeans.predict(points)
        curr_sse = 0
    
    # calculate square of Euclidean distance of each point from its cluster center and add to current WSS
        for i in range(len(points)):
            curr_center = centroids[pred_clusters[i]]
            curr_sse += (points[i, 0] - curr_center[0]) ** 2 + (points[i, 1] - curr_center[1]) ** 2
        sse.append(curr_sse)
    return x_axis,sse

def calculate_sill(points,kmax):
    sil = []
    x_axis = []
# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
    for k in range(2, kmax+1):
        x_axis.append(k)
        kmeans = KMeans(n_clusters = k).fit(points)
        labels = kmeans.labels_
        sil.append(silhouette_score(points, labels, metric = 'euclidean'))
    return x_axis,sil

def calculate_sill_ag(distance_matrix,kmax):
    sil = []
    x_axis = []
# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
    for k in range(2, kmax+1):
        x_axis.append(k)
        model = AgglomerativeClustering(affinity='precomputed', n_clusters=k, linkage='complete').fit(distance_matrix)
        labels = model.labels_
        sil.append(silhouette_score(distance_matrix, labels, metric = 'euclidean'))
    return x_axis,sil

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

https://datascience.stackexchange.com/questions/8681/clustering-for-mixed-numeric-and-nominal-discrete-data

Gower with Agglomertive


Onehot encoding

In [58]:
def gower_distance(X,numeric_cols=[]):
    """
    This function expects a pandas dataframe as input
    The data frame is to contain the features along the columns. Based on these features a
    distance matrix will be returned which will contain the pairwise gower distance between the rows
    All variables of object type will be treated as nominal variables and the others will be treated as 
    numeric variables.
    Distance metrics used for:
    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_distances = []
    epsilon = 10**(-8)
    for i in range(X.shape[1]):
        feature = X.iloc[:,[i]]
        if feature.columns[0] not in numeric_cols:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / max(np.ptp(feature.values), epsilon)

        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)

In [61]:
print(clustering_df.shape)

(334, 28)


In [62]:
# Optional step to determine what's the best combination of columns
import itertools
from collections import Counter

result_dict={}
# for L in range(8, len(all_cols)+1):
for L in range(26, 29):
    temp_dict = {}
    temp_max = 0.0
    for subset in itertools.combinations(clustering_df.columns, L):
        distance_matrix = gower_distance(clustering_df[list(subset)],numeric_cols)
        distance_matrix[np.isnan(distance_matrix)] = 0
        x_axis,sil = calculate_sill_ag(distance_matrix,5)
        max_sil = max(sil)
        max_cluster = sil.index(max_sil)+2
        if max_sil > temp_max:
            temp_max = max_sil
            print(temp_max)
            temp_dict[subset] = [max_sil,max_cluster]
        else:
            continue
    result_dict[L] = temp_dict
    print("-----Done with {}".format(L))

0.2114400149124195
0.30430683086169386
0.3188229674640173
0.3198506610610669
0.3261737812034936
0.32642170872102244


KeyboardInterrupt: 

In [63]:
# Make the distance matrix for clustering
distance_matrix = gower_distance(clustering_df)
distance_matrix[np.isnan(distance_matrix)] = 0

In [64]:
# Plot the sill score to determine the number of clusters
x_axis,sil = calculate_sill_ag(distance_matrix,10)
px.line(x=x_axis,y=sil)

In [436]:
# For plotting the results
k=2
model = AgglomerativeClustering(affinity='precomputed', n_clusters=k, linkage='complete').fit(distance_matrix)
survey_df["ag_cluster"] = model.labels_
plot_hist_with_cluster(action_df,2,action_cols,False,"ag_cluster")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [572]:
import numpy as np

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):

    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)