In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Clustering Algorithms

## Table of Contents

* Making up data (the code needs to be run too, but don't worry about the code here)
* [Intro Data EDA](#Intro-Data-EDA)
    * `TODO: write high level notes here`
    
    
* [KMeans](#KMeans)
    * `TODO: write high level notes here`
    
    
* [kmedoids](#kmedoids)
    * `TODO: write high level notes here`
    
    
* [AgglomerativeClustering](#AgglomerativeClustering)
    * `TODO: write high level notes here`
    
    
* [DBSCAN](#DBSCAN)
    * `TODO: write high level notes here`
    
    
* [GaussianMixture](#GaussianMixture)
    * `TODO: write high level notes here`
    
    
* [MeanShift](#MeanShift)
    * `TODO: write high level notes here`
    
    
* [AffinityPropagation](#AffinityPropagation)
    * `TODO: write high level notes here`
    
    

In [2]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import pdist, squareform

# !pip install pyclustering
from pyclustering.cluster.kmedoids import kmedoids

from sklearn.cluster import (
    KMeans,
    DBSCAN,
    AgglomerativeClustering,
    AffinityPropagation,
    MeanShift,
)
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
from sklearn.preprocessing import scale

from gower import gower_matrix

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

<IPython.core.display.Javascript object>

In [4]:
def plot_cluster_results(X, labels):
    """Plot 2d data twice: once unlabeled and once colored by cluster label.

    Parameters
    ----------
    X : array-like or DataFrame
        Data to plot. Only the first 2 columns are used; they are drawn as
        ``x1`` and ``x2``.
    labels : array-like
        One cluster label per row of ``X``, in row order.

    Returns
    -------
    None
        Draws onto a new 1x2 matplotlib figure ("input data" on the left,
        "labeled data" on the right).
    """
    # Work on a private 2-column copy so the caller's data is never modified
    # and the column writes below do not hit a slice of another frame.
    df_x = pd.DataFrame(X).iloc[:, :2].copy()
    df_x.columns = ["x1", "x2"]

    df_x["label"] = labels
    df_x = df_x.sort_values("label")
    # Quote the labels so seaborn treats them as categories, not numbers
    df_x["label"] = "'" + df_x["label"].astype(str) + "'"

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))

    # x / y must be passed by keyword: seaborn >= 0.12 rejects positional x, y
    sns.scatterplot(x="x1", y="x2", data=df_x, ax=axes[0])
    sns.scatterplot(x="x1", y="x2", hue="label", data=df_x, ax=axes[1])
    axes[0].set_title("input data")
    axes[1].set_title("labeled data")
    plt.tight_layout()

<IPython.core.display.Javascript object>

## Making up example data

[back to top](#Clustering-Algorithms)

* `df_1` - numbers only
* `df_2` - numbers only
* `df_3` - yes/no survey style data
* `df_4` - nominal style data with all features having 3 or more category levels
* `df_5` - `df_1` + `df_3` + `df_4`

Click the link to scroll down to [Intro Data EDA](#Intro-Data-EDA).

----

In [5]:
# Number of rows generated for every made-up dataset below
n = 5000

<IPython.core.display.Javascript object>

Make up some purely numeric data

In [6]:
def make_df_1_2():
    """Build two standardized blob datasets with features ``x1`` and ``x2``.

    Each dataset has ``n`` rows drawn around 5 centers; the first uses seed 4
    and the second seed 1.

    Returns
    -------
    tuple of DataFrame
        ``(df_1, df_2)``.
    """
    feature_names = ["x1", "x2"]

    def _scaled_blobs(seed):
        # Reseed numpy's global RNG right before generating, so the blobs
        # are the same on every call
        np.random.seed(seed)
        points = make_blobs(n, centers=5)[0]
        return pd.DataFrame(scale(points), columns=feature_names)

    frames = [_scaled_blobs(seed) for seed in (4, 1)]
    return frames[0], frames[1]

<IPython.core.display.Javascript object>

Make up some binary data

In [7]:
def make_df_3():
    """Build made-up yes/no survey answers ``q1``..``q5`` for ``n`` respondents.

    Every question starts as a uniform random 0/1 draw. Two dependencies are
    then layered on top:

    * where ``q1`` is 1, ``q5`` is redrawn to be 1 only 10% of the time
    * where ``q2`` is 1, ``q4`` is redrawn to be 1 90% of the time

    Returns
    -------
    DataFrame
        ``n`` rows of 0/1 values in columns ``q1``..``q5``.
    """
    np.random.seed(42)
    answers = [0, 1]

    # Independent draws for each question, generated in question order
    df_3 = pd.DataFrame(
        {f"q{num}": np.random.choice(answers, size=n) for num in range(1, 6)}
    )

    # (driver question, dependent question, [P(0), P(1)] for the dependent)
    dependencies = (
        ("q1", "q5", [0.9, 0.1]),
        ("q2", "q4", [0.1, 0.9]),
    )
    for driver, dependent, probs in dependencies:
        answered_yes = df_3[driver] == 1
        df_3.loc[answered_yes, dependent] = np.random.choice(
            answers, size=sum(answered_yes), p=probs
        )

    return df_3

<IPython.core.display.Javascript object>

Make up some categorical data

In [8]:
def make_df_4():
    """Build made-up categorical data with ``n`` rows.

    ``continent`` and ``dept`` are uniform random draws. ``company`` depends
    on the continent: Africa/Asia rows draw Innovato/Rop/UpDog with
    probabilities 0.5/0.3/0.2, while Europe/South America rows use
    0.2/0.3/0.5.

    Returns
    -------
    DataFrame
        Columns ``continent``, ``dept`` and ``company``.
    """
    np.random.seed(42)

    continents = [
        "Africa",
        "Asia",
        "Europe",
        "South America",
    ]

    dept = ["Accounting", "Sales", "IT", "Marketing"]
    companies = ["Innovato", "Rop", "UpDog"]

    df_4 = pd.DataFrame(
        {
            "continent": np.random.choice(continents, size=n),
            "dept": np.random.choice(dept, size=n),
        }
    )
    # Create the placeholder as an object column. Filling a float NaN column
    # with strings relies on an implicit dtype upcast that pandas deprecated.
    df_4["company"] = pd.Series(np.nan, index=df_4.index, dtype=object)

    # (continents sharing a company mix, [P(Innovato), P(Rop), P(UpDog)])
    company_mix = (
        (continents[:2], [0.5, 0.3, 0.2]),
        (continents[2:], [0.2, 0.3, 0.5]),
    )
    for group, probs in company_mix:
        continent_filter = df_4["continent"].isin(group)
        df_4.loc[continent_filter, "company"] = np.random.choice(
            companies, p=probs, size=int(continent_filter.sum())
        )

    return df_4

<IPython.core.display.Javascript object>

Combine made up data to have mixed datatypes.

In [9]:
def make_df_5():
    """Combine the numeric, binary and categorical made-up data column-wise.

    Returns
    -------
    DataFrame
        The columns of ``df_1``, then ``df_3``, then ``df_4``, side by side.
    """
    # Keep the build order (numeric, binary, categorical): every builder
    # reseeds numpy's global RNG, so the order decides the final RNG state.
    numeric_part = make_df_1_2()[0]
    parts = (numeric_part, make_df_3(), make_df_4())
    return pd.concat(parts, axis=1)

<IPython.core.display.Javascript object>

## Intro Data EDA

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1` and `df_2` each consist of 2 numeric features.

In [None]:
# Show both numeric datasets side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# x / y must be passed by keyword: seaborn >= 0.12 rejects positional x, y
sns.scatterplot(x="x1", y="x2", data=df_1, ax=axes[0])
sns.scatterplot(x="x1", y="x2", data=df_2, ax=axes[1])
axes[0].set_title("df_1")
axes[1].set_title("df_2")
plt.tight_layout()
plt.show()

`df_3` is binary data representing True/False responses to a survey

In [None]:
# Co-occurrence matrix of the survey answers.
#
# Read each cell as: how many times the column question was answered True
# at the same time as the row question was answered True.
# The diagonal is how many times each question was answered True.
#
# Examples:
# * q4 was the question most likely to be answered as True.
# * q2 and q4 were commonly answered as True together:
#   2222 times (44% of responses) is the highest off-diagonal number.
# * q1 and q5 were rarely answered as True together:
#   246 times (5% of responses) is the lowest number.

co_occur = df_3.T @ df_3
co_occur_percent = co_occur / len(df_3)

print("Co-occurances")
display(co_occur.style.background_gradient(axis=None))

print("Co-occurances as a percent of responses")
display(co_occur_percent.style.background_gradient(axis=None))

`df_4` is categorical data

In [None]:
# Co-occurrence counts again, this time for the categorical features:
# rows are (company, continent) pairs, columns are departments.
#
# Innovato seems to be more associated with Africa and Asia,
# while UpDog is associated with Europe and South America.

crosstab = pd.crosstab(
    index=[df_4["company"], df_4["continent"]],
    columns=df_4["dept"],
)
crosstab.style.background_gradient(axis=None)

`df_5` is a combination of `df_1`, `df_3`, and `df_4`.

In [None]:
df_5.head()

## KMeans

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
# Pin the seed so the fitted centers and the label numbering do not depend
# on whatever state numpy's global RNG was left in by earlier cells
clst = KMeans(n_clusters=5, random_state=42)
clst.fit(df_1)

In [None]:
clst.cluster_centers_

In [None]:
labels = clst.labels_

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
# Pin the seed so the fitted centers and the label numbering do not depend
# on whatever state numpy's global RNG was left in by earlier cells
clst = KMeans(n_clusters=5, random_state=42)
clst.fit(df_2)

In [None]:
clst.cluster_centers_

In [None]:
labels = clst.labels_

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

In [None]:
# Nope

`df_4`

In [None]:
df_4.head(3)

In [None]:
# Nope

`df_5`

In [None]:
df_5.head(3)

In [None]:
# Nope

## kmedoids

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
labels = _____

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
labels = _____

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

`df_4`

In [None]:
df_4.head(3)

`df_5`

In [None]:
df_5.head(3)

## AgglomerativeClustering

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
labels = _____

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
labels = _____

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

`df_4`

In [None]:
df_4.head(3)

`df_5`

In [None]:
df_5.head(3)

## DBSCAN

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
labels = _____

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
labels = _____

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

`df_4`

In [None]:
df_4.head(3)

`df_5`

In [None]:
df_5.head(3)

## GaussianMixture

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
labels = _____

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
labels = _____

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

`df_4`

In [None]:
df_4.head(3)

`df_5`

In [None]:
df_5.head(3)

## MeanShift

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
labels = _____

plot_cluster_results(df_1, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
labels = _____

plot_cluster_results(df_2, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

`df_4`

In [None]:
df_4.head(3)

`df_5`

In [None]:
df_5.head(3)

## AffinityPropagation

[back to top](#Clustering-Algorithms)

In [None]:
# reset data for section
df_1, df_2 = make_df_1_2()
df_3 = make_df_3()
df_4 = make_df_4()
df_5 = make_df_5()

`df_1`

In [None]:
df_1.head(3)

In [None]:
# Downsample first: fitting on all 5000 records is slow
df_sample = df_1.sample(n=500, random_state=42)

clst = AffinityPropagation(random_state=42)
clst.fit(X=df_sample)

In [None]:
df_sample.iloc[clst.cluster_centers_indices_, :]

In [None]:
labels = clst.labels_

plot_cluster_results(df_sample, labels)
plt.legend().remove()
plt.show()

`df_2`

In [None]:
df_2.head(3)

In [None]:
# Downsample first: fitting on all 5000 records is slow
df_sample = df_2.sample(n=500, random_state=42)

clst = AffinityPropagation(random_state=42)
clst.fit(X=df_sample)

In [None]:
df_sample.iloc[clst.cluster_centers_indices_, :]

In [None]:
labels = clst.labels_

plot_cluster_results(df_sample, labels)
plt.legend().remove()
plt.show()

`df_3`

In [None]:
df_3.head(3)

In [None]:
# Downsampling because fitting on all 5000 records is slow.
# The algorithm could not be made to converge with samples of 500 or 150
# rows, hence only 100.
df_sample = df_3.sample(n=100, random_state=42)

# Pairwise Hamming distances, negated so that larger means more similar
dist_mat = squareform(pdist(df_sample, metric="hamming"))
affinity_mat = np.negative(dist_mat)

clst = AffinityPropagation(
    affinity="precomputed",
    max_iter=10000,
    preference=3 * affinity_mat.min(),
    random_state=42,
)
clst.fit(affinity_mat)

In [None]:
df_sample.iloc[clst.cluster_centers_indices_, :]

In [None]:
# Cluster label assigned to each row of df_sample
labels = clst.labels_
uniq_labels = sorted(np.unique(labels))

# One co-occurrence table of True answers per cluster
for label in uniq_labels:
    df_i = df_sample[labels == label]

    print(f"Cluster {label} co-occuring True answers")
    co_occur = df_i.T.dot(df_i)
    # axis=None shades every cell against the whole table, the same way the
    # other co-occurrence tables in this notebook are styled
    display(co_occur.style.background_gradient(axis=None))

`df_4`

In [None]:
df_4.head(3)

In [None]:
# Downsampling because fitting on all 5000 records is slow.
# The algorithm could not be made to converge with samples of 500 or 150
# rows, hence only 100.
df_sample = df_4.sample(n=100, random_state=42)
# One-hot encode the categories so a binary distance can be computed
df_sample_dummies = pd.get_dummies(df_sample)

# Pairwise Dice distances, negated so that larger means more similar
dist_mat = squareform(pdist(df_sample_dummies, metric="dice"))
affinity_mat = np.negative(dist_mat)

clst = AffinityPropagation(
    affinity="precomputed",
    preference=4 * affinity_mat.min(),
    random_state=42,
)
clst.fit(affinity_mat)

In [None]:
df_sample.iloc[clst.cluster_centers_indices_, :]

In [None]:
# Cluster label assigned to each row of df_sample
labels = clst.labels_
uniq_labels = sorted(np.unique(labels))

# One crosstab per cluster: (company, continent) rows vs. department columns
for label in uniq_labels:
    df_i = df_sample.loc[labels == label]

    print(f"Cluster {label} co-occuring categories")
    crosstab = pd.crosstab(
        index=[df_i["company"], df_i["continent"]],
        columns=df_i["dept"],
    )
    display(crosstab.style.background_gradient(axis=None))

`df_5`

In [None]:
df_5.head(3)

In [None]:
# A full 5000 x 5000 distance matrix is a lot to compute, so use a sample
df_sample = df_5.sample(n=500, random_state=42)

# Gower distances over the mixed-type columns, negated so that larger
# means more similar
dist_mat = gower_matrix(df_sample)
affinity_mat = np.negative(dist_mat)

clst = AffinityPropagation(
    affinity="precomputed",
    max_iter=5000,
    preference=5 * affinity_mat.min(),
    random_state=42,
)
clst.fit(affinity_mat)

In [None]:
df_sample.iloc[clst.cluster_centers_indices_, :]