# 01 Clustering With Python

## Preliminaries

## System

In [None]:
pwd

In [None]:
cd ..

In [None]:
ls

In [None]:
cd ..

In [None]:
ls

In [None]:
!pip install -r requirements.txt

In [None]:
!pip install pandas matplotlib seaborn plotly scikit-learn

In [None]:
# !wget https://gist.github.com/AlexandreGazagnes/72d654c3bf2aa0a4e172d456b6a3de40

## Imports

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
import numpy as np

import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris

from scipy.cluster.hierarchy import dendrogram, linkage

## Get the data

In [None]:
# url = "https://gist.github.com/AlexandreGazagnes/72d654c3bf2aa0a4e172d456b6a3de40"
# df = pd.read_csv(url)
# df.head()


In [None]:
# or

# data = load_iris()
# df = pd.DataFrame(data.data, columns=data.feature_names)
# df["Species"] = data.target
# df.head()

In [None]:
# or

# Read the local copy of the Iris dataset and preview the first rows.
fn = "./data/source/Iris.csv"
df = pd.read_csv(filepath_or_buffer=fn)
df.head(n=5)

## Data Exploration

### Display

In [None]:
# First five rows.
df.head(n=5)

In [None]:
# Last five rows.
df.tail(n=5)

In [None]:
# Ten rows drawn at random (unseeded, so the selection changes on each run).
df.sample(n=10)

In [None]:
# Twenty rows drawn at random (unseeded, so the selection changes on each run).
df.sample(n=20)

### Structure

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# How many columns there are of each dtype.
column_dtypes = df.dtypes
column_dtypes.value_counts()

In [None]:
# Preview only the object-typed (text) columns.
df.select_dtypes(include=["object"]).head(n=5)

In [None]:
# Number of distinct values in each integer column.
df.select_dtypes(include=[int]).nunique()

In [None]:
# Number of distinct values in each floating-point column.
df.select_dtypes(include=[float]).nunique()

### Select data

In [None]:
# Column labels available for selection.
df.columns

### NaN

In [None]:
# Boolean missing-value mask (True where a cell is NaN), first five rows.
df.isna().head(n=5)

In [None]:
# Count of missing values per column.
df.isna().sum(axis=0)

### Data Inspection

In [None]:
import plotly.express as px

# Pie chart with one slice per distinct value of the "Species" column.
fig = px.pie(
    data_frame=df,
    names="Species",
    title="Data Distribution",
    template="plotly",
)
fig.show()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Same summary statistics, rounded to two decimals for readability.
df.describe().round(decimals=2)

In [None]:
# Pairwise correlations between the numeric measurements.  "Id" is a row
# identifier, not a measurement, so it is left out of the matrix (the box
# plot and pair plot in this notebook drop it as well).  errors="ignore"
# keeps the cell working when the column is absent.
corr = (
    df.select_dtypes(include="number")
    .drop(columns="Id", errors="ignore")
    .corr()
)
corr.round(4)

In [None]:
# Annotated heatmap of the correlation matrix with seaborn's default colours.
sns.heatmap(data=corr, annot=True)

In [None]:

# Correlation coefficients span [-1, 1].  Anchoring the colour scale to that
# full range keeps negative correlations from being clipped to the colour
# used for 0, and puts the neutral midpoint of "coolwarm" at zero.
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".4f", vmin=-1, vmax=1)

In [None]:

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat the lower triangle.  The mask is built from the
# matrix *shape* rather than its values, so a coefficient of exactly 0 in
# the upper triangle is masked too.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Correlations span [-1, 1]; use the full range so negatives are not clipped.
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".4f", vmin=-1, vmax=1, mask=mask)

### Visualisation

In [None]:
# One box per measurement column; the "Id" row identifier is excluded.
features_without_id = df.drop(columns=["Id"])
sns.boxplot(data=features_without_id)

In [None]:
# Vertical box plot of sepal length, one colour-coded box per species.
fig = px.box(df, x="Species", y="SepalLengthCm", color="Species", orientation="v")
fig.show()

In [None]:
# Vertical box plot of petal length, one colour-coded box per species.
fig = px.box(df, x="Species", y="PetalLengthCm", color="Species", orientation="v")
fig.show()

In [None]:
# Vertical box plot of sepal width, one colour-coded box per species.
fig = px.box(df, x="Species", y="SepalWidthCm", color="Species", orientation="v")
fig.show()

In [None]:
# Vertical box plot of petal width, one colour-coded box per species.
fig = px.box(df, x="Species", y="PetalWidthCm", color="Species", orientation="v")
fig.show()

In [None]:
# Scatter-plot matrix of the measurements, coloured by species ("Id" excluded).
sns.pairplot(data=df.drop(columns=["Id"]), hue="Species")

## Clustering

## Scale

In [None]:
# Display the full table.
df

In [None]:
# Feature matrix: every column except the row identifier and the species.
X = df.drop(["Id", "Species"], axis="columns")
X

In [None]:
# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
# fit_transform returns a bare array.  Re-attach the labels of the frame that
# was actually scaled (X) instead of slicing df.columns by position, which
# silently mislabels the columns if the column order of df ever changes.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.head()

In [None]:
# Check the scaling: means should be ~0 and standard deviations ~1.
X_scaled.describe().round(decimals=2)

In [None]:
# Manual standardisation, equivalent to StandardScaler.  StandardScaler
# divides by the population standard deviation (ddof=0), whereas pandas'
# .std() defaults to the sample estimate (ddof=1), so ddof=0 is passed
# explicitly to reproduce the scaler's output.
X_scaled = (X - X.mean()) / X.std(ddof=0)
X_scaled.head()

In [None]:
# Summary statistics of the manually standardised features, two decimals.
X_scaled.describe().round(decimals=2)

## K-Means

In [None]:
# First attempt with an arbitrary k=5.  random_state makes the result
# reproducible across re-runs.  n_init is set explicitly because its default
# differs between scikit-learn releases.  Both values match the tuned model
# built later in this notebook.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

In [None]:
# Fit on the standardised features, then read back each sample's cluster id.
# fit() returns the estimator itself, so the two calls can be chained.
labels = kmeans.fit(X_scaled).predict(X_scaled)
labels

In [None]:
# True species of every sample, for comparison with the cluster ids.
df["Species"]

In [None]:
# WCSS (within-cluster sum of squares) for the elbow method.
# Fit one KMeans model per k in 2..10 and record its inertia.  random_state
# and n_init are fixed so the curve is identical on every run; with unseeded
# models the inertias, and therefore the apparent elbow, vary between runs.

inertia_list = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    print(k, kmeans.inertia_)
    inertia_list.append(kmeans.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters tried (k = 2..10).
k_values = range(2, 11)
plt.plot(k_values, inertia_list, marker="o")

In [None]:
# Final model: three clusters, k-means++ seeding, 10 restarts, fixed seed.
kmeans = KMeans(
    n_clusters=3,
    init="k-means++",
    n_init=10,
    max_iter=300,
    random_state=42,
)

In [None]:
# Fit the 3-cluster model and keep each sample's cluster id.
labels = kmeans.fit(X_scaled).predict(X_scaled)

In [None]:
# Cluster id assigned to each sample.
labels

In [None]:
# True species, to see which cluster id lines up with which species.
df["Species"]

In [None]:
# Cluster ids again, for side-by-side comparison with the species column.
labels

In [None]:
# Cluster ids are arbitrary: which integer KMeans gives to which group
# depends on the initialisation.  Instead of hard-coding the pairing, name
# each cluster after the species that is most frequent among its members
# (rows = cluster id, columns = species, idxmax picks the majority species).
# NOTE: two clusters can end up with the same species name if one species
# dominates both.
impute_dict = pd.crosstab(labels, df["Species"]).idxmax(axis=1).to_dict()

In [None]:
# Translate every cluster id into its species name.  The translated list
# gets its own name so `labels` keeps the raw cluster ids: overwriting it
# with strings made this cell raise KeyError when run a second time.
species_names = [impute_dict[cluster_id] for cluster_id in labels]
kmeans_preditcted = pd.Series(species_names, name="Predicted")
kmeans_preditcted

In [None]:
# Confusion table: true species (rows) against KMeans-derived species (columns).
pd.crosstab(index=df["Species"], columns=kmeans_preditcted)

## Agglomerative Clustering (Hierarchical Clustering)

In [None]:
# Hierarchical (bottom-up) clustering into three groups, fitted on the
# standardised features.
agc = AgglomerativeClustering(n_clusters=3)
agc.fit(X=X_scaled)

In [None]:
# Cluster id per sample from the fitted model; show the first 100.
agc_class = agc.labels_
agc_class[0:100]

In [None]:
# True species of the same first 100 samples, for visual comparison.
df["Species"].iloc[:100].values

In [None]:
# The integer a clustering algorithm gives to each group carries no meaning,
# so the id -> species pairing is derived from the data rather than
# hard-coded: each cluster is named after the species most frequent among
# its members (rows = cluster id, columns = species).
data_dict = pd.crosstab(agc_class, df["Species"]).idxmax(axis=1).to_dict()

In [None]:
# Map every agglomerative cluster id to its species name, as a named Series.
agc_predicted = pd.Series(
    [data_dict[cluster_id] for cluster_id in agc_class],
    name="Predicted",
)
agc_predicted

In [None]:
# Confusion table: true species (rows) against agglomerative result (columns).
pd.crosstab(index=df["Species"], columns=agc_predicted)

In [None]:

# Build the Ward linkage matrix from the standardised features first ...
z = linkage(X_scaled, method="ward")

# ... then draw it.  truncate_mode="level" together with p=5 limits how many
# levels of the tree are shown, so the plot stays readable.
plt.figure(figsize=(10, 5))
plt.xlabel("sample index")
plt.ylabel("distance")
dendrogram(
    z,
    truncate_mode="level",
    p=5,
    color_threshold=10,
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.tight_layout()