In [None]:
## Correlation visualization improvement
# Columns to correlate. Defaults to every numeric column of `df`; replace this
# with an explicit list of column names to narrow the view.
cols = df.select_dtypes(include="number").columns.tolist()
corr = df[cols].corr()
# axis=None applies the color gradient across the whole table at once rather
# than column by column, so shades are comparable between any two cells.
corr.style.background_gradient(axis=None)

In [3]:
# importing libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.utils.validation import check_is_fitted
from teaching_tools.widgets import ClusterWidget, SCFClusterWidget
from scipy.stats.mstats import trimmed_var
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Install the widget library once if it is missing: %pip install teaching-tools

### Data download: https://sda.berkeley.edu/sdaweb/docs/scfcomb2019/DOC/hcbkh01.htm
Once the data is downloaded, load it with `pd.read_csv` and then use the teaching-tools library:

# Interactive clustering demo on the DEBT and HOUSES columns with 3 clusters
debt, houses = df["DEBT"], df["HOUSES"]
scfc = SCFClusterWidget(x=debt, y=houses, n_clusters=3)
scfc.show()

The widget above is for understanding how clustering works.

In [None]:
# Configure a 3-cluster K-Means estimator; the fixed seed keeps runs repeatable
kmeans_settings = {"n_clusters": 3, "random_state": 42}
model = KMeans(**kmeans_settings)
print("model type:", type(model))

# Learn the cluster centers from X
model.fit(X)

# Fails loudly if the estimator has not actually been fitted
check_is_fitted(model)

In [None]:
# Cluster assignments produced by the fitted model
labels = model.labels_
for description, detail in (
    ("labels type:", type(labels)),
    ("labels shape:", labels.shape),
):
    print(description, detail)
# Preview the first ten assignments
labels[0:10]

In [None]:
# Scatter plot of two features with one color per cluster label.
# Replace the placeholder names below with the real column names.
x_col = '1st column'
y_col = 'second column'

sns.scatterplot(
    x=df[x_col],
    y=df[y_col],
    hue=labels,
    palette='deep',
)
# plt.xlabel / plt.ylabel / plt.title each require their text argument;
# calling them with no arguments raises a TypeError.
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title(f"{y_col} vs {x_col} by cluster");

In [None]:
# Coordinates of the cluster centers learned by the fitted model
centroids = model.cluster_centers_
centroid_type, centroid_shape = type(centroids), centroids.shape
print("centroids type:", centroid_type)
print("centroids shape:", centroid_shape)
centroids

In [None]:
## Adding centroids to the plot
# Plot the two features (e.g. "HOUSES" vs "DEBT") and overlay the centroids.
# Replace the placeholder names below with the real column names.
x_col = '1st column'
y_col = '2nd column'

# Data points and centroids must be divided by the same factor, otherwise the
# centroid markers land far away from their clusters. Set to 1 for raw units.
scale = 1e6

sns.scatterplot(
    x=df[x_col] / scale,
    y=df[y_col] / scale,
    hue=labels,
    palette='deep',
)
# NOTE(review): assumes the model was fitted on X with columns ordered
# (x_col, y_col), so centroid column 0 is x and column 1 is y — confirm.
plt.scatter(
    x=centroids[:, 0] / scale,
    y=centroids[:, 1] / scale,
    color='gray',
    marker="*",
    s=150,
)
# plt.xlabel / plt.ylabel / plt.title each require their text argument;
# calling them with no arguments raises a TypeError.
plt.xlabel(f"{x_col} (divided by {scale:g})")
plt.ylabel(f"{y_col} (divided by {scale:g})")
plt.title(f"{y_col} vs {x_col} with cluster centroids");

In [None]:
# Inertia: the sum of squared Euclidean distances between every data point
# and its closest centroid, accumulated over all points (lower = tighter clusters)
inertia = model.inertia_
for description, detail in (
    ("inertia type:", type(inertia)),
    ("Inertia (3 clusters):", inertia),
):
    print(description, detail)

In [None]:
# The silhouette score lies in [-1, 1]. For each sample it compares
#   a: the mean distance to the other points in its own cluster, and
#   b: the mean distance to the points of the nearest other cluster,
# as (b - a) / max(a, b); the value reported here is the mean over all samples.
ss = silhouette_score(X, labels=labels)
ss_type = type(ss)
print("ss type:", ss_type)
print("Silhouette Score (3 clusters):", ss)

In [None]:
## Finding the optimal number of clusters
# Candidate values of k: 2 through 12
n_clusters = range(2, 13)
inertia_errors, silhouette_scores = [], []

# Fit one K-Means model per candidate k and record both quality metrics.
# Note: `model` is rebound on every pass, so after the loop it refers to the
# model fitted with the last k.
for k in n_clusters:
    model = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia_errors += [model.inertia_]
    silhouette_scores += [silhouette_score(X, model.labels_)]

# Summarize what was collected for each metric
print("inertia_errors type:", type(inertia_errors))
print("inertia_errors len:", len(inertia_errors))
print("Inertia:", inertia_errors)
print()
print("silhouette_scores type:", type(silhouette_scores))
print("silhouette_scores len:", len(silhouette_scores))
print("Silhouette Scores:", silhouette_scores)

In [None]:
# Line plot of inertia against the number of clusters, drawn on the current axes
ax = plt.gca()
ax.plot(n_clusters, inertia_errors)
ax.set_xlabel("no. of Clusters (k)")
ax.set_ylabel("Inertia (L2 norm)")
ax.set_title("K-Means Model: Inertia vs Number of Clusters");

For inertia, the best value of k is at the bend (the "elbow") of the curve, i.e. the point where the values start flattening out. Lower inertia is better here.

In [None]:
# Line plot of silhouette score against the number of clusters, drawn on the current axes
ax = plt.gca()
ax.plot(n_clusters, silhouette_scores)
ax.set_xlabel("no. of Clusters (k)")
ax.set_ylabel("Silhouette Score")
ax.set_title("K-Means Model: Silhouette Score vs Number of Clusters");

For the silhouette score, a higher score is better, so look for the value of k where the drastic drop happens (the last k before the score falls off). By combining both of these plots and eyeballing them, you can find the best value for the number of clusters.

In [None]:
# Mean value of every feature within each cluster, grouping the rows of X by
# the labels of the current `model`
cluster_means = X.groupby(model.labels_).mean()

print("cluster_means type:", type(cluster_means))
print("cluster_means shape:", cluster_means.shape)

# These per-cluster means should match the centers reported by the model
model.cluster_centers_

In [None]:
# Select the ten features with the largest trimmed variance. The variance of
# each column is computed with limits=(0.1, 0.1), i.e. trimming at both ends.
trimmed_variances = df.apply(trimmed_var, limits=(0.1, 0.1))
top_var = trimmed_variances.sort_values().tail(10)