In [17]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage,cophenet,dendrogram ,cut_tree
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

%matplotlib inline
warnings.filterwarnings('ignore')

In [1]:
# Read the channel table, drop the `channel_id` column, and replace missing
# values with 0 so every remaining cell holds a value.
df_segmentation = (
    pd.read_csv('channels.csv')
    .drop(columns='channel_id')
    .fillna(0)
)
df_segmentation.head(100)

In [2]:
# Standardise every column of `df_segmentation` with StandardScaler.
# `df_segmentation_scaled` is the raw array returned by the scaler;
# `X_scaled_df` wraps the same values with the original column names.
scaler = StandardScaler()
df_segmentation_scaled = scaler.fit_transform(df_segmentation)
X_scaled_df = pd.DataFrame(df_segmentation_scaled, columns=df_segmentation.columns)

# Display the scaled values produced by this cell (previously the unscaled
# input frame was shown here, so the result of the scaling was never visible).
X_scaled_df.head(50)

# Principal Component Analysis

In [5]:
# Fit a full PCA on the scaled data; it is used only to inspect how much
# variance each component explains.
pca = PCA()
pca.fit(df_segmentation_scaled)

# Reduce the scaled data to two components. `X_pca_final` holds the
# two-column projection that the clustering cells operate on.
pca_final = IncrementalPCA(n_components=2)
X_pca_final = pca_final.fit_transform(df_segmentation_scaled)

# Keep this as the LAST expression of the cell: a bare expression in the
# middle of a cell is evaluated and discarded, so the ratios were never shown.
pca.explained_variance_ratio_

# Optimal Number Of Clusters for KMeans

In [3]:
# Sweep candidate cluster counts on the 2-D PCA projection and score each
# partition with three internal validity metrics (silhouette,
# Calinski-Harabasz, Davies-Bouldin). The count with the highest silhouette
# score is selected.
#
# NOTE(review): the markdown heading for this section says KMeans, but the
# sweep scores AgglomerativeClustering (default linkage); the selected count
# is later reused for KMeans -- confirm this is intended.
candidate_n_clusters = range(2, 11)

silhouette_scores = []
ch_scores = []
db_scores = []
for n_clusters in candidate_n_clusters:
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(X_pca_final)
    silhouette_avg = silhouette_score(X_pca_final, cluster_labels)
    ch_score = calinski_harabasz_score(X_pca_final, cluster_labels)
    db_score = davies_bouldin_score(X_pca_final, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    ch_scores.append(ch_score)
    db_scores.append(db_score)

# `list.index` yields a POSITION in the score list, not a cluster count.
# The sweep starts at 2, so the position must be mapped back through the
# candidate range; using the position directly under-reports the count by two
# (and position 0 would request an invalid n_clusters=0).
best_index = silhouette_scores.index(max(silhouette_scores))
best_n_clusters = candidate_n_clusters[best_index]
print("The optimal number of clusters is:", best_n_clusters)

# Refit with the selected count to obtain the labels used in the plot.
clusterer = AgglomerativeClustering(n_clusters=best_n_clusters)
cluster_labels = clusterer.fit_predict(X_pca_final)

X_pca_final_df = pd.DataFrame(X_pca_final,columns=['PC1','PC2'])
X_pca_final_df['Cluster'] = cluster_labels

# Scatter of the two principal components, coloured by assigned cluster.
plt.figure(figsize=(14,6),dpi=60)
plt.subplot(1,2,1)
sns.scatterplot(x='PC1',y='PC2',data=X_pca_final_df,hue='Cluster')
plt.xlabel("PC1", fontsize=14)
plt.ylabel("PC2", fontsize=14)
plt.legend(fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# The score lists are indexed by position in the sweep, hence `best_index`.
print("Silhouette Score:", silhouette_scores[best_index],"(Higher score indicates better separation)")
print("Davies-Bouldin Index:", db_scores[best_index],"(Lower score indicates better separation)")

In [4]:
# Fit K-Means on the PCA projection with `best_n_clusters` clusters (fixed
# seed), then store each row's assigned cluster id on the
# segmentation frame.
kmeans_params = dict(n_clusters=best_n_clusters, max_iter=1000, random_state=42)
kmeans = KMeans(**kmeans_params).fit(X_pca_final)
df_segmentation['K-Means_Cluster_ID'] = kmeans.labels_

# Optimal Number Of Clusters For Hierarchical Clustering (Silhouette Score)

In [5]:
# Sweep candidate cluster counts for single-linkage agglomerative clustering
# on the PCA projection and keep the count with the highest silhouette score.
candidate_n_clusters_h = range(2, 11)

silhouette_scores = []
for n_clusters in candidate_n_clusters_h:
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage="single")
    cluster_labels = clusterer.fit_predict(X_pca_final)
    silhouette_avg = silhouette_score(X_pca_final, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# `list.index` yields a position in the score list; map it back through the
# candidate range to get the actual cluster count (the sweep starts at 2, so
# using the position directly is off by two).
best_index_h = silhouette_scores.index(max(silhouette_scores))
best_n_clusters_h = candidate_n_clusters_h[best_index_h]
print("The optimal number of clusters for hierarchical clustering is:", best_n_clusters_h)
print("Silhouette score for the optimal number of clusters:", max(silhouette_scores))

# Build the single-linkage merge tree with SciPy and draw its dendrogram.
sl_mergings = linkage(X_pca_final, method="single", metric='euclidean')
dendrogram(sl_mergings)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Channel Index")
plt.ylabel("Distance")
plt.show()

# Cophenetic correlation between the tree's merge distances and the original
# pairwise distances of the projected points.
c, coph_dists = cophenet(sl_mergings, pdist(X_pca_final))
print("Cophenetic correlation coefficient:", c)

In [6]:
# Cut the single-linkage tree into `best_n_clusters_h` flat clusters, flatten
# the result to one label per row, and store it on the segmentation frame.
cut_result = cut_tree(sl_mergings, n_clusters=best_n_clusters_h)
sl_cluster_labels = cut_result.ravel()
df_segmentation["Hierarchical_Cluster_labels"] = sl_cluster_labels
df_segmentation

In [7]:
# Rebuild the PC1/PC2 frame and attach both sets of cluster labels side by
# side so the K-Means and hierarchical assignments can be compared per row.
X_pca_final_df = pd.DataFrame(X_pca_final, columns=['PC1', 'PC2']).assign(
    K_Means_Cluster=kmeans.labels_,
    Hierarchical_Cluster=sl_cluster_labels,
)
X_pca_final_df.head(50)