# Clustering Code

### Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, DBSCAN, MeanShift
from sklearn.metrics import silhouette_score
import random
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Seed Python's built-in `random` module so any calls to it are repeatable.
# NOTE(review): this does not seed NumPy's generator; scikit-learn estimators
# are controlled through their own `random_state` arguments instead.
random.seed(42)

### Read the Dataset

In [None]:
# Load the chatbot session dataset from its CSV file.
# NOTE(review): the path is an absolute, machine-specific Windows location.
dataset_path = r"C:\Users\gabri\Downloads\extended_real_estate_chatbot_dataset.csv"
df = pd.read_csv(dataset_path)

### Gather Necessary Information About the Dataset

In [None]:
# Preview the first rows of the freshly loaded dataframe.
df.head()

In [None]:
# Summary statistics for every column (include="all" covers non-numeric ones too).
df.describe(include="all")

In [None]:
# Show column dtypes and non-null counts.
df.info()

### Convert Time Variables to datetime

In [None]:
# Parse the chat open/close timestamp columns into pandas datetime values.
for time_column in ('Chat Open Time', 'Chat Close Time'):
    df[time_column] = pd.to_datetime(df[time_column])

In [None]:
# Re-check dtypes after the datetime conversion of the two time columns.
df.info()

In [None]:
# Preview the first rows after the datetime conversion.
df.head()

In [None]:
# List the column names available in the dataframe.
df.columns

### Perform Feature Engineering for Potentially Important Columns

In [None]:
# Add per-minute engagement rates derived from the raw session counts.
# A chat lasting 0 minutes would make the division yield inf (or NaN for 0/0),
# which the scaling and clustering steps cannot handle. Those rows get a rate
# of 0 instead. Rows with a non-zero duration are computed as before, and
# missing inputs still propagate as NaN.
chat_minutes = df['Chat Duration (minutes)']
zero_length_chat = chat_minutes == 0
df['total_questions_per_minute'] = (
    df['Total Questions Asked'] / chat_minutes
).mask(zero_length_chat, 0.0)
df['total_pages_visited_per_minute'] = (
    df['Total Pages Visited'] / chat_minutes
).mask(zero_length_chat, 0.0)

In [None]:
# Preview the first rows including the two new per-minute columns.
df.head()

### Encode the Categorical Location Data as Integers

In [None]:
# List the distinct values of the 'Desired Location' column.
df['Desired Location'].unique()

In [None]:
# Assign each location name an integer code (1-10, in the order listed) and
# replace the text values in 'Desired Location' with those codes.
# NOTE(review): `Series.map` turns any value missing from the mapping into NaN.
location_names = [
    'Etobicoke', 'Downtown', 'Stouffville', 'Scarborough', 'Mississauga',
    'Vaughan', 'Brampton', 'Oakville', 'Markham', 'North York',
]
location_mapping = {name: code for code, name in enumerate(location_names, start=1)}
df['Desired Location'] = df['Desired Location'].map(location_mapping)

In [None]:
# Preview the first rows after encoding 'Desired Location' as integers.
df.head()

In [None]:
# Re-check dtypes and non-null counts after the location encoding.
df.info()

### Copy the Dataframe

In [None]:
# Work on a copy so the cleaned `df` is left untouched by the clustering prep.
df2 = df.copy()

### Remove Datatypes that can't be Used in Clustering

In [None]:
# Keep only non-datetime columns, then discard the 'Session ID' column.
df2 = df2.select_dtypes(exclude=['datetime64[ns]']).drop(columns=['Session ID'])

In [None]:
# Preview the clustering dataframe after removing the unusable columns.
df2.head()

### Transform the Numerical Data

In [None]:
# Every column except these two is treated as a numerical feature to scale.
unscaled_columns = ['Left Contact Info', 'Desired Location']
numerical_features = df2.columns.drop(unscaled_columns)

In [None]:
# Column transformer with a single step: standard-scale the numerical features.
scaling_step = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

In [None]:
# Fit the scaler on the numerical columns and write the scaled values back in place.
scaled_values = preprocessor.fit_transform(df2[numerical_features])
df2[numerical_features] = scaled_values

In [None]:
# Display the dataframe after scaling the numerical columns.
df2

### Create Another Dataframe Without the Categorical Columns to Compare Results

In [None]:
# Second copy of the scaled data, so `df2` keeps all of its columns.
df3 = df2.copy()

In [None]:
# Remove the two categorical columns from the second copy, in place.
categorical_columns = ['Left Contact Info', 'Desired Location']
df3.drop(columns=categorical_columns, inplace=True)

In [None]:
# Display the dataframe that excludes the categorical columns.
df3

### Density Clustering

In [None]:
# Applying DBSCAN to the full feature set (df2)
dbscan = DBSCAN(eps=3, min_samples=3)
dbscan_labels = dbscan.fit_predict(df2)

# Silhouette Score
# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# left out both when counting clusters and when computing the score.
clustered = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[clustered]))
if n_dbscan_clusters > 1:
    dbscan_sil_score = silhouette_score(df2.loc[clustered], dbscan_labels[clustered])
    print(f"DBSCAN Silhouette Score: {dbscan_sil_score}")
else:
    # The silhouette score needs at least two clusters to be defined.
    print(f"DBSCAN found {n_dbscan_clusters} cluster(s); silhouette score is undefined")

# Plotting
# Each pair is (x column, y column) as positional indices into df2.
# All points, including noise, are drawn and coloured by their DBSCAN label.
dbscan_plot_pairs = [
    (0, 5), (0, 6), (0, 1), (0, 2),
    (1, 5), (1, 6), (2, 5), (2, 6),
    (7, 5), (7, 6), (8, 5), (8, 6),
]
for x_col, y_col in dbscan_plot_pairs:
    plt.scatter(df2.iloc[:, x_col], df2.iloc[:, y_col], c=dbscan_labels)
    plt.title('DBSCAN Clustering')
    plt.xlabel(df2.columns[x_col])
    plt.ylabel(df2.columns[y_col])
    plt.show()

In [None]:
# Display df3 (the data without categorical columns) before clustering it.
df3

In [None]:
# Applying DBSCAN to the data without categorical columns (df3)
dbscan = DBSCAN(eps=3, min_samples=3)
dbscan_labels = dbscan.fit_predict(df3)

# Silhouette Score
# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# left out both when counting clusters and when computing the score.
clustered = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[clustered]))
if n_dbscan_clusters > 1:
    dbscan_sil_score = silhouette_score(df3.loc[clustered], dbscan_labels[clustered])
    print(f"DBSCAN Silhouette Score: {dbscan_sil_score}")
else:
    # The silhouette score needs at least two clusters to be defined.
    print(f"DBSCAN found {n_dbscan_clusters} cluster(s); silhouette score is undefined")

# Plotting
# Each pair is (x column, y column) as positional indices into df3.
# All points, including noise, are drawn and coloured by their DBSCAN label.
dbscan_plot_pairs = [
    (0, 3), (0, 4), (0, 1), (0, 2),
    (1, 3), (1, 4), (2, 3), (2, 4),
    (5, 3), (5, 4), (6, 3), (6, 4),
]
for x_col, y_col in dbscan_plot_pairs:
    plt.scatter(df3.iloc[:, x_col], df3.iloc[:, y_col], c=dbscan_labels)
    plt.title('DBSCAN Clustering')
    plt.xlabel(df3.columns[x_col])
    plt.ylabel(df3.columns[y_col])
    plt.show()

### Dendrogram for Hierarchical Clustering

In [None]:
# Ward-linkage hierarchy computed over the full feature set (df2).
Z = linkage(df2, 'ward')

# Draw the resulting merge tree on a 10x5 figure.
dendrogram_size = (10, 5)
plt.figure(figsize=dendrogram_size)
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

In [None]:
# Ward-linkage hierarchy computed over the data without categorical columns (df3).
Z = linkage(df3, 'ward')

# Draw the resulting merge tree on a 10x5 figure.
dendrogram_size = (10, 5)
plt.figure(figsize=dendrogram_size)
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

### Agglomerative Clustering

In [None]:
# Agglomerative clustering on df2 using the estimator's default settings.
agglomerative_clustering = AgglomerativeClustering()
a_clustering_labels = agglomerative_clustering.fit(df2).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
a_c_sil_score = silhouette_score(df2, a_clustering_labels)
print(f"Agglomerative Clustering Silhouette Score: {a_c_sil_score}")

# One scatter plot per (x column, y column) pair of positional df2 indices,
# with points coloured by their assigned cluster.
agglomerative_plot_pairs = (
    (0, 5), (0, 6), (0, 1), (0, 2),
    (1, 5), (1, 6), (2, 5), (2, 6),
    (7, 5), (7, 6), (8, 5), (8, 6),
)
for x_col, y_col in agglomerative_plot_pairs:
    plt.scatter(df2.iloc[:, x_col], df2.iloc[:, y_col], c=a_clustering_labels)
    plt.title('Agglomerative Clustering')
    plt.xlabel(df2.columns[x_col])
    plt.ylabel(df2.columns[y_col])
    plt.show()

In [None]:
# Agglomerative clustering on df3 using the estimator's default settings.
agglomerative_clustering = AgglomerativeClustering()
a_clustering_labels = agglomerative_clustering.fit(df3).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
a_c_sil_score = silhouette_score(df3, a_clustering_labels)
print(f"Agglomerative Clustering Silhouette Score: {a_c_sil_score}")

# One scatter plot per (x column, y column) pair of positional df3 indices,
# with points coloured by their assigned cluster.
agglomerative_plot_pairs = (
    (0, 3), (0, 4), (0, 1), (0, 2),
    (1, 3), (1, 4), (2, 3), (2, 4),
    (5, 3), (5, 4), (6, 3), (6, 4),
)
for x_col, y_col in agglomerative_plot_pairs:
    plt.scatter(df3.iloc[:, x_col], df3.iloc[:, y_col], c=a_clustering_labels)
    plt.title('Agglomerative Clustering')
    plt.xlabel(df3.columns[x_col])
    plt.ylabel(df3.columns[y_col])
    plt.show()

### Mean-Shift Clustering

In [None]:
# Mean-shift clustering on df2 with a fixed kernel bandwidth of 3.
mean_shift_clustering = MeanShift(bandwidth=3)
clustering_labels = mean_shift_clustering.fit(df2).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
ms_sil_score = silhouette_score(df2, clustering_labels)
print(f"Mean-Shift Silhouette Score: {ms_sil_score}")

# One scatter plot per (x column, y column) pair of positional df2 indices,
# with points coloured by their assigned cluster.
mean_shift_plot_pairs = (
    (0, 5), (0, 6), (0, 1), (0, 2),
    (1, 5), (1, 6), (2, 5), (2, 6),
    (7, 5), (7, 6), (8, 5), (8, 6),
)
for x_col, y_col in mean_shift_plot_pairs:
    plt.scatter(df2.iloc[:, x_col], df2.iloc[:, y_col], c=clustering_labels)
    plt.title('Mean-Shift Clustering')
    plt.xlabel(df2.columns[x_col])
    plt.ylabel(df2.columns[y_col])
    plt.show()

In [None]:
# Mean-shift clustering on df3 with a fixed kernel bandwidth of 4.
mean_shift_clustering = MeanShift(bandwidth=4)
clustering_labels = mean_shift_clustering.fit(df3).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
ms_sil_score = silhouette_score(df3, clustering_labels)
print(f"Mean-Shift Silhouette Score: {ms_sil_score}")

# One scatter plot per (x column, y column) pair of positional df3 indices,
# with points coloured by their assigned cluster.
mean_shift_plot_pairs = (
    (0, 3), (0, 4), (0, 1), (0, 2),
    (1, 3), (1, 4), (2, 3), (2, 4),
    (5, 3), (5, 4), (6, 3), (6, 4),
)
for x_col, y_col in mean_shift_plot_pairs:
    plt.scatter(df3.iloc[:, x_col], df3.iloc[:, y_col], c=clustering_labels)
    plt.title('Mean-Shift Clustering')
    plt.xlabel(df3.columns[x_col])
    plt.ylabel(df3.columns[y_col])
    plt.show()

### K-means Clustering

In [None]:
# Sweep the number of clusters on df2, recording inertia (for the elbow plot)
# and the mean silhouette score for each candidate k.
cluster_counts = range(1, 15)
inertia = []
silhouette_avg = []

for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(df2)
    inertia.append(kmeans.inertia_)

    # The silhouette score is undefined for a single cluster, so start at k=2.
    if i > 1:
        silhouette_avg.append(silhouette_score(df2, kmeans.labels_))

# Elbow plot: inertia against number of clusters
plt.plot(cluster_counts, inertia)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Silhouette plot: the first k is skipped because no score exists for it
plt.plot(cluster_counts[1:], silhouette_avg)
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Number of clusters chosen for df2; fit a seeded K-means model with it.
K = 2
kmeans = KMeans(random_state=0, n_clusters=K)
kmeans = kmeans.fit(df2)

In [None]:
# Final K-means model on df2 with 2 clusters.
# random_state is fixed so the result is reproducible and matches the seeded
# models used in the elbow/silhouette sweep that motivated this choice of k.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(df2)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Silhouette Score
sil_score = silhouette_score(df2, labels)
print(f"Silhouette Score: {sil_score}")

# Plotting
# Each pair is (x column, y column) as positional indices into df2. The same
# indices select the matching coordinates of the cluster centres, drawn in red.
kmeans_plot_pairs = [
    (0, 5), (0, 6), (0, 1), (0, 2),
    (1, 5), (1, 6), (2, 5), (2, 6),
    (7, 5), (7, 6), (8, 5), (8, 6),
]
for x_col, y_col in kmeans_plot_pairs:
    plt.scatter(df2.iloc[:, x_col], df2.iloc[:, y_col], c=labels)
    plt.scatter(centers[:, x_col], centers[:, y_col], c='red')
    plt.title('K-means Clustering')
    plt.xlabel(df2.columns[x_col])
    plt.ylabel(df2.columns[y_col])
    plt.show()

In [None]:
# Sweep the number of clusters on df3, recording inertia (for the elbow plot)
# and the mean silhouette score for each candidate k.
cluster_counts = range(1, 15)
inertia = []
silhouette_avg = []

for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(df3)
    inertia.append(kmeans.inertia_)

    # The silhouette score is undefined for a single cluster, so start at k=2.
    if i > 1:
        silhouette_avg.append(silhouette_score(df3, kmeans.labels_))

# Elbow plot: inertia against number of clusters
plt.plot(cluster_counts, inertia)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Silhouette plot: the first k is skipped because no score exists for it
plt.plot(cluster_counts[1:], silhouette_avg)
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Number of clusters chosen for df3; fit a seeded K-means model with it.
# K is passed to the estimator rather than repeating the literal, so the two
# cannot drift apart.
K = 6
kmeans = KMeans(n_clusters=K, random_state=0).fit(df3)

In [None]:
# Final K-means model on df3 with 6 clusters.
# random_state is fixed so the result is reproducible and matches the seeded
# models used in the elbow/silhouette sweep that motivated this choice of k.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(df3)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Silhouette Score
sil_score = silhouette_score(df3, labels)
print(f"Silhouette Score: {sil_score}")

# Plotting
# Each pair is (x column, y column) as positional indices into df3. The same
# indices select the matching coordinates of the cluster centres, drawn in red.
kmeans_plot_pairs = [
    (0, 3), (0, 4), (0, 1), (0, 2),
    (1, 3), (1, 4), (2, 3), (2, 4),
    (5, 3), (5, 4), (6, 3), (6, 4),
]
for x_col, y_col in kmeans_plot_pairs:
    plt.scatter(df3.iloc[:, x_col], df3.iloc[:, y_col], c=labels)
    plt.scatter(centers[:, x_col], centers[:, y_col], c='red')
    plt.title('K-means Clustering')
    plt.xlabel(df3.columns[x_col])
    plt.ylabel(df3.columns[y_col])
    plt.show()

### Affinity Propagation Clustering

In [None]:
# Affinity propagation on df2, seeded for repeatable results.
af = AffinityPropagation(random_state=0)
af_labels = af.fit(df2).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
af_sil_score = silhouette_score(df2, af_labels)
print(f"Affinity Propagation Silhouette Score: {af_sil_score}")

# One scatter plot per (x column, y column) pair of positional df2 indices,
# with points coloured by their assigned cluster.
affinity_plot_pairs = (
    (0, 5), (0, 6), (0, 1), (0, 2),
    (1, 5), (1, 6), (2, 5), (2, 6),
    (7, 5), (7, 6), (8, 5), (8, 6),
)
for x_col, y_col in affinity_plot_pairs:
    plt.scatter(df2.iloc[:, x_col], df2.iloc[:, y_col], c=af_labels)
    plt.title('Affinity Propagation Clustering')
    plt.xlabel(df2.columns[x_col])
    plt.ylabel(df2.columns[y_col])
    plt.show()

In [None]:
# Affinity propagation on df3, seeded for repeatable results.
af = AffinityPropagation(random_state=0)
af_labels = af.fit(df3).labels_

# Quality of the partition, measured by the mean silhouette coefficient.
af_sil_score = silhouette_score(df3, af_labels)
print(f"Affinity Propagation Silhouette Score: {af_sil_score}")

# One scatter plot per (x column, y column) pair of positional df3 indices,
# with points coloured by their assigned cluster.
affinity_plot_pairs = (
    (0, 3), (0, 4), (0, 1), (0, 2),
    (1, 3), (1, 4), (2, 3), (2, 4),
    (5, 3), (5, 4), (6, 3), (6, 4),
)
for x_col, y_col in affinity_plot_pairs:
    plt.scatter(df3.iloc[:, x_col], df3.iloc[:, y_col], c=af_labels)
    plt.title('Affinity Propagation Clustering')
    plt.xlabel(df3.columns[x_col])
    plt.ylabel(df3.columns[y_col])
    plt.show()