In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

In [54]:
milk = pd.read_csv('../Datasets/milk.csv', index_col=0)
std = StandardScaler().set_output(transform='pandas')
milk_scl = std.fit_transform(milk)

In [55]:
cluster = DBSCAN(eps=0.6,min_samples=2)
cluster.fit(milk_scl)
cluster.labels_

array([ 0,  0,  0,  0, -1,  1,  1,  2, -1, -1,  2,  1,  0, -1,  1,  2, -1,
       -1, -1, -1,  3,  3, -1, -1, -1])

-1 mean they are outliers

In [56]:
milk_copy = milk.copy()
milk_copy['cluster'] = cluster.labels_ 
milk_copy.sort_values('cluster')


Unnamed: 0_level_0,water,protein,fat,lactose,ash,cluster
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HIPPO,90.4,0.6,4.5,4.4,0.1,-1
PIG,82.8,7.1,5.1,3.7,1.1,-1
CAT,81.6,10.1,6.3,4.4,0.75,-1
GUINEA PIG,81.9,7.4,7.2,2.7,0.85,-1
RAT,72.5,9.2,12.6,3.3,1.4,-1
RABBIT,71.3,12.3,13.1,1.9,2.3,-1
ELEPHANT,70.7,3.6,17.6,5.6,0.63,-1
DOG,76.3,9.3,9.5,3.0,1.2,-1
WHALE,64.8,11.1,21.2,1.6,1.7,-1
DOLPHIN,44.9,10.6,34.9,0.9,0.53,-1


#### Evaluate score but before that remove the outliers as they are -1

In [57]:
inliers = milk_copy[milk_copy['cluster'] != -1]
inliers

silhouette_score(inliers.iloc[:,:-1], inliers['cluster'])

0.5828991621573324

In [58]:
inliers = milk_scl.copy()
inliers['label'] = cluster.labels_
inliers = inliers[inliers['label'] != -1]
inliers

Unnamed: 0_level_0,water,protein,fat,lactose,ash,label
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HORSE,0.948806,-1.009291,-0.903208,1.542217,-1.037554,0
ORANGUTAN,0.821407,-1.344603,-0.660619,1.040773,-1.259945,0
MONKEY,0.813445,-1.121062,-0.738247,1.263637,-1.381249,0
DONKEY,0.964731,-1.260775,-0.864394,1.152205,-0.936467,0
CAMEL,0.757707,-0.757806,-0.670322,0.372182,-0.30973,1
BISON,0.694008,-0.394551,-0.835283,0.873626,0.0744,1
BUFFALO,0.31181,-0.087181,-0.233662,0.316466,-0.168208,2
FOX,0.271998,0.108418,-0.427733,0.427898,0.135052,2
LLAMA,0.662158,-0.646036,-0.689729,0.81791,-0.127774,1
MULE,0.940844,-1.176947,-0.825579,0.762194,-0.794946,0


In [59]:
silhouette_score(inliers.iloc[:,:-1], inliers['label'])

0.5934459505692155

##### Why is condition required?
because if epsilon is small enough then all lables will be -1 and since we are removing all the -1 then we wont have any data to test score on

In [71]:
epsilion = (0.2,0.4,0.6,0.8,1,1.2)
min_p = (2,3,4,5)
scores = []
for e in epsilion:
    for p in min_p:
        cluster = DBSCAN(eps=e, min_samples=p)
        cluster.fit(milk_scl)
        inliers = milk_scl.copy()
        inliers['cluster'] = cluster.labels_
        inliers = inliers[inliers['cluster'] != -1]
        if len(set(inliers['cluster'].value_counts())) > 1:
            scores.append([e,p, silhouette_score(inliers.iloc[:,:-1], inliers['cluster'])])
        else:
            continue

scores = pd.DataFrame(scores, columns=['epsilon', 'min sample', 'score'])
scores.sort_values('score', ascending=False)

Unnamed: 0,epsilon,min sample,score
0,0.4,2,0.651894
9,1.0,3,0.647387
2,0.6,2,0.593446
10,1.2,2,0.552889
4,0.6,4,0.551975
1,0.4,3,0.538518
3,0.6,3,0.534443
6,0.8,3,0.533038
5,0.8,2,0.464674
7,0.8,4,0.457151


USING NP UNIQUE

In [69]:
epsilion = (0.2,0.4,0.6,0.8,1,1.2)
min_p = (2,3,4,5)
scores = []
for e in epsilion:
    for p in min_p:
        cluster = DBSCAN(eps=e, min_samples=p)
        cluster.fit(milk_scl)
        inliers = milk_scl.copy()
        inliers['cluster'] = cluster.labels_
        inliers = inliers[inliers['cluster'] != -1]
        if len(np.unique(inliers['cluster'])) > 1:
            scores.append([e,p, silhouette_score(inliers.iloc[:,:-1], inliers['cluster'])])
        else:
            continue

scores = pd.DataFrame(scores, columns=['epsilon', 'min sample', 'score'])
scores.sort_values('score', ascending=False)

Unnamed: 0,epsilon,min sample,score
0,0.4,2,0.651894
9,1.0,3,0.647387
2,0.6,2,0.593446
10,1.2,2,0.552889
4,0.6,4,0.551975
1,0.4,3,0.538518
3,0.6,3,0.534443
6,0.8,3,0.533038
5,0.8,2,0.464674
7,0.8,4,0.457151


## Usage with Supervised Learning
kyphosis data

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# from sklearn.

In [80]:
df = pd.read_csv('..\\Datasets\\cases\\Kyphosis\\Kyphosis.csv')
y = df['Kyphosis']
X = df.drop('Kyphosis', axis=1)
train, test = train_test_split(df, test_size=0.3, random_state=25, stratify=y)
X_train, y_train = train.drop('Kyphosis', axis=1), train['Kyphosis']
X_test, y_test = test.drop('Kyphosis', axis=1), test['Kyphosis']

In [82]:
std = StandardScaler().set_output(transform='pandas')
from sklearn.cluster import KMeans

X_train_scl = std.fit_transform(X_train)
scores = []

for i in tqdm(range(2, 11)):
    clust = KMeans(random_state=25, n_clusters=i)
    clust.fit(X_train_scl)
    scores.append([i, silhouette_score(X_train_scl, clust.labels_)])

scores_df = pd.DataFrame(scores, columns=['n', 'score'])
scores_df.sort_values(by='score', ascending=False)

100%|██████████| 9/9 [00:00<00:00, 69.51it/s]


Unnamed: 0,n,score
2,4,0.396206
8,10,0.39334
7,9,0.366559
3,5,0.351667
0,2,0.348402
1,3,0.330521
6,8,0.323228
5,7,0.287655
4,6,0.285068


In [None]:
cluster = KMeans(random_state=25, n_clusters=4)
cluster.fit(X_train)
train['cluster'] = cluster.labels_


Unnamed: 0,Kyphosis,Age,Number,Start,cluster
63,absent,118,3,16,1
45,present,139,3,10,1
66,absent,195,2,17,2
75,absent,178,4,15,2
60,present,130,4,1,1
25,absent,9,5,13,0
21,present,105,6,5,3
17,absent,175,5,13,2
58,absent,51,7,9,0
0,absent,71,3,5,3
