In [26]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import RobustScaler
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

In [2]:
# Load the two datasets used in this notebook:
#   df  - human body measurements (bodies_dataset.csv)
#   df2 - cars data (cars.csv)
df = pd.read_csv('bodies_dataset.csv')
df2 = pd.read_csv('cars.csv')

In [3]:
# List the columns available in the human measurements frame
df.columns

Index(['bia_di', 'bii_di', 'bit_di', 'che_de', 'che_di', 'elb_di', 'wri_di',
       'kne_di', 'ank_di', 'sho_gi', 'che_gi', 'wai_gi', 'nav_gi', 'hip_gi',
       'thi_gi', 'bic_gi', 'for_gi', 'kne_gi', 'cal_gi', 'ank_gi', 'wri_gi',
       'age', 'wgt', 'hgt', 'sex'],
      dtype='object')

In [4]:
# List the columns available in the cars frame
df2.columns

Index(['brand', 'km_driven', 'fuel', 'owner', 'selling_price'], dtype='object')

In [5]:
# Show dtypes and non-null counts of the cars frame to see which columns are numeric
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [6]:
# Keep only the two numeric columns of the cars data that are used for clustering.
# .copy() makes cl_df2 an independent frame rather than a slice of df2, so
# assigning columns to it later does not raise SettingWithCopyWarning.
cl_df2 = df2[['km_driven', 'selling_price']].copy()

## K-Means

In [7]:
# Elbow method (human measurements df): fit K-Means for k = 1..10 and plot the
# inertia of every fit to choose the number of clusters.
# random_state is fixed because K-Means starts from random centroids; without
# it the curve can differ between runs of this cell.
inertials = []
for i in range(1, 11):
    model = KMeans(n_clusters=i, max_iter=300, n_init='auto', random_state=42)
    model.fit(df)
    inertials.append(model.inertia_)
fig = px.line(x=range(1, 11), y=inertials, markers=True,
              title='Cluster inertials',
              labels={'x': 'Number of clusters', 'y': 'Inertia'})
fig.show()

## ^ Assume the optimal number of clusters is 2

In [8]:
# Evaluating and observing results
model = KMeans(n_clusters=2, max_iter=300, n_init='auto')
model.fit(df)
centers = pd.DataFrame(data=model.cluster_centers_, columns=df.columns)
centers

Unnamed: 0,bia_di,bii_di,bit_di,che_de,che_di,elb_di,wri_di,kne_di,ank_di,sho_gi,...,bic_gi,for_gi,kne_gi,cal_gi,ank_gi,wri_gi,age,wgt,hgt,sex
0,36.885821,27.13806,31.004104,17.55597,26.09403,12.472388,9.931716,17.999627,13.096269,100.477612,...,27.961194,23.805224,34.752612,34.538806,21.060448,15.121642,27.869403,58.860821,165.642164,0.16791
1,40.970711,28.605858,33.075314,21.098745,30.08159,14.408787,11.227615,19.720084,14.723431,116.848954,...,34.767364,28.340167,37.829289,37.804603,23.387448,17.191632,32.774059,80.682427,177.312971,0.845188


In [9]:
# Elbow method (cars df): fit K-Means for k = 1..10 on km_driven/selling_price
# and plot the inertia of every fit to choose the number of clusters.
# random_state is fixed because K-Means starts from random centroids; without
# it the curve can differ between runs of this cell.
# NOTE(review): the features are not scaled here, so the column with the larger
# magnitude presumably dominates the distances - consider scaling, as is done
# for DBSCAN further down.
inertials = []
for i in range(1, 11):
    model = KMeans(n_clusters=i, max_iter=300, n_init='auto', random_state=42)
    model.fit(cl_df2)
    inertials.append(model.inertia_)
fig = px.line(x=range(1, 11), y=inertials, markers=True,
              title='Cluster inertials',
              labels={'x': 'Number of clusters', 'y': 'Inertia'})
fig.show()

In [10]:
# Evaluating and observing results
model = KMeans(n_clusters=2, max_iter=300, n_init='auto')
model.fit(cl_df2)
centers = pd.DataFrame(data=model.cluster_centers_, columns=cl_df2.columns)
centers

Unnamed: 0,km_driven,selling_price
0,71867.843369,494945.5
1,22756.20059,3931395.0


## Hierarchical Clustering

In [11]:
# Two-column subset of the human measurements used for hierarchical clustering.
# NOTE(review): the same assignment is repeated at the top of the next cell.
hr_df = df[['che_di', 'hip_gi']]


In [16]:
# Dendrogram for a random sample of 100 rows of the two selected human
# measurement columns (a sample keeps the tree readable).
# random_state makes the sample - and therefore the figure - reproducible.
hr_df = df[['che_di', 'hip_gi']]
fig = ff.create_dendrogram(hr_df.sample(100, random_state=42))
fig.update_layout(title='Hierarchical Dendrogram (humans df)',
                  width=800, height=800)
fig.show()

In [13]:
# Dendrogram for a random sample of 20 cars.
# The two numeric feature columns are selected explicitly so the cell still
# works if cl_df2 has gained a non-numeric column (e.g. 'cluster') elsewhere.
# random_state makes the sample - and therefore the figure - reproducible.
fig = ff.create_dendrogram(
    cl_df2[['km_driven', 'selling_price']].sample(20, random_state=42)
)
fig.update_layout(title='Hierarchical Dendrogram (Cars df)',
                  width=800, height=800)
fig.show()

## DBSCAN

In [14]:
# Run DBSCAN with a neighbourhood radius of eps=3 on the two selected human
# measurement columns, then show the label assigned to every row
# (-1 is the label DBSCAN gives to noise points).
model = DBSCAN(eps=3).fit(df[['che_di', 'hip_gi']])
model.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [15]:
# Plotting results
db_df = df[['che_di', 'hip_gi']]
db_df['cluster'] = np.char.mod('%s', model.labels_) # Better convert numbers array to list
fig = px.scatter(db_df, x='che_di', y='hip_gi', color='cluster',
                 title='DBSCAN result (Humans DF)')
fig.show()

In [42]:
# DBSCAN on the cars data. The features are scaled with RobustScaler first so
# that a single eps applies to both columns.
# The numeric feature columns are selected explicitly: if cl_df2 has gained a
# string 'cluster' column (the plotting cell below adds one), scaling the whole
# frame would fail when this cell is re-run.
car_features = ['km_driven', 'selling_price']
scaled_df = pd.DataFrame(data=RobustScaler().fit_transform(cl_df2[car_features]),
                         columns=car_features, index=cl_df2.index)
model = DBSCAN(eps=1)
model.fit(scaled_df)
model.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [44]:
# Plotting results
db_df = cl_df2
db_df['cluster'] = np.char.mod('%s', model.labels_) # Better convert numbers array to list
fig = px.scatter(db_df, x='km_driven', y='selling_price', color='cluster',
                 title='DBSCAN result (Cars DF)')
fig.show()