# Clustering:

General imports & installs

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from biokit.viz import corrplot
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

ModuleNotFoundError: No module named 'biokit'

In [None]:
# pip install plotly==4.1.0

In [2]:
pip install biokit

Collecting biokit
  Downloading biokit-0.5.0.tar.gz (162 kB)
Collecting easydev>=0.9.34
  Downloading easydev-0.12.0.tar.gz (47 kB)
Collecting bioservices>=1.4.5
  Downloading bioservices-1.9.0.tar.gz (199 kB)
Collecting colormap
  Downloading colormap-1.0.4.tar.gz (17 kB)
Collecting biopython
  Downloading biopython-1.79-cp39-cp39-win_amd64.whl (2.3 MB)
Collecting numpydoc
  Downloading numpydoc-1.4.0-py3-none-any.whl (51 kB)
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting grequests
  Downloading grequests-0.6.0-py3-none-any.whl (5.2 kB)
Collecting requests_cache
  Downloading requests_cache-0.9.4-py3-none-any.whl (47 kB)
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting suds-community
  Downloading suds_community-1.1.1-py3-none-any.whl (144 kB)
Collecting appdirs
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting wrapt
  Downloading wrapt-1.14.1-cp39-cp39-win_amd64.whl (35 kB)
Colle

  Building wheel for colormap (setup.py): started
  Building wheel for colormap (setup.py): finished with status 'done'
  Created wheel for colormap: filename=colormap-1.0.4-py3-none-any.whl size=15637 sha256=d29e1be8f970fbd21595c0da00717b2ecd860544199bec1793ca80a7450aa399
  Stored in directory: c:\users\guilh\appdata\local\pip\cache\wheels\85\d0\c0\3ac513b77d6a3d9fdc8e5252f1a5abbcfd8f33a4f6dd8d5de4
Successfully built biokit bioservices easydev colormap
Installing collected packages: zope.interface, zope.event, zipp, ptyprocess, url-normalize, sphinxcontrib-serializinghtml, sphinxcontrib-qthelp, sphinxcontrib-jsmath, sphinxcontrib-htmlhelp, sphinxcontrib-devhelp, sphinxcontrib-applehelp, snowballstemmer, pexpect, importlib-metadata, imagesize, gevent, docutils, colorlog, cattrs, babel, appdirs, alabaster, xmltodict, wrapt, suds-community, sphinx, requests-cache, grequests, easydev, numpydoc, colormap, bioservices, biopython, biokit
Successfully installed alabaster-0.7.12 appdirs-1.4.4 

In [None]:
# pip install threadpoolctl

Loading and visualizing the data

In [None]:
# Read the CSV into a DataFrame, report its dimensions, then preview the first rows.
data = pd.read_csv('data/Wine.csv')
n_rows, n_cols = data.shape
print('The data has', n_rows, 'rows and', n_cols, 'attributes.')
data.head()

## A little of EDA

Using the `describe` method to have a general overview of the descriptive statistics

In [None]:
# Descriptive statistics of the data, rounded to two decimal places.
data.describe().round(decimals=2)

Creating a correlation matrix with `.corr()` method

In [None]:
# Pairwise correlation between the attributes, rounded to four decimal places.
corr_matrix = data.corr()
corr_matrix = corr_matrix.round(decimals=4)
corr_matrix

Creating a correlation graph using _biokit_ `corrplot` method

In [None]:
# Draw the correlation matrix with biokit's Corrplot (plot option upper='ellipse').
corr_graph = corrplot.Corrplot(corr_matrix)
corr_graph.plot(upper = 'ellipse', fontsize= 'x-large')
# Corrplot draws on the current pyplot figure; grab it to enlarge it.
fig = plt.gcf()
fig.set_size_inches(12,8)
# plt.show() renders through the active pyplot backend. Figure.show() requires a GUI
# backend and only emits a warning under the notebook's inline backend.
plt.show()

## Normalizing the Data

Since most ML models do not work properly when the data attributes vary over very different scales, normalization techniques can be used to bring all attributes into the same range of values (for example from -1 to 1 or, as done below, from 0 to 1).

Manual normalization using the min-max formula (the one `MinMaxScaler` implements)

In [None]:
# Min-max scale every column to the [0, 1] range: (x - min) / (max - min).
# Column-wise pandas reductions compute each min/max once for the whole frame and
# skip missing values, unlike Python's built-in min()/max() called on each Series.
# A constant column (max == min) still yields NaN, as it did with the loop version.
attributes = data.columns   # kept: later cells reuse the column names
col_min = data.min()
col_max = data.max()
data = (data - col_min) / (col_max - col_min)

In [None]:
# Preview the first rows to confirm the values now lie between 0 and 1.
data.head(n=5)

In [None]:
# Descriptive statistics of the scaled data, rounded to two decimal places.
data.describe().round(decimals=2)

All the `max` values are set to 1 and `min` values set to 0

Normalizing the data with SK Learn

In [None]:
# Same min-max scaling, this time delegated to scikit-learn's MinMaxScaler.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(data)
np_data = min_max_scaler.transform(data)
# The scaler returns a plain array, so rebuild the DataFrame with the original column names.
data = pd.DataFrame(data = np_data, columns = attributes)
data.describe().round(decimals=2)

Using _SKLearn_ we get the same output as with our manual method

## Introduction to K-means

K-means with 4 clusters, for visualization

In [None]:
# Fit K-means with 4 clusters on the scaled data.
# random_state pins the centroid initialisation so the labels are reproducible across
# runs; n_init is set explicitly (10 restarts) instead of relying on the library
# default, which differs between scikit-learn versions.
k_means = KMeans(n_clusters=4, n_init=10, random_state=42)
clustered_data = k_means.fit(data)   # fit() returns the fitted estimator itself
labels = clustered_data.labels_      # cluster index assigned to each row
print(labels)

In [None]:
# 2D plotly scatter of Color_Intensity vs. Alcohol, coloured by K-means cluster label.
fig = go.Figure()
fig.add_trace(go.Scatter(x = data['Color_Intensity'], y = data['Alcohol'], mode = 'markers',
                         marker = dict(color = labels.astype(float)), text = labels))
# Axis titles added so the figure is readable on its own (the 3D plots already label their axes).
fig.update_layout(autosize=False, width=1000, height=500, margin=dict(l=50, r=50, b=50, t=50, pad=4), paper_bgcolor="darkgray",
                  xaxis_title = 'Color Intensity', yaxis_title = 'Alcohol')
fig.show()

K-means with 3 clusters, for visualization

In [None]:
# Fit K-means with 3 clusters on the scaled data.
# random_state pins the centroid initialisation so the labels are reproducible across
# runs; n_init is set explicitly (10 restarts) instead of relying on the library
# default, which differs between scikit-learn versions.
k_means = KMeans(n_clusters=3, n_init=10, random_state=42)
clustered_data = k_means.fit(data)   # fit() returns the fitted estimator itself
labels = clustered_data.labels_      # cluster index assigned to each row
print(labels)

In [None]:
# 2D plotly scatter of Color_Intensity vs. Alcohol, coloured by K-means cluster label.
fig = go.Figure()
fig.add_trace(go.Scatter(x = data['Color_Intensity'], y = data['Alcohol'], mode = 'markers',
                         marker = dict(color = labels.astype(float)), text = labels))
# Axis titles added so the figure is readable on its own (the 3D plots already label their axes).
fig.update_layout(autosize=False, width=1000, height=500, margin=dict(l=50, r=50, b=50, t=50, pad=4), paper_bgcolor="darkgray",
                  xaxis_title = 'Color Intensity', yaxis_title = 'Alcohol')
fig.show()

Plotting the 3-cluster K-means result in 3D

In [None]:
# 3D view of the clustering: Color_Intensity, Alcohol and Proline on the three axes,
# with each point coloured by its cluster label.
points_3d = go.Scatter3d(
    x = data['Color_Intensity'],
    y = data['Alcohol'],
    z = data['Proline'],
    mode = 'markers',
    marker = dict(color = labels.astype(float)),
    text = labels,
)
fig = go.Figure()
fig.add_trace(points_3d)
fig.update_layout(
    autosize=False, width=1000, height=500,
    margin=dict(l=50, r=50, b=50, t=50, pad=4),
    paper_bgcolor="black",
    scene=dict(xaxis_title = 'Color Intensity', yaxis_title = 'Alcohol', zaxis_title = 'Proline'),
)
fig.show()

Adding the center of each cluster

In [None]:
# Cluster centers of the fitted K-means model as a DataFrame, using the attribute names as columns.
center = pd.DataFrame(data = k_means.cluster_centers_, columns = attributes)
center

In [None]:
fig = go.Figure()
# Data points, coloured by their cluster label
fig.add_trace(go.Scatter3d(x = data['Color_Intensity'], y = data['Alcohol'],z = data['Proline'], mode = 'markers' , marker = dict(color = labels.astype(float)), text= labels))
# Adding the cluster centers here. The hover text comes from the row index of `center`
# instead of a hard-coded [0, 1, 2], so it stays in sync with the number of rows in `center`.
fig.add_trace(go.Scatter3d(x = center['Color_Intensity'], y = center['Alcohol'], z = center['Proline'], mode = 'markers', marker = {'color': 'green'}, text = center.index.tolist()))

fig.update_layout(autosize=False, width=1000, height=700, margin=dict(l=50, r=50, b=50, t=50, pad=4), paper_bgcolor="black", 
                  scene=dict(xaxis_title = 'Color Intensity', yaxis_title = 'Alcohol', zaxis_title = 'Proline'))
fig.show()

## Introduction to DBSCAN

Density-Based Spatial Clustering of Applications with Noise

In [None]:
# Configure DBSCAN with the Manhattan metric (eps=1.31, min_samples=15) and fit it on the data.
dbscan = DBSCAN(metric = 'manhattan', eps = 1.31, min_samples = 15)
dbscan_cluster = dbscan.fit(data)

In [None]:
# Label assigned to each sample by the fitted DBSCAN model
# (`dbscan_cluster` is the estimator returned by `dbscan.fit`, i.e. `dbscan` itself).
dbscan.labels_

In [None]:
# Distinct labels produced by DBSCAN, to see how many clusters were found
# (-1 is the label given to noise).
found_labels = dbscan_cluster.labels_
np.unique(found_labels)

>Key Ideas <br>
- _DBSCAN_ works by selecting a point and checking the number of neighbouring points within a distance `eps` of it
- The `min_samples` parameter sets how many neighbours a point needs for its neighbourhood to be treated as part of a cluster; samples that do not reach it (and are not close to such a point) are considered noise
- The `eps` parameter delimits the radial distance that the model uses to look for near neighbours
- It is better suited than k-means when there is noise in the dataset or when the clusters do not form radial (roughly spherical) groups

### Varying some parameters in DBSCAN

Varying the `eps` parameter

In [None]:
# Very small 'eps': the labels below show that all samples end up classified as noise.
dbscan = DBSCAN(metric = 'manhattan', min_samples = 15, eps = 0.01)
dbscan_cluster = dbscan.fit(data)
dbscan_cluster.labels_

In [None]:
# Very large 'eps': the labels below show that all samples fall into one single cluster.
dbscan = DBSCAN(metric = 'manhattan', min_samples = 15, eps = 10)
dbscan_cluster = dbscan.fit(data)
dbscan_cluster.labels_

Varying the `min_samples` parameter

In [None]:
# Fixed eps of 1.5 with min_samples raised to 30.
dbscan = DBSCAN(metric = 'manhattan', eps = 1.5, min_samples = 30)
dbscan_cluster = dbscan.fit(data)
dbscan_cluster.labels_

In [None]:
# Fixed eps of 1.5 with min_samples raised further, to 80.
dbscan = DBSCAN(metric = 'manhattan', eps = 1.5, min_samples = 80)
dbscan_cluster = dbscan.fit(data)
dbscan_cluster.labels_

We can see that the higher the `min_samples`, the harder it is for the model to create a cluster, so more data is classified as noise.