In [1]:
#Attribute information [Source]:
#1. seismic: the result of shift seismic hazard assessment in the mine working obtained by the seismic method (a — lack of hazard, b — low hazard, c — high hazard, d — danger state);
#2. seismoacoustic: the result of shift seismic hazard assessment in the mine working obtained by the seismoacoustic method;
#3. shift: information about the type of a shift (W — coal-getting, N -preparation shift);
#4. genergy: seismic energy recorded within the previous shift by the most active geophone (GMax) out of
#geophones monitoring the longwall;
#5. gpuls: a number of pulses recorded within the previous shift by GMax;
#6. gdenergy: a deviation of energy recorded within the previous shift by GMax from average energy recorded during eight previous shifts;
#7. gdpuls: a deviation of a number of pulses recorded within the previous shift by GMax from the average number of pulses recorded during eight previous shifts;
#8. ghazard: the result of shift seismic hazard assessment in the mine working obtained by the seismoacoustic method based on registration coming to from GMax only;
#9. nbumps: the number of seismic bumps recorded within the previous shift;
#10. nbumps2: the number of seismic bumps (in energy range [10^2,10^3)) registered within the previous shift;
#11. nbumps3: the number of seismic bumps (in energy range [10^3,10^4)) registered within the previous shift;
#12. nbumps4: the number of seismic bumps (in energy range [10^4,10^5)) registered within the previous shift;
#13. nbumps5: the number of seismic bumps (in energy range [10^5,10^6)) registered within the last shift;
#14. nbumps6: the number of seismic bumps (in energy range [10^6,10^7)) registered within the previous shift;
#15. nbumps7: the number of seismic bumps (in energy range [10^7,10^8)) registered within the previous shift;
#16. nbumps89: the number of seismic bumps (in energy range [10^8,10^10)) registered within the previous shift;
#17. energy: the total energy of seismic bumps registered within the previous shift;
#18. maxenergy: the maximum energy of the seismic bumps registered within the previous shift;
#19. class: the decision attribute — ‘1’ means that high energy seismic bump occurred in the next shift (‘hazardous state’), ‘0’ means that no high energy seismic bumps occurred in the next shift (‘non-hazardous state’).#

In [None]:
!pip install tabulate

%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from tabulate import tabulate
from collections import defaultdict
from scipy.stats.stats import pearsonr

#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#kmeans, dbscan, hierarchical (sklearn)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
#evaluation
from sklearn.metrics import silhouette_score
#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform
# hierarchical (scipy)
from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
df = pd.read_csv('seismic-bumps.csv')
df.dtypes

In [None]:
df = df.convert_dtypes()
df.dtypes

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.columns

## Data Preprocessing

### Categorizing columns in types of features and labels

In [None]:
label = 'class'
col_list_categorical = ['seismic', 'seismoacoustic', 'shift', 'hazard']
col_list_numerical = ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'energy', 'maxenergy']
col_list_discrete = ['nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89']
len([label]) + len(col_list_categorical) + len(col_list_discrete) + len(col_list_numerical)

In [None]:
df.info()

In [None]:
df['seismic'] = df['seismic'].astype("category")
df['seismoacoustic'] = df['seismoacoustic'].astype("category")
df['shift'] = df['shift'].astype("category")
df['genergy'] = df['genergy'].astype(float)
df['gpuls'] = df['gpuls'].astype(float)
df['gdenergy'] = df['gdenergy'].astype(float)
df['gdpuls'] = df['gdpuls'].astype(float)
df['hazard'] = df['hazard'].astype("category")
df['nbumps'] = df['nbumps'].astype(float)
df['nbumps2'] = df['nbumps2'].astype(float)
df['nbumps3'] = df['nbumps3'].astype(float)
df['nbumps4'] = df['nbumps4'].astype(float)
df['nbumps5'] = df['nbumps5'].astype(float)
df['nbumps6'] = df['nbumps6'].astype(float)
df['nbumps7'] = df['nbumps7'].astype(float)
df['nbumps89'] = df['nbumps89'].astype(float)
df['energy'] = df['energy'].astype(float)
df['maxenergy'] = df['maxenergy'].astype(float)
df['class'] = df['class'].astype(float)

df.dtypes

In [None]:
#Print frequency of categories
for col in col_list_categorical:
    print ('\nFrequency of Categories for variable %s'%col)
    print (df[col].value_counts())

# CATEGORICAL VARIABLES DISTRIBUTION ANALYSIS

In [None]:
for e in col_list_categorical:
    df[e].value_counts(normalize=True, ascending=True).plot(kind='barh', title=f'{e} values', color=['royalblue','lightgreen','coral'], figsize=(5,5))
    plt.show()

##  NUMERICAL VARIABLES DISTRIBUTION ANALYSIS

In [None]:
# Kernel-density estimate of the observed values of each numerical feature.
# The KDE is fitted on the raw column: fitting it on value_counts(normalize=True)
# would describe how often each distinct value occurs, not how the values
# themselves are distributed.
for e in col_list_numerical:
    df[e].plot(kind='density', title=f'{e} values', figsize=(5,5))
    plt.show()

#### as we can see, the variables above assume a right skewed distribution. This means that the majority of values are located in left part, around zero.

## DISCRETE VARIABLES DISTRIBUTION ANALYSIS

In [None]:
#overview of distributions
selected_columns = df[['nbumps','nbumps2','nbumps3','nbumps4', 'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89']]
df_discrete = selected_columns.copy()
df_discrete.hist(figsize=(20,15))

### as seen before with the histograms, nbumps6, nbumps7 and nbumps89 contain only zero, so they can be dropped

In [None]:
df.drop(columns=['nbumps6', 'nbumps7', 'nbumps89'], inplace=True)
col_list_discrete = ['nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5']
df.head()

## DECISION VARIABLE

In [None]:
#df['class'].value_counts(normalize=True, ascending=True).plot(kind='barh', title='class values', colors=['#BB0000', '#0000BB'])

sns.countplot(x=df['class'], palette={0:'royalblue',1:'lightcoral'} )

In [None]:
df['class'].value_counts()  #in numero

#### Contingency tables for categorical variables

In [None]:
data_crosstab = pd.crosstab(df['seismic'], df[label], colnames=['class'])
data_crosstab

In [None]:
# Per-category ratio between the two class columns of the seismic crosstab.
# Columns are the class values (0 / 1); rows are addressed by position with .iloc
# because a bare [0] on a label-indexed Series relies on the deprecated positional fallback.
class0 = data_crosstab[0]
class1 = data_crosstab[1]

print(f"bias for seismic a {class1.iloc[0]/class0.iloc[0]}")
print(f"bias for seismic b {class1.iloc[1]/class0.iloc[1]}")

print(f"ratio for seismic a {class0.iloc[0]/class1.iloc[0]}")
print(f"ratio for seismic b {class0.iloc[1]/class1.iloc[1]}")

In [None]:
data_crosstab = pd.crosstab(df['seismoacoustic'], df[label], colnames=['class'])
data_crosstab

In [None]:
# Per-category ratio between the two class columns of the seismoacoustic crosstab.
# Rows are addressed by position with .iloc because a bare [0] on a label-indexed
# Series relies on the deprecated positional fallback.
class0 = data_crosstab[0]
class1 = data_crosstab[1]

print(f"bias for seismoacoustic a {class1.iloc[0]/class0.iloc[0]}")
print(f"bias for seismoacoustic b {class1.iloc[1]/class0.iloc[1]}")
print(f"bias for seismoacoustic c {class1.iloc[2]/class0.iloc[2]}")

print(f"ratio for seismoacoustic a {class0.iloc[0]/class1.iloc[0]}")
print(f"ratio for seismoacoustic b {class0.iloc[1]/class1.iloc[1]}")
print(f"ratio for seismoacoustic c {class0.iloc[2]/class1.iloc[2]}")

In [None]:
data_crosstab = pd.crosstab(df['shift'], df[label])
data_crosstab

In [None]:
# Per-category ratio between the two class columns of the shift crosstab.
# Rows are addressed by position with .iloc because a bare [0] on a label-indexed
# Series relies on the deprecated positional fallback.
class0 = data_crosstab[0]
class1 = data_crosstab[1]

print(f"bias for shift N {class1.iloc[0]/class0.iloc[0]}")
print(f"bias for shift W {class1.iloc[1]/class0.iloc[1]}")

print(f"ratio for shift N {class0.iloc[0]/class1.iloc[0]}")
print(f"ratio for shift W {class0.iloc[1]/class1.iloc[1]}")

In [None]:
data_crosstab = pd.crosstab(df['hazard'], df[label])
data_crosstab

In [None]:
# Per-category ratio between the two class columns of the hazard crosstab.
# Rows are addressed by position with .iloc because a bare [0] on a label-indexed
# Series relies on the deprecated positional fallback.
class0 = data_crosstab[0]
class1 = data_crosstab[1]

print(f"bias for hazard a {class1.iloc[0]/class0.iloc[0]}")
print(f"bias for hazard b {class1.iloc[1]/class0.iloc[1]}")
print(f"bias for hazard c {class1.iloc[2]/class0.iloc[2]}")

print(f"ratio for hazard a {class0.iloc[0]/class1.iloc[0]}")
print(f"ratio for hazard b {class0.iloc[1]/class1.iloc[1]}")
print(f"ratio for hazard c {class0.iloc[2]/class1.iloc[2]}")

In [None]:
df.describe().T

In [None]:
# Per-class mean of every numeric feature; the categorical columns have no mean,
# so they are excluded explicitly (pandas >= 2.0 raises instead of dropping them).
df.groupby('class').mean(numeric_only=True).T

In [None]:
df.isnull().any()  # nessun true quindi nessun valore mancante

In [None]:
# Pairwise correlation of the numeric columns only; the categorical columns
# make DataFrame.corr() raise on pandas >= 2.0.
df.select_dtypes(include='number').corr()

### HIDDEN MISSING VALUES

In [None]:
# Eliminiamo tutte le righe dove nbumps è zero e genergy è superiore al secondo quartile perché li riteniamo missing values
df_nbumps = df[df['nbumps'] == 0]
df_genergy = df_nbumps[df_nbumps['genergy'] >= df['genergy'].quantile(0.5)]
df_genergy

In [None]:
# Eliminiamo tutte le righe dove nbumps è zero e gpuls è superiore al secondo quartile perché li riteniamo missing values
df_nbumps = df[df['nbumps'] == 0]
df_gpuls = df_nbumps[df_nbumps['gpuls'] >= df['gpuls'].quantile(0.5)]
df_gpuls

In [None]:
df = df.drop(df_genergy.index, errors='ignore')
df = df.drop(df_gpuls.index, errors='ignore')
df.shape

In [None]:
# Correlation of the numeric columns after removing the suspected hidden missing values;
# categorical columns are excluded because DataFrame.corr() raises on them in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the numeric-column correlations (categorical columns are excluded,
# since DataFrame.corr() raises on them in pandas >= 2.0).
plt.figure(figsize=[8, 8])
sns.heatmap(data=df.select_dtypes(include='number').corr(), vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

## Inconsistency check

In [None]:
df.head()

In [None]:
selected_columns = df[['nbumps2','nbumps3','nbumps4', 'nbumps5']]
df_discrete = selected_columns.copy()
df[df['nbumps'] != df_discrete.sum(axis=1)] # ci sono due righe inconsistenti

In [None]:
# Drop the rows whose total bump count differs from the sum of the per-energy-range counts.
# The rows are looked up with the same check used to display them, instead of hard-coding
# their index labels, so this cell cannot fail with a KeyError if an earlier filter has
# already removed one of them.
bump_range_cols = ['nbumps2', 'nbumps3', 'nbumps4', 'nbumps5']
inconsistent_rows = df.index[df['nbumps'] != df[bump_range_cols].sum(axis=1)]
df = df.drop(inconsistent_rows)

In [None]:
df[df['maxenergy'] > df['energy']] # non ci sono inconsistenze tra energy e maxenergy

In [None]:
df_nbumps = df[df['nbumps'] > 0]
df_nbumps[df_nbumps['energy'] <= 0]
# non ci sono inconsistenze

In [None]:
# Shifts with at least one bump in the [10^2, 10^3) range must report maxenergy >= 10**2.
# `**` is the power operator; `^` is bitwise XOR in Python (10^2 evaluates to 8).
df_nbumps = df[df['nbumps2'] > 0]
df_nbumps[df_nbumps['maxenergy'] < 10**2]
# an empty result means there are no inconsistencies

In [None]:
# Shifts with at least one bump in the [10^3, 10^4) range must report maxenergy >= 10**3.
# `**` is the power operator; `^` is bitwise XOR in Python (10^3 evaluates to 9).
df_nbumps = df[df['nbumps3'] > 0]
df_nbumps[df_nbumps['maxenergy'] < 10**3]
# an empty result means there are no inconsistencies

In [None]:
# Shifts with at least one bump in the [10^4, 10^5) range must report maxenergy >= 10**4.
# `**` is the power operator; `^` is bitwise XOR in Python (10^4 evaluates to 14).
df_nbumps = df[df['nbumps4'] > 0]
df_nbumps[df_nbumps['maxenergy'] < 10**4]
# an empty result means there are no inconsistencies

In [None]:
# Shifts with at least one bump in the [10^5, 10^6) range must report maxenergy >= 10**5.
# `**` is the power operator; `^` is bitwise XOR in Python (10^5 evaluates to 15).
df_nbumps = df[df['nbumps5'] > 0]
df_nbumps[df_nbumps['maxenergy'] < 10**5]
# an empty result means there are no inconsistencies

## OUTLIER DETECTION

In [None]:
df['nbumps'].plot(kind = 'box')

In [None]:
df[df['nbumps'] >= 8] # quelli maggiori di 8 non sono outlier

In [None]:
df['nbumps2'].plot(kind = 'box') # nbumps2 is the per-shift count of bumps in the [10^2, 10^3) energy range, so this box plot shows counts, not energy values

In [None]:
df[df['nbumps2'] >= 8] # (?) forse è un outlier perché è uno solo

In [None]:
df['nbumps3'].plot(kind = 'box')

In [None]:
df[df['nbumps3'] >= 7] # (?) forse è un outlier perché è uno solo

In [None]:
df['nbumps4'].plot(kind = 'box') 

In [None]:
df[df['nbumps4'] >= 3] # niente outliers

In [None]:
df['nbumps5'].plot(kind = 'box') 

In [None]:
df[df['nbumps5'] >= 1] # niente ouliers, nbumps5 è sempre o 0 o 1

In [None]:
df['genergy'].plot(kind = 'box') 

In [None]:
df[df['genergy'] >= 2.5] # niente outliers

In [None]:
df['gdenergy'].plot(kind = 'box') 

In [None]:
df[df['gdenergy'] >= 1000] # è un outlier perché è uno solo

In [None]:
df['gpuls'].plot(kind = 'box') 

In [None]:
df[df['gpuls'] >= 4000] # niente outliers

In [None]:
df['gdpuls'].plot(kind = 'box') 

In [None]:
df[df['gdpuls'] >= 800] # è un outlier perché è uno solo

In [None]:
selected_columns = df[['genergy','gpuls','gdenergy','gdpuls', 'energy', 'maxenergy']]
df_numerical = selected_columns.copy()
for e in df_numerical:
    print(e, stats.iqr(df_numerical[e]))

In [None]:
selected_columns = df[['nbumps','nbumps2','nbumps3','nbumps4', 'nbumps5']]
df_discrete = selected_columns.copy()
for e in df_discrete:
    print(e, stats.iqr(df_discrete[e]))

In [None]:
#function that detect outliers based on IQR 
def outlier_detector(df_clean, var):
    """Print the IQR statistics of ``var`` and return the rows lying outside the whiskers.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame containing the column to inspect.
    var : str
        Name of the column to inspect.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_clean`` whose ``var`` value is below ``Q1 - 1.5*IQR`` followed by
        the rows whose value is above ``Q3 + 1.5*IQR`` (original index labels kept).
    """
    first_quartile, third_quartile = (df_clean[var].quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile

    print(var)
    print(f'Q1: {first_quartile}\nQ3: {third_quartile}\nIQR: {spread}')

    # Tukey fences: anything further than 1.5 IQR from the quartiles is flagged.
    lower_whisker = first_quartile - 1.5 * spread
    upper_whisker = third_quartile + 1.5 * spread
    print(f'lower range: {lower_whisker}\nupper range: {upper_whisker}')

    too_low = df_clean[var] < lower_whisker
    too_high = df_clean[var] > upper_whisker
    flagged_rows = pd.concat([df_clean[too_low], df_clean[too_high]])

    outliers = sum(too_low) + sum(too_high)
    print(f'{var} has {outliers} potential outliers')
    return flagged_rows

In [None]:
df[df['nbumps'] > 0].shape

In [None]:
for e in df_numerical:
    print(df.shape)
    tmp = outlier_detector(df, e)
    print(tmp)
    print()
    df = df.drop(tmp.index)
    print(df.shape)
    

In [None]:
df[df['nbumps'] > 0].shape

In [None]:
for e in df_discrete:
    print(df.shape)
    tmp = outlier_detector(df, e)
    print(tmp)
    print()
    #df = df.drop(tmp.index) non li togliamo

In [None]:
df.shape

In [None]:
df[df['nbumps'] > 0].shape

In [None]:
df_numerical.boxplot(return_type = 'dict', figsize=(13,5),vert=False)
plt.plot()

In [None]:
df_discrete.boxplot(return_type = 'dict', figsize=(13,5),vert=False)
plt.plot()

In [None]:
# Correlation of the numeric columns after outlier removal; categorical columns are
# excluded because DataFrame.corr() raises on them in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Remove nbumps4 and nbumps5 from the working frame. The `columns=` keyword is used
# because the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
df = df.drop(columns=['nbumps4', 'nbumps5'])

# Feature lists used from here on: raw features, their transformed counterparts,
# and the remaining discrete bump counters.
col_list_normal = ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'energy', 'maxenergy']
col_list_transform = ['log_genergy', 'log_gpuls', 'zscore_gdenergy', 'zscore_gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'shifted_log_energy', 'shifted_log_maxenergy']
col_list_discrete = ['nbumps', 'nbumps2', 'nbumps3']

## TRANSFORMATIONS

### NUMERICAL

In [None]:
plt.figure(figsize=[8, 8])
sns.heatmap(data=df[col_list_numerical].corr(), vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

### Siccome genergy e gpuls hanno un'alta correlazione, vediamo come sono distribuiti rispetto a class

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='genergy', y='gpuls', hue='class', data=df)

In [None]:
# Applying log transform

df['log_genergy'] = np.log(df['genergy'])
df['log_gpuls'] = np.log(df['gpuls'])

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='log_genergy', y='log_gpuls', hue='class', data=df)

#### After applying the log transform we can better see a positive correlation between gpuls and genergy. That could indicate that the number of pulses increases when the energy does, so the number of pulses can be used as an indicator of the energy intensity. (The pulses have been registered by the most active geophone (GMax), which is a sensor designed to detect ground movements.)


### Siccome gdenergy e gdpuls hanno un'alta correlazione, vediamo come sono distribuiti rispetto a class

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='gdenergy', y='gdpuls', hue='class', data=df)

##### gdenergy and gdpuls contain negative values, so a log transformation is not a good choice here. We use a z-score transformation instead (also because these two features are already deviations).

In [None]:
# Applying score transform

df['zscore_gdenergy'] = stats.zscore(df['gdenergy'])
df['zscore_gdpuls'] = stats.zscore(df['gdpuls'])

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='zscore_gdenergy', y='zscore_gdpuls', hue='class', data=df)

### Siccome energy e maxenergy hanno un'alta correlazione, vediamo come sono distribuiti rispetto a class

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='energy', y='maxenergy', hue='class', data=df)

##### There are a lot of zeroes in these two features, so we try a log transformation after adding a constant 1

In [None]:
df['shifted_log_energy'] = np.log(df['energy']+1)
df['shifted_log_maxenergy'] = np.log(df['maxenergy']+1)

In [None]:
plt.figure(figsize=[10, 8])
sns.scatterplot(x='shifted_log_energy', y='shifted_log_maxenergy', hue='class', data=df)

## LOG TRANSFORMED HISTPLOTS

In [None]:
sns.histplot(np.log(df['genergy']), kde=True, stat="density", linewidth=0)

In [None]:
sns.histplot(np.log(df['gpuls']), kde=True, stat="density", linewidth=0)

## NBUMPS CROSSTAB

In [None]:
plt.figure(figsize=[8, 8])
sns.heatmap(data=df[col_list_discrete].corr(), vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

### Since bumps are the number of seismic bumps recorded within the previous shift, it makes sense to check the entity of these bumps, so in which class they fall (0 = non hazardous; 1 = hazardous). We can check it with a crosstab

In [None]:
for col in col_list_discrete:
    bumps_crosstab = pd.crosstab(df[col], df['class'], colnames=['class'], margins = True) #margins is to add the totals
    print(bumps_crosstab)
    print('-----')

### Crosstab with the percentages, to better see the percentage of 0s in comparison to the 1s

In [None]:
for col in col_list_discrete:
    bumps_crosstab = pd.crosstab(df[col], df['class'], colnames=['class'], margins = True).apply(lambda r: r/r.sum(), axis = 1) #margins is to add the totals
    print(bumps_crosstab)
    print('-----')

In [None]:
pclass_xt = pd.crosstab(df['class'], df['nbumps'])
pclass_xt_pct = pclass_xt.div(pclass_xt.sum(1).astype(float), axis=0)  #li trasformo in modo che il valore vada da 0 a 1
pclass_xt_pct.plot(kind='bar', stacked=True, 
                   title='nbumps for class')
plt.xlabel('class')
plt.ylabel('nbumps')
plt.show()

In [None]:
classandseismic = pd.crosstab(df['class'], df['seismic'])
classandseismic = classandseismic.div(classandseismic.sum(1).astype(float), axis=0)
classandseismic.plot(kind='bar', stacked=True, 
                   title='seismic for class')
plt.xlabel('class')
plt.ylabel('seismic')
plt.show()

In [None]:
# Class-by-feature contingency tables, normalised so that each class row sums to 1.
classnbumps2 = pd.crosstab(df['class'], df['nbumps2'])
classnbumps3 = pd.crosstab(df['class'], df['nbumps3'])

# The continuous features are binned into quartiles first. Passing the quantile
# scalars themselves to crosstab creates constant columns, not bins. genergy is
# used for classgenergy to match the name and the title of the plot built from it.
classgenergy = pd.crosstab(df['class'], pd.qcut(df['genergy'], q=4, duplicates='drop'))
classgpuls = pd.crosstab(df['class'], pd.qcut(df['gpuls'], q=4, duplicates='drop'))

classnbumps2 = classnbumps2.div(classnbumps2.sum(axis=1).astype(float), axis=0)
classnbumps3 = classnbumps3.div(classnbumps3.sum(axis=1).astype(float), axis=0)
classgenergy = classgenergy.div(classgenergy.sum(axis=1).astype(float), axis=0)
classgpuls = classgpuls.div(classgpuls.sum(axis=1).astype(float), axis=0)


In [None]:
classnbumps2.plot(kind='bar', stacked=True, 
                   title='nbumps2 for class')
plt.xlabel('class')
plt.ylabel('nbumps2')
plt.show()

In [None]:
classnbumps3.plot(kind='bar', stacked=True, 
                   title='nbumps3 for class')
plt.xlabel('class')
plt.ylabel('nbumps3')
plt.show()

In [None]:
classgenergy.plot(kind='bar', stacked=True, 
                   title='genergy for class')
plt.xlabel('class')
plt.ylabel('genergy')
plt.show()

In [None]:
classgpuls.plot(kind='bar', stacked=True, 
                   title='gpuls for class')
plt.xlabel('class')
plt.ylabel('gpuls')
plt.show()

#### The crosstabs highlight that the majority of bumps - including the ones that registered an energy intensity in the upper bound of the range - have been classified as 'non-hazardous state', since they belong to class 0. 

## CORRELATION AND HEATMAP

In [None]:
# Correlation of all numeric columns, including the transformed features; categorical
# columns are excluded because DataFrame.corr() raises on them in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the numeric-column correlations (categorical columns are excluded,
# since DataFrame.corr() raises on them in pandas >= 2.0).
dfcorrelazione = df.select_dtypes(include='number').corr()
plt.figure(figsize=[20, 20])
sns.heatmap(data=dfcorrelazione, vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

In [None]:
col_list_normal = ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'energy', 'maxenergy', 'class']
col_list_transform = ['log_genergy', 'log_gpuls', 'zscore_gdenergy', 'zscore_gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'shifted_log_energy', 'shifted_log_maxenergy', 'class']

In [None]:
df[col_list_normal].corr()

In [None]:
plt.figure(figsize=[10, 10])
sns.heatmap(data=df[col_list_normal].corr(), vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

In [None]:
df[col_list_transform].corr()

In [None]:
plt.figure(figsize=[10, 10])
sns.heatmap(data=df[col_list_transform].corr(), vmin=-1, vmax=1, cmap='gist_earth_r', annot=True, square=True, linewidths=1)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

In [None]:
col_list_normal.remove('class')
col_list_transform.remove('class')

In [None]:
df_x_normal = col_list_normal.copy()
df_x_transform = col_list_transform.copy()
X_normal = df[col_list_normal].values
X_transform = df[col_list_transform].values

In [None]:
scaler = StandardScaler()
X_normal_scal = scaler.fit_transform(X_normal)
X_transform_scal = scaler.fit_transform(X_transform)

In [None]:
plt.hist(X_normal_scal[:,2], edgecolor='white')

plt.xticks(fontsize=20)
plt.show()

In [None]:
plt.hist(X_transform_scal[:,2], edgecolor='white')

plt.xticks(fontsize=20)
plt.show()

In [None]:
scaler = MinMaxScaler()
X_normal_minmax = scaler.fit_transform(X_normal)
X_transform_minmax = scaler.fit_transform(X_transform)

In [None]:
plt.hist(X_normal_minmax[:,2], edgecolor='white')

plt.xticks(fontsize=20)
plt.show()

In [None]:
plt.hist(X_transform_minmax[:,2], edgecolor='white')

plt.xticks(fontsize=20)
plt.show()

## KMeans

In [None]:
clust_name = ['0', '1']

### KMeans on normal dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_normal)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_normal, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_normal)

In [None]:
# Scatter plot of every feature pair, coloured by KMeans label, with the centroids as red stars.
# The full range is used: 'class' has already been removed from col_list_normal, so the
# previous `len(...) - 1` bound silently skipped the last feature.
n_features = len(col_list_normal)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_normal[:,i], X_normal[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_normal[i])
        print(col_list_normal[j])

        plt.show()

In [None]:
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_normal, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_normal)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

### KMeans on normal StandardScaled dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_normal_scal)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_normal_scal, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_normal_scal)

In [None]:
# Scatter plot of every standard-scaled feature pair, coloured by KMeans label, centroids as red stars.
# The full range is used: 'class' has already been removed from col_list_normal, so the
# previous `len(...) - 1` bound silently skipped the last feature.
n_features = len(col_list_normal)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_normal_scal[:,i], X_normal_scal[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_normal[i])
        print(col_list_normal[j])

        plt.show()

In [None]:
# Replace any previous cluster labels with those of the current `kmeans` model.
# `columns=` is required on pandas >= 2.0 (positional `axis` was removed);
# errors='ignore' keeps the cell re-runnable when the column is absent.
df = df.drop(columns='kmeans_labels', errors='ignore')
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_normal_scal, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_normal_scal)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

### KMeans on normal MinMaxScaled dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_normal_minmax)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_normal_minmax, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_normal_minmax)

In [None]:
# Scatter plot of every min-max-scaled feature pair, coloured by KMeans label, centroids as red stars.
# The full range is used: 'class' has already been removed from col_list_normal, so the
# previous `len(...) - 1` bound silently skipped the last feature.
n_features = len(col_list_normal)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_normal_minmax[:,i], X_normal_minmax[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_normal[i])
        print(col_list_normal[j])

        plt.show()

In [None]:
# Replace any previous cluster labels with those of the current `kmeans` model.
# `columns=` is required on pandas >= 2.0 (positional `axis` was removed);
# errors='ignore' keeps the cell re-runnable when the column is absent.
df = df.drop(columns='kmeans_labels', errors='ignore')
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_normal_minmax, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_normal_minmax)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

### KMeans on transform dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_transform)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_transform, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_transform)

In [None]:
# Scatter plot of every transformed feature pair, coloured by KMeans label, centroids as red stars.
# The full range is used: 'class' has already been removed from col_list_transform, so the
# previous `len(...) - 1` bound silently skipped the last feature.
n_features = len(col_list_transform)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_transform[:,i], X_transform[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_transform[i])
        print(col_list_transform[j])

        plt.show()

In [None]:
# Replace any previous cluster labels with those of the current `kmeans` model.
# `columns=` is required on pandas >= 2.0 (positional `axis` was removed);
# errors='ignore' keeps the cell re-runnable when the column is absent.
df = df.drop(columns='kmeans_labels', errors='ignore')
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_transform, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_transform)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

### KMeans on transform scaled dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_transform_scal)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_transform_scal, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_transform_scal)

In [None]:
# Scatter plot of every standard-scaled transformed feature pair, coloured by KMeans label,
# centroids as red stars. The full range is used: 'class' has already been removed from
# col_list_transform, so the previous `len(...) - 1` bound silently skipped the last feature.
n_features = len(col_list_transform)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_transform_scal[:,i], X_transform_scal[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_transform[i])
        print(col_list_transform[j])

        plt.show()

In [None]:
# Replace any previous cluster labels with those of the current `kmeans` model.
# `columns=` is required on pandas >= 2.0 (positional `axis` was removed);
# errors='ignore' keeps the cell re-runnable when the column is absent.
df = df.drop(columns='kmeans_labels', errors='ignore')
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_transform_scal, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_transform_scal)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

### KMeans on transform MinMaxScaled dataset

In [None]:
# Find the best K for KMeans
k_to_test = range(2,25,1) # [2,3,4, ..., 24]
silhouette_scores = {}

for k in k_to_test:
    kmeans = KMeans( n_clusters = k )
    kmeans.fit(X_transform_minmax)
    labels_k = kmeans.labels_
    score_k = silhouette_score(X_transform_minmax, labels_k)
    silhouette_scores[k] = score_k
    print(f"Tested kMeans with k = {k}\tSS: {score_k}\tSSE: {kmeans.inertia_}")
    
print("Done!")

In [None]:
kmeans = KMeans( n_clusters=2, n_init=10, max_iter=100 )
kmeans.fit(X_transform_minmax)

In [None]:
# Scatter plot of every min-max-scaled transformed feature pair, coloured by KMeans label,
# centroids as red stars. The matrix was built from col_list_transform, so that list (not
# col_list_normal) supplies the axis names. The full range is used: 'class' has already been
# removed from the list, so the previous `len(...) - 1` bound skipped the last feature.
n_features = len(col_list_transform)
for i in range(n_features):
    for j in range(n_features):
        plt.scatter( X_transform_minmax[:,i], X_transform_minmax[:,j], s=40, edgecolor='black', c= kmeans.labels_ )
        plt.scatter( kmeans.cluster_centers_[:,i], kmeans.cluster_centers_[:,j], c='red', marker='*', s=200 )

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        # names of the features on the x and y axes of the figure shown next
        print(col_list_transform[i])
        print(col_list_transform[j])

        plt.show()

In [None]:
# Replace any previous cluster labels with those of the current `kmeans` model.
# `columns=` is required on pandas >= 2.0 (positional `axis` was removed);
# errors='ignore' keeps the cell re-runnable when the column is absent.
df = df.drop(columns='kmeans_labels', errors='ignore')
df['kmeans_labels'] = kmeans.labels_

bar_pl = df['class'].groupby(df['kmeans_labels']).value_counts(normalize=True).unstack(1)
bar_pl.plot(kind='bar', stacked=True, alpha=0.8, edgecolor='white', linewidth=1.5)

plt.xticks(range(0, len(clust_name)), clust_name, fontsize=20, rotation=0)
plt.yticks(fontsize=20)
plt.legend(bbox_to_anchor=(1,1))

plt.plot()

In [None]:
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(X_transform_minmax, kmeans.labels_))

In [None]:
sse_list = []

for k in range(2, 51):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(X_transform_minmax)
    sse_list.append( kmeans.inertia_ )

In [None]:
plt.plot(range(2, len(sse_list)+2), sse_list, marker='*')
plt.ylabel('SSE')
plt.show()

## DBSCAN

In [None]:
# Wrap each feature matrix in a DataFrame with named columns.
# The names are listed once per feature set, so every frame of a set gets identical
# labels. The fourth transformed column is 'zscore_gdpuls'; it was previously
# labelled 'zscore_gdenergy' a second time, which produced duplicate column names.
normal_feature_names = ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'energy', 'maxenergy']
transform_feature_names = ['log_genergy', 'log_gpuls', 'zscore_gdenergy', 'zscore_gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'shifted_log_energy', 'shifted_log_maxenergy']

df_X_normal = pd.DataFrame(X_normal, columns=normal_feature_names)
df_X_normal_scal = pd.DataFrame(X_normal_scal, columns=normal_feature_names)
df_X_normal_minmax = pd.DataFrame(X_normal_minmax, columns=normal_feature_names)

df_X_transform = pd.DataFrame(X_transform, columns=transform_feature_names)
df_X_transform_scal = pd.DataFrame(X_transform_scal, columns=transform_feature_names)
df_X_transform_minmax = pd.DataFrame(X_transform_minmax, columns=transform_feature_names)

In [None]:
eps_to_test = [round(eps,1) for eps in np.arange(0.1, 2, 0.1)]
min_samples_to_test = range(5, 50, 5)

print("EPS:", eps_to_test) # distanza minima tra i punti per essere nello stesso cluster
print("MIN_SAMPLES:", list(min_samples_to_test)) #numero di punti minimo per formare un cluster

In [None]:
def get_metrics(eps, min_samples, dataset, iter_):
    """Fit DBSCAN for one (eps, min_samples) pair and summarise the result.

    Parameters
    ----------
    eps : float
        Forwarded to ``DBSCAN(eps=...)``.
    min_samples : int
        Forwarded to ``DBSCAN(min_samples=...)``.
    dataset : array-like
        Samples to cluster; passed unchanged to both DBSCAN and
        NearestNeighbors (assumed 2-D, samples x features -- confirm).
    iter_ : int
        Running counter, used only in the log line.

    Returns
    -------
    tuple
        ``(noise_mean_distance, number_of_clusters)``.
        ``noise_mean_distance`` is the mean distance between the points
        labelled as noise (-1) and their nearest neighbours, rounded to three
        decimals, or ``None`` when no point is labelled as noise.
        ``number_of_clusters`` counts the distinct non-negative labels.
    """
    # Fitting ======================================================================
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset).labels_

    # Mean Noise Point Distance metric =============================================
    noise_mask = labels == -1  # True for every point classified as noise

    if noise_mask.any():
        # Six neighbours are requested and column 0 is dropped: when querying
        # the training data itself, the closest hit is normally the point at
        # distance zero, which would otherwise drag the mean down.
        neighbours = NearestNeighbors(n_neighbors=6).fit(dataset)
        distances, _ = neighbours.kneighbors(dataset)
        noise_mean_distance = round(distances[noise_mask, 1:].mean(), 3)
    else:
        noise_mean_distance = None

    # Number of found Clusters metric ==============================================
    number_of_clusters = len(np.unique(labels[labels >= 0]))

    # Log ==========================================================================
    # Only configurations that produce exactly two clusters are reported.
    if number_of_clusters == 2:
        print("%3d | Tested with eps = %3s and min_samples = %3s | %5s %4s" % (iter_, eps, min_samples, str(noise_mean_distance), number_of_clusters))

    return (noise_mean_distance, number_of_clusters)

In [None]:
# Both result grids have one row per eps value and one column per min_samples value.
grid_shape = (len(eps_to_test), len(min_samples_to_test))

# Frame for the mean distance of the noise points from their nearest neighbours
# (zero-filled placeholder until a grid search writes into it).
results_noise = pd.DataFrame(
    np.zeros(grid_shape),
    index=eps_to_test,
    columns=min_samples_to_test,
)

# Frame for the number of clusters found (same layout, zero-filled placeholder).
results_clusters = pd.DataFrame(
    np.zeros(grid_shape),
    index=eps_to_test,
    columns=min_samples_to_test,
)

### DBSCAN on normal dataset

In [None]:
# Grid search on X_normal: every (eps, min_samples) pair is evaluated once and
# its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_normal, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_normal (eps=0.3, min_samples=5 -- presumably picked
# from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=5, eps=0.3)
dbscan.fit(X_normal)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_normal["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_normal, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_normal)-1
# columns of X_normal, coloured by DBSCAN label. The two column names are
# printed just before each figure is shown.
n_plotted = len(col_list_normal) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_normal[:, i], X_normal[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_normal[i], col_list_normal[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_normal.
dist = squareform(pdist(X_normal, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
# NOTE(review): nothing in this section reads kth_distances; it is recomputed
# for the other datasets before the k-distance plot at the end.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

### DBSCAN on normal scaled dataset

In [None]:
# Grid search on X_normal_scal: every (eps, min_samples) pair is evaluated once
# and its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_normal_scal, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_normal_scal (eps=1.4, min_samples=30 -- presumably
# picked from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=30, eps=1.4)
dbscan.fit(X_normal_scal)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_normal_scal["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_normal_scal, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_normal)-1
# columns of X_normal_scal, coloured by DBSCAN label. The two column names are
# printed just before each figure is shown.
n_plotted = len(col_list_normal) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_normal_scal[:, i], X_normal_scal[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_normal[i], col_list_normal[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_normal_scal.
dist = squareform(pdist(X_normal_scal, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
# NOTE(review): nothing in this section reads kth_distances; it is recomputed
# for the other datasets before the k-distance plot at the end.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

### DBSCAN on normal MinMaxScaled dataset

In [None]:
# Grid search on X_normal_minmax: every (eps, min_samples) pair is evaluated once
# and its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_normal_minmax, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_normal_minmax (eps=0.8, min_samples=15 -- presumably
# picked from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=15, eps=0.8)
dbscan.fit(X_normal_minmax)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_normal_minmax["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_normal_minmax, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_normal)-1
# columns of X_normal_minmax, coloured by DBSCAN label. The two column names
# are printed just before each figure is shown.
n_plotted = len(col_list_normal) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_normal_minmax[:, i], X_normal_minmax[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_normal[i], col_list_normal[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_normal_minmax.
dist = squareform(pdist(X_normal_minmax, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
# NOTE(review): nothing in this section reads kth_distances; it is recomputed
# for the other datasets before the k-distance plot at the end.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

### DBSCAN on transform dataset

In [None]:
# Grid search on X_transform: every (eps, min_samples) pair is evaluated once
# and its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_transform, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_transform (eps=1.3, min_samples=35 -- presumably
# picked from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=35, eps=1.3)
dbscan.fit(X_transform)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_transform["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_transform, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_transform)-1
# columns of X_transform, coloured by DBSCAN label. The two column names are
# printed just before each figure is shown.
n_plotted = len(col_list_transform) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_transform[:, i], X_transform[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_transform[i], col_list_transform[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_transform.
dist = squareform(pdist(X_transform, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
# NOTE(review): nothing in this section reads kth_distances; it is recomputed
# for the other datasets before the k-distance plot at the end.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

### DBSCAN on transform scaled dataset

In [None]:
# Grid search on X_transform_scal: every (eps, min_samples) pair is evaluated once
# and its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_transform_scal, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_transform_scal (eps=1.3, min_samples=40 -- presumably
# picked from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=40, eps=1.3)
dbscan.fit(X_transform_scal)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_transform_scal["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_transform_scal, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_transform)-1
# columns of X_transform_scal, coloured by DBSCAN label. The two column names
# are printed just before each figure is shown.
n_plotted = len(col_list_transform) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_transform_scal[:, i], X_transform_scal[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_transform[i], col_list_transform[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_transform_scal.
dist = squareform(pdist(X_transform_scal, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
# NOTE(review): nothing in this section reads kth_distances; it is recomputed
# for X_transform_minmax before the k-distance plot at the end.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

### DBSCAN on transform MinMaxScaled dataset

In [None]:
# Grid search on X_transform_minmax: every (eps, min_samples) pair is evaluated once
# and its metrics overwrite the matching cell of results_noise / results_clusters.
iter_ = 0

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

param_grid = [(e, m) for e in eps_to_test for m in min_samples_to_test]

for iter_, (eps, min_samples) in enumerate(param_grid, start=1):

    # Compute the metrics for this parameter pair
    noise_metric, cluster_metric = get_metrics(eps, min_samples, X_transform_minmax, iter_)

    # Store the results in the corresponding frames
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side annotated heatmaps of the two grid-search metrics
# (rows: eps values, columns: min_samples values).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

panels = (
    (ax1, results_noise, "METRIC: Mean Noise Points Distance"),
    (ax2, results_clusters, "METRIC: Number of clusters"),
)
for axis, frame, title in panels:
    sns.heatmap(frame, annot=True, ax=axis, cbar=False)
    axis.set_title(title)
    axis.set_xlabel("N")
    axis.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

In [None]:
# Final DBSCAN model for X_transform_minmax (eps=0.5, min_samples=35 -- presumably
# picked from the grid-search heatmaps; confirm).
dbscan = DBSCAN(min_samples=35, eps=0.5)
dbscan.fit(X_transform_minmax)

In [None]:
# Store each row's DBSCAN label in a LABEL column (-1 is DBSCAN's noise label).
df_X_transform_minmax["LABEL"] = dbscan.labels_

# Pairwise scatter matrix coloured by label, with histograms on the diagonal.
sns.pairplot(data=df_X_transform_minmax, hue="LABEL", diag_kind='hist');
plt.show()


In [None]:
# One scatter plot per ordered pair (i, j) of the first len(col_list_transform)-1
# columns of X_transform_minmax, coloured by DBSCAN label. The two column names
# are printed just before each figure is shown.
n_plotted = len(col_list_transform) - 1
for i in range(n_plotted):
    for j in range(n_plotted):
        plt.scatter(X_transform_minmax[:, i], X_transform_minmax[:, j], c=dbscan.labels_, s=40, edgecolor='black')

        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        print(col_list_transform[i], col_list_transform[j], sep="\n")

        plt.show()

In [None]:
# Full square matrix of pairwise Euclidean distances between the rows of X_transform_minmax.
dist = squareform(pdist(X_transform_minmax, metric='euclidean'))

# For every point, take the value at position k of its ascending distance row;
# position 0 holds the zero self-distance, so this is the k-th nearest neighbour.
k = 5
kth_distances = [np.sort(row)[k] for row in dist]

In [None]:
# k-distance plot: the distance of every point to its 5th nearest neighbour
# (k = 5 in the cell that builds kth_distances), drawn in ascending order.
plt.plot(range(0, len(kth_distances)), sorted(kth_distances))
plt.xlabel('points sorted by distance')
plt.ylabel('dist from the 5th neighbor')

plt.show()

In [None]:
# Condensed Euclidean distances between the rows of X_transform_minmax,
# fed to complete-linkage hierarchical clustering.
data_dist = pdist(X_transform_minmax, 'euclidean')
data_link = linkage(data_dist, 'complete')

# Dendrogram of the linkage, truncated with the 'lastp' mode.
res = dendrogram(Z=data_link, truncate_mode='lastp')