# Рубежный контроль №2

**Лясковский Максим Альбертович**, группа ИУ5-24М. Вариант №2.

## Задание
Необходимо решить задачу кластеризации на основе любого выбранного Вами датасета.

Кластеризовать данные с помощью трех различных алгоритмов кластеризации. Алгоритмы выбираются произвольным образом, рекомендуется использовать алгоритмы из лекции.

Сравнить качество кластеризации для трех алгоритмов с помощью следующих метрик качества кластеризации:

1. Adjusted Rand index
2. Adjusted Mutual Information
3. Homogeneity, completeness, V-measure
4. Коэффициент силуэта

Сделать выводы о том, какой алгоритм осуществляет более качественную кластеризацию на Вашем наборе данных.

In [44]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_completeness_v_measure
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from itertools import cycle, islice
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering

# Enable inline plots
%matplotlib inline

# Ask the inline backend to render figures in the "retina" (high-DPI) format.
# NOTE(review): IPython.display.set_matplotlib_formats is reported as
# deprecated in newer IPython releases in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats - confirm against
# the installed IPython version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

Зададим ширину текстового представления данных, чтобы в&nbsp;дальнейшем текст в&nbsp;отчёте влезал на&nbsp;А4<cite data-cite="doc:pandas"></cite>:

In [8]:
# Limit the width of pandas' text output to 70 characters.
pd.set_option("display.width", 70)

In [9]:
# Load the dataset (pandas reads the CSV from the zip archive at this path).
data = pd.read_csv('data/heart-disease-uci.zip')

In [10]:
# First 5 rows of the dataset
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
# Size of the dataset as (rows, columns)
data.shape

(303, 14)

In [12]:
# Column data types
data.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [13]:
# Check for missing values: number of nulls per column
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [17]:
# Work on a copy so the loaded DataFrame `data` is left untouched.
df = data.copy()
# Feature matrix: every column except the class label.
X = df.drop("target", axis=1)
# Ground-truth labels; later passed as the reference labelling to the
# external clustering metrics.
y = df["target"]
# NOTE(review): X is handed to the clustering estimators as-is; StandardScaler
# is imported in this notebook but never applied. Distance-based methods are
# sensitive to feature scale - consider scaling and confirm the effect on the
# reported metrics.

In [14]:
def visualize_clusters(cluster_datasets, cluster_results, datasets_names=None):
    """
    Draw clustering results as a 2x3 grid of 2-D scatter plots.

    One subplot is drawn per (dataset, labels) pair. Only the first two
    columns of each dataset are plotted and both axes are fixed to
    [-2.5, 2.5]. The grid has six cells, so at most six pairs can be shown.

    Parameters
    ----------
    cluster_datasets : iterable of array-like
        Datasets to plot; each is converted with ``np.asarray`` (so pandas
        DataFrames are accepted) and must have at least two columns.
    cluster_results : iterable of array-like of int
        Cluster labels for the matching dataset. The label ``-1`` is drawn
        in black.
    datasets_names : sequence of str, optional
        Subplot titles, one per dataset. When omitted, a module-level
        ``datasets_names`` variable is used if one is defined (this is what
        the function relied on before the parameter existed); otherwise the
        subplots are left untitled.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    palette = ['#377eb8', '#ff7f00', '#4daf4a',
               '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']

    if datasets_names is None:
        # Backward compatibility: fall back to the notebook-level variable.
        datasets_names = globals().get("datasets_names")

    # plt.figure (not plt.subplots) so that no extra full-figure Axes is
    # created underneath the grid cells added below.
    plt.figure(figsize=(10, 7))

    pairs = zip(cluster_datasets, cluster_results)
    for plot_num, (X, y_pred) in enumerate(pairs, start=1):
        points = np.asarray(X)
        labels = np.asarray(y_pred, dtype=int)
        plt.subplot(2, 3, plot_num)

        # One palette colour per cluster id, cycling if there are more
        # clusters than palette entries.
        n_clusters = int(labels.max()) + 1 if labels.size else 0
        colors = np.array(list(islice(cycle(palette), n_clusters)))
        # Black goes last, so the label -1 (outliers) indexes it.
        colors = np.append(colors, ["#000000"])

        plt.scatter(points[:, 0], points[:, 1], s=3, color=colors[labels])
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        if datasets_names is not None and plot_num <= len(datasets_names):
            plt.title(datasets_names[plot_num - 1])

    plt.show()

In [22]:
def do_clustering(cluster_datasets, method):
    """
    Cluster each dataset with the given estimator.

    Parameters
    ----------
    cluster_datasets : iterable
        Datasets to cluster, one after another.
    method : object
        Estimator exposing ``fit_predict(X)``; the same instance is reused
        for every dataset.

    Returns
    -------
    list
        The ``fit_predict`` result for each dataset, in input order.
    """
    return [method.fit_predict(dataset) for dataset in cluster_datasets]

In [15]:
import warnings
# Silence every FutureWarning from this point on, so library deprecation
# notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

def claster_metrics(method, cluster_datasets, cluster_true_y, datasets_names):
    """
    Fit a clustering estimator on each dataset and tabulate quality metrics.

    For every (dataset, true labels) pair the estimator is fitted with
    ``fit_predict`` and the following are computed: Adjusted Rand index,
    Adjusted Mutual Information, homogeneity, completeness, V-measure
    (all against the true labels) and the silhouette coefficient (from the
    data and the predicted labels only).

    Parameters
    ----------
    method : object
        Estimator exposing ``fit_predict(X)``; refitted for each dataset.
    cluster_datasets : iterable
        Datasets to cluster.
    cluster_true_y : iterable
        Ground-truth labels, paired with ``cluster_datasets`` by position.
    datasets_names : sequence of str
        Display names; the first ``k`` are used, where ``k`` is the number
        of datasets actually processed.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with columns ``Datasets``, ``ARI``, ``AMI``,
        ``Homogeneity``, ``Completeness``, ``V-measure``, ``Silhouette``.
        ``Silhouette`` is NaN when the predicted labelling has fewer than
        2 or more than ``n_samples - 1`` distinct clusters, where the
        coefficient is undefined.

    Raises
    ------
    ValueError
        If fewer names than processed datasets are supplied.
    """
    ari = []
    ami = []
    homogeneity = []
    completeness = []
    v_measure = []
    silhouette = []

    for X, true_y in zip(cluster_datasets, cluster_true_y):
        labels = method.fit_predict(X)

        ari.append(adjusted_rand_score(true_y, labels))
        ami.append(adjusted_mutual_info_score(true_y, labels))

        h, c, v = homogeneity_completeness_v_measure(true_y, labels)
        homogeneity.append(h)
        completeness.append(c)
        v_measure.append(v)

        # silhouette_score raises outside 2..n_samples-1 clusters (e.g. when
        # the estimator puts everything in one cluster); record NaN instead
        # of aborting the whole table.
        n_found = len(np.unique(labels))
        if 2 <= n_found <= len(labels) - 1:
            silhouette.append(silhouette_score(X, labels))
        else:
            silhouette.append(np.nan)

    # Take exactly as many names as there are result rows; the previous
    # fixed [0:5] slice broke whenever the two counts differed.
    n_rows = len(ari)
    names = list(datasets_names)[:n_rows]
    if len(names) < n_rows:
        raise ValueError(
            "datasets_names has %d entries but %d datasets were processed"
            % (len(names), n_rows))

    result = pd.DataFrame({'Datasets': names,
                           'ARI': ari, 'AMI': ami,
                           'Homogeneity': homogeneity,
                           'Completeness': completeness,
                           'V-measure': v_measure, 'Silhouette': silhouette})
    return result

In [24]:
# K-means with 3 clusters on the feature matrix; %time reports the run time.
# NOTE(review): no random_state is passed, so the labelling (and the metrics
# computed from a separate fit) may differ between runs - confirm whether
# reproducibility matters here.
%time result_KMeans_3 = do_clustering([X], KMeans(n_clusters=3))

CPU times: user 35.2 ms, sys: 4.36 ms, total: 39.5 ms
Wall time: 36.8 ms


In [25]:
%time result_KMeans_5 = do_clustering([X], KMeans(n_clusters=5))

CPU times: user 41.3 ms, sys: 2.26 ms, total: 43.6 ms
Wall time: 40.8 ms


In [26]:
# NOTE(review): 'k-means++' is, per the scikit-learn documentation, already
# the default `init` of KMeans, so this looks like the same configuration as
# the plain KMeans(n_clusters=3) run rather than a different algorithm -
# confirm against the installed scikit-learn version.
%time result_KMeans_plus_3 = do_clustering([X], KMeans(n_clusters=3, init='k-means++'))

CPU times: user 39 ms, sys: 949 µs, total: 40 ms
Wall time: 38.4 ms


In [27]:
%time result_KMeans_batch_3 = do_clustering([X], MiniBatchKMeans(n_clusters=3))

CPU times: user 23.5 ms, sys: 3.96 ms, total: 27.4 ms
Wall time: 25 ms


In [29]:
# Compute the metrics for KMeans with 3 clusters. A new estimator is passed
# in and fitted inside claster_metrics; result_KMeans_3 is not reused.
claster_metrics(KMeans(n_clusters=3), [X], [y], ['heart deseases'])

Unnamed: 0,Datasets,ARI,AMI,Homogeneity,Completeness,V-measure,Silhouette
0,heart deseases,0.008493,0.00534,0.012785,0.008552,0.010249,0.287765


In [36]:
# Compute the metrics for KMeans with 5 clusters. A new estimator is passed
# in and fitted inside claster_metrics; result_KMeans_5 is not reused.
claster_metrics(KMeans(n_clusters=5), [X], [y], ['heart deseases'])

Unnamed: 0,Datasets,ARI,AMI,Homogeneity,Completeness,V-measure,Silhouette
0,heart deseases,0.033312,0.022156,0.055362,0.026928,0.036233,0.277222


In [38]:
%time result_AffinityPropagation = do_clustering([X], AffinityPropagation())

CPU times: user 106 ms, sys: 415 ms, total: 521 ms
Wall time: 44.8 ms


In [39]:
claster_metrics(AffinityPropagation(), [X], [y], ['heart deseases'])

Unnamed: 0,Datasets,ARI,AMI,Homogeneity,Completeness,V-measure,Silhouette
0,heart deseases,0.019729,0.030236,0.176111,0.042314,0.068234,0.232144


In [41]:
%time result_MeanShift = do_clustering([X], MeanShift())

CPU times: user 899 ms, sys: 0 ns, total: 899 ms
Wall time: 898 ms


In [42]:
claster_metrics(MeanShift(), [X], [y], ['heart deseases'])

Unnamed: 0,Datasets,ARI,AMI,Homogeneity,Completeness,V-measure,Silhouette
0,heart deseases,0.000244,-0.002845,7.7e-05,0.000759,0.00014,0.63859


In [46]:
%time result_SpectralClustering = do_clustering([X], SpectralClustering())

CPU times: user 329 ms, sys: 1.41 s, total: 1.73 s
Wall time: 273 ms


  n_init=n_init)


In [47]:
claster_metrics(SpectralClustering(), [X], [y], ['heart deseases'])

  n_init=n_init)


Unnamed: 0,Datasets,ARI,AMI,Homogeneity,Completeness,V-measure,Silhouette
0,heart deseases,0.002792,0.001035,0.014254,0.110949,0.025263,-0.102647
