In [57]:
#импорт библиотек
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.semi_supervised import LabelSpreading
from joblib import Parallel, delayed

In [58]:

def dtw(x, y, dist):
    """
    Align two signals with dynamic time warping (DTW).

    Parameters
    ----------
    x : array-like, shape (r,) or (r, n_features)
        Reference signal (the one the alignment is made to). A 1-D input is
        treated as ``r`` samples with a single feature each.
    y : array-like, shape (c,) or (c, n_features)
        Signal that is aligned to ``x``; 1-D input is handled like ``x``.
    dist : str or callable
        Metric, passed unchanged to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    distance : float
        Accumulated cost in the last cell of ``D1`` (all of ``x`` matched
        against all of ``y``).
    C : numpy.ndarray, shape (r, c)
        Pairwise cost matrix between the samples of ``x`` and ``y``.
    D1 : numpy.ndarray, shape (r, c)
        Accumulated cost matrix.
    path : tuple of two integer numpy.ndarray
        Indices into ``x`` and into ``y`` of the matched samples, ordered
        from the first sample to the last.
    """
    # Accept lists / Series as well: .reshape below requires real ndarrays.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    r, c = len(x), len(y)

    # D0 has an extra first row and column filled with +inf (D0[0, 0] stays 0)
    # so the recurrence below needs no special handling at the borders.
    D0 = np.zeros((r + 1, c + 1))
    D0[0, 1:] = np.inf
    D0[1:, 0] = np.inf
    D1 = D0[1:, 1:]  # view into D0: updates of D1 are visible through D0
    D0[1:, 1:] = cdist(x, y, dist)
    C = D1.copy()

    for i in range(r):
        for j in range(c):
            # Predecessors of D1[i, j] expressed in D0 coordinates:
            # diagonal D0[i, j], same row D0[i + 1, j], same column D0[i, j + 1].
            D1[i, j] += min(D0[i, j], D0[i + 1, j], D0[i, j + 1])

    if r == 1:
        # A single reference sample is matched with every sample of y.
        path = np.zeros(c, dtype=int), np.arange(c)
    elif c == 1:
        # Every reference sample is matched with the single sample of y.
        path = np.arange(r), np.zeros(r, dtype=int)
    else:
        path = _traceback(D0)
    return D1[-1, -1], C, D1, path


def _traceback(D):
    """
    Recover the warping path from the padded accumulated cost matrix.

    ``D`` is the (r + 1, c + 1) matrix built by ``dtw``. Starting from the
    last cell, the walk repeatedly moves to the cheapest of the three
    predecessor cells until it reaches the first cell.

    Returns two integer arrays with the row and column indices of the path,
    ordered from start to end.
    """
    i, j = np.array(D.shape) - 2
    p, q = [i], [j]
    while (i > 0) or (j > 0):
        tb = np.argmin((D[i, j], D[i, j + 1], D[i + 1, j]))
        if tb == 0:
            # diagonal step
            i -= 1
            j -= 1
        elif tb == 1:
            i -= 1
        else:
            j -= 1
        # Collect the steps back-to-front and reverse once at the end;
        # inserting at position 0 on every step is quadratic in path length.
        p.append(i)
        q.append(j)
    p.reverse()
    q.reverse()
    return np.array(p), np.array(q)

In [59]:
# Number of signal value columns per record (v0 ... v{samples_count - 1}).
samples_count = 5000

# Column headers of the table: record name and x/y first, then one column
# per signal value, then the cluster label and the four parameters.
c = (
    ['name', 'x', 'y']
    + [f'v{i}' for i in range(samples_count)]
    + ['cluster', 'p0', 'p1', 'p2', 'p3']
)

In [60]:
# Load the data from the CSV file and use the record name as the index.
# Column names come from `c`; every column, including 'name', is read as
# float32, so the resulting index holds float values.
data = pd.read_csv('signals.csv', names=c, dtype=np.float32)
data = data.set_index('name', drop=True)

In [61]:
# Select the part of the table used as features for cluster assignment.
# The slice runs from the first column up to and including 'v4999', so it
# contains 'x' and 'y' in addition to the signal columns.
# NOTE(review): confirm that including x/y among the features is intended.
data_train = data.loc[:, :'v4999']

In [62]:
# Cluster labels as loaded from the file; used as the target when fitting
# LabelSpreading. Presumably unlabeled rows carry -1 here — TODO confirm.
labels = data['cluster']

In [63]:
# Fit a LabelSpreading model (knn kernel) on the feature table and the
# cluster labels; its `transduction_` attribute is used afterwards as the
# cluster assignment of every row.
label_spread = LabelSpreading(kernel="knn", n_neighbors=30, alpha=0.1, max_iter=150)
label_spread.fit(data_train, labels)

LabelSpreading(alpha=0.1, kernel='knn', max_iter=150, n_neighbors=30)

In [64]:
# Cluster assignment: feature columns plus the label produced by LabelSpreading.
# The explicit .copy() makes data_new an independent frame; adding a column to
# the bare .loc slice raised pandas' SettingWithCopyWarning.
data_new = data.loc[:, :'v4999'].copy()
data_new['cluster'] = label_spread.transduction_

  data_new['cluster'] = label_spread.transduction_


In [65]:
# Replace the 'cluster' column of `data` with the labels produced by
# LabelSpreading. This mutates `data` in place: re-running the earlier cell
# that reads data['cluster'] into `labels` after this one would pick up these
# values instead of the ones loaded from the file.
data['cluster'] = label_spread.transduction_

In [66]:
# Group both tables by cluster label.
# data_new_clus: feature columns + cluster; data_cluster: the full table.
data_new_clus = data_new.groupby('cluster')
data_cluster = data.groupby('cluster')

In [67]:

# Disabled visualization of the clustering result. The plotting code is kept
# inside a string literal, so nothing is drawn; the string itself is only
# echoed as the cell output.
"""
визуализация результата

fig, ax = plt.subplots(figsize=(20, 15))
data_new_clus.plot(x='x', y='y', style='.', ax=ax, markersize=14)
ax.legend(data_new_clus.groups.keys())

"""

"\nвизуализация результата\n\nfig, ax = plt.subplots(figsize=(20, 15))\ndata_new_clus.plot(x='x', y='y', style='.', ax=ax, markersize=14)\nax.legend(data_new_clus.groups.keys())\n\n"

In [68]:
# Working copy of `data` used for the nearest-labeled-point search.
# One '<param>_name' column is added per parameter, initialised to 0.0.
# NOTE(review): 0.0 acts as the "no reference point" marker; a record whose
# name is 0 would be indistinguishable from it — confirm no such name exists.
data1 = data.copy()
for param in ('p0', 'p1', 'p2', 'p3'):
    data1[f'{param}_name'] = 0.0

In [13]:
# Nearest labeled point search, done separately inside every cluster.
# For each parameter in ['p0', 'p1', 'p2', 'p3'], every point whose value is
# -1 (not labeled) gets the name of the closest point, by (x, y) distance,
# whose value is not -1. The name is stored in data1['<param>_name'].
for cluster_num, group_cluster in data_cluster:
    for p in ['p0', 'p1', 'p2', 'p3']:
        labeled = group_cluster[group_cluster[p] != -1]
        non_labeled = group_cluster[group_cluster[p] == -1]

        if non_labeled.empty:
            # Everything in this cluster is already labeled for p.
            continue
        if labeled.empty:
            # No reference point available: argmin over an empty axis would
            # raise ValueError. The '<param>_name' values stay at their default.
            print(f'cluster {cluster_num}: no labeled points for {p}, skipped')
            continue

        labeled_x_y = labeled[['x', 'y']].values
        non_labeled_x_y = non_labeled[['x', 'y']].values

        # Rows: labeled points, columns: unlabeled points.
        distance_matrix = cdist(labeled_x_y, non_labeled_x_y)
        index = distance_matrix.argmin(axis=0)

        labeled_names = labeled.index[index]

        data1.loc[non_labeled.index, p + '_name'] = labeled_names

In [14]:
def cluster_work(data1, p):
    """
    Transfer parameter `p` from reference records to the other records via DTW.

    For every row of `data1` whose '<p>_name' value is not 0, the signal of
    the referenced record (columns 'v0'..'v4999') is aligned with the row's
    own signal using `dtw` with the euclidean metric. The value of `p` stored
    for the referenced record is located on the reference side of the warping
    path, and the matching index on the row's side is taken as the result.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Table indexed by record name, with the signal columns, the parameter
        column `p` and the '<p>_name' column.
    p : str
        Parameter column name, e.g. 'p0'.

    Returns
    -------
    list of tuple
        One ``(value, record name, p)`` tuple per processed row.
    """
    name_column = p + '_name'
    results = []

    for record_name, record in data1.iterrows():
        reference_name = record[name_column]
        if reference_name == 0:
            # No reference assigned to this row: nothing to compute.
            continue

        # x belongs to the labeled (reference) record, y to the current row.
        reference_signal = data1.loc[reference_name, 'v0':'v4999'].values
        own_signal = record['v0':'v4999'].values

        _, _, _, warp_path = dtw(reference_signal, own_signal, 'euclidean')

        # First step of the path that sits on the reference record's value of p.
        reference_value = data1.loc[reference_name, p]
        step = np.where(warp_path[0] == reference_value)[0][0]

        results.append((warp_path[1][step], record_name, p))

    return results

In [17]:
# Rows of data1 assigned to cluster 4 (the cluster id is hard-coded here);
# this subset is what gets passed to cluster_work below.
data_claster = data1[data1['cluster'] == 4]
data_claster

Unnamed: 0_level_0,x,y,v0,v1,v2,v3,v4,v5,v6,v7,...,v4999,cluster,p0,p1,p2,p3,p0_name,p1_name,p2_name,p3_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
314.0,2616.625977,4277.071777,0.946,0.943,0.94,0.937,0.935,0.933,0.932,0.931,...,0.984,4.0,-1.0,-1.0,-1.0,-1.0,746.0,104.0,746.0,188.0
375.0,3095.245117,4185.436035,0.936,0.931,0.926,0.926,0.925,0.922,0.92,0.92,...,0.978,4.0,-1.0,-1.0,-1.0,-1.0,746.0,104.0,746.0,188.0
122.0,3396.368896,3485.878906,0.967,0.964,0.962,0.96,0.958,0.958,0.958,0.957,...,0.979,4.0,-1.0,-1.0,-1.0,-1.0,746.0,613.0,746.0,613.0
560.0,3279.38208,3365.305908,0.728,0.733,0.736,0.739,0.741,0.743,0.745,0.746,...,0.957,4.0,-1.0,-1.0,-1.0,-1.0,746.0,613.0,746.0,613.0
544.0,3178.600098,2460.563965,0.737,0.747,0.756,0.759,0.762,0.765,0.768,0.772,...,0.987,4.0,394.0,-1.0,3481.0,-1.0,0.0,613.0,0.0,613.0
143.0,2520.496094,4024.799072,0.865,0.866,0.867,0.87,0.873,0.877,0.881,0.886,...,1.0,4.0,-1.0,-1.0,-1.0,-1.0,746.0,613.0,746.0,188.0
209.0,2824.539062,4547.60791,0.715,0.715,0.714,0.713,0.71,0.707,0.705,0.704,...,0.966,4.0,-1.0,-1.0,-1.0,-1.0,746.0,104.0,746.0,104.0
13.0,3132.231934,3834.835938,0.889,0.89,0.893,0.898,0.902,0.906,0.909,0.911,...,0.997,4.0,-1.0,-1.0,-1.0,-1.0,746.0,613.0,746.0,613.0
93.0,2568.988037,4560.680176,0.629,0.632,0.638,0.648,0.661,0.676,0.694,0.714,...,0.886,4.0,-1.0,-1.0,-1.0,-1.0,746.0,104.0,746.0,104.0
30.0,2806.860107,3512.833984,0.841,0.84,0.841,0.841,0.841,0.839,0.837,0.838,...,0.961,4.0,-1.0,-1.0,-1.0,-1.0,746.0,613.0,746.0,613.0


In [18]:
# Unfortunately the function is very slow; to speed things up somewhat, the computation is run in parallel over the parameters.
# The calculation takes a long time! One DataFrame record takes roughly 26 s.
# Four tasks are submitted (one per parameter), so at most four of the five workers have work.
result_lists = Parallel(n_jobs=5)(delayed(cluster_work)(data_claster, p) for p in ['p0', 'p1', 'p2', 'p3'])

In [19]:
# Show the raw results: one list per parameter, each holding the
# (value, record name, parameter) tuples returned by cluster_work.
result_lists

[[(141, 314.0, 'p0'),
  (79, 375.0, 'p0'),
  (152, 122.0, 'p0'),
  (283, 560.0, 'p0'),
  (163, 143.0, 'p0'),
  (105, 209.0, 'p0'),
  (166, 13.0, 'p0'),
  (8, 93.0, 'p0'),
  (103, 30.0, 'p0'),
  (434, 188.0, 'p0'),
  (231, 104.0, 'p0'),
  (0, 687.0, 'p0'),
  (155, 201.0, 'p0'),
  (0, 613.0, 'p0'),
  (85, 704.0, 'p0'),
  (226, 561.0, 'p0'),
  (231, 115.0, 'p0'),
  (0, 678.0, 'p0'),
  (221, 467.0, 'p0'),
  (231, 245.0, 'p0'),
  (385, 754.0, 'p0'),
  (46, 364.0, 'p0'),
  (231, 328.0, 'p0'),
  (0, 159.0, 'p0'),
  (167, 371.0, 'p0'),
  (686, 181.0, 'p0'),
  (437, 172.0, 'p0')],
 [(3571, 314.0, 'p1'),
  (3800, 375.0, 'p1'),
  (2253, 122.0, 'p1'),
  (3751, 560.0, 'p1'),
  (2060, 544.0, 'p1'),
  (2953, 143.0, 'p1'),
  (3818, 209.0, 'p1'),
  (3843, 13.0, 'p1'),
  (2990, 93.0, 'p1'),
  (3198, 30.0, 'p1'),
  (2848, 188.0, 'p1'),
  (3903, 746.0, 'p1'),
  (3078, 687.0, 'p1'),
  (2455, 201.0, 'p1'),
  (3774, 704.0, 'p1'),
  (3583, 561.0, 'p1'),
  (4497, 115.0, 'p1'),
  (2913, 678.0, 'p1'),
  (221, 46

In [82]:
# Print the shape of the data1 subset for each cluster id from 0 to 8.
for cluster_id in range(9):
    print(data1[data1['cluster'] == cluster_id].shape)


(169, 5011)
(31, 5011)
(39, 5011)
(104, 5011)
(29, 5011)
(126, 5011)
(62, 5011)
(113, 5011)
(87, 5011)


In [75]:
# Write every computed value back into `data`: the row is selected by record
# name and the column by parameter name. A boolean mask is used, so a name
# that is missing from the index simply matches no row.
for position, record_name, param in (
    entry for per_param in result_lists for entry in per_param
):
    data.loc[data.index == record_name, param] = position

In [76]:
data[data['cluster'] == 4]

Unnamed: 0_level_0,x,y,v0,v1,v2,v3,v4,v5,v6,v7,...,v4995,v4996,v4997,v4998,v4999,cluster,p0,p1,p2,p3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
314.0,2616.625977,4277.071777,0.946,0.943,0.94,0.937,0.935,0.933,0.932,0.931,...,0.984,0.984,0.984,0.984,0.984,4.0,141.0,3571.0,3659.0,4658.0
375.0,3095.245117,4185.436035,0.936,0.931,0.926,0.926,0.925,0.922,0.92,0.92,...,0.975,0.977,0.979,0.979,0.978,4.0,79.0,3800.0,4075.0,4499.0
122.0,3396.368896,3485.878906,0.967,0.964,0.962,0.96,0.958,0.958,0.958,0.957,...,0.984,0.983,0.983,0.981,0.979,4.0,152.0,2253.0,432.0,4156.0
560.0,3279.38208,3365.305908,0.728,0.733,0.736,0.739,0.741,0.743,0.745,0.746,...,0.957,0.957,0.957,0.957,0.957,4.0,283.0,3751.0,3901.0,4627.0
544.0,3178.600098,2460.563965,0.737,0.747,0.756,0.759,0.762,0.765,0.768,0.772,...,0.98,0.981,0.982,0.985,0.987,4.0,394.0,2060.0,3481.0,4637.0
143.0,2520.496094,4024.799072,0.865,0.866,0.867,0.87,0.873,0.877,0.881,0.886,...,0.995,0.996,0.997,0.999,1.0,4.0,163.0,2953.0,3677.0,4658.0
209.0,2824.539062,4547.60791,0.715,0.715,0.714,0.713,0.71,0.707,0.705,0.704,...,0.969,0.968,0.966,0.966,0.966,4.0,105.0,3818.0,3895.0,4285.0
13.0,3132.231934,3834.835938,0.889,0.89,0.893,0.898,0.902,0.906,0.909,0.911,...,0.997,0.997,0.997,0.997,0.997,4.0,166.0,3843.0,3818.0,4756.0
93.0,2568.988037,4560.680176,0.629,0.632,0.638,0.648,0.661,0.676,0.694,0.714,...,0.883,0.884,0.885,0.885,0.886,4.0,8.0,2990.0,3017.0,4946.0
30.0,2806.860107,3512.833984,0.841,0.84,0.841,0.841,0.841,0.839,0.837,0.838,...,0.963,0.963,0.961,0.961,0.961,4.0,103.0,3198.0,3938.0,4578.0


In [77]:
# Export the results to a file: semicolon-separated, cp1251-encoded CSV.
# NOTE(review): the 'data/' directory presumably has to exist beforehand — confirm.
data.to_csv('data/result4.csv',encoding='cp1251', sep=";")