In [1]:
#импорт библиотек
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.semi_supervised import LabelSpreading
from joblib import Parallel, delayed
from dtaidistance import dtw

Установка dtaidistance:
* pip install dtaidistance  
__ИЛИ__
* conda install -c conda-forge dtaidistance

# Часть 1

In [2]:
# Number of signal channels (value columns v0 .. v4999) per sample.
samples_count = 5000

# Column headers of the table: identifying columns first, then one column
# per signal channel, then the cluster label and the four parameters.
c = ['name', 'x', 'y']
c.extend(f'v{i}' for i in range(samples_count))
c.extend(['cluster', 'p0', 'p1', 'p2', 'p3'])

In [3]:
# Read the header-less CSV using the prepared column names (every column is
# parsed as float32) and use the 'name' column as the row index.
data = (
    pd.read_csv('signals.csv', names=c, dtype=np.float32)
    .set_index('name', drop=True)
)

In [4]:
# Cluster labels used as the target of the semi-supervised model.
labels = data.loc[:, 'cluster']
# Feature matrix: every column up to and including the last channel 'v4999'
# (with 'name' as the index this covers 'x', 'y' and v0..v4999).
data_train = data.loc[:, :'v4999']

In [5]:
# Fit a LabelSpreading model that is used to assign a cluster to every sample.
label_spread = LabelSpreading(
    kernel="knn",
    n_neighbors=30,
    alpha=0.1,
    max_iter=150,
)
label_spread.fit(data_train, labels)

LabelSpreading(alpha=0.1, kernel='knn', max_iter=150, n_neighbors=30)

In [6]:
# Attach the clusters assigned by the model to the feature columns.
# .assign() returns a new DataFrame, so the column is never written into a
# slice of `data` (assigning into such a slice makes pandas emit a warning).
data_new = data.loc[:, :'v4999'].assign(cluster=label_spread.transduction_)

  data_new['cluster'] = label_spread.transduction_


In [7]:
# Group the samples by their assigned cluster.
data_new_clus = data_new.groupby(by='cluster')

In [8]:

# NOTE(review): the plotting code below is wrapped in a triple-quoted string,
# so nothing is drawn; the cell merely evaluates (and echoes) the string.
# Remove the quotes to re-enable the per-cluster scatter plot of x vs y.
"""
визуализация результата

fig, ax = plt.subplots(figsize=(20, 15))
data_new_clus.plot(x='x', y='y', style='.', ax=ax, markersize=14)
ax.legend(data_new_clus.groups.keys())

"""

"\nвизуализация результата\n\nfig, ax = plt.subplots(figsize=(20, 15))\ndata_new_clus.plot(x='x', y='y', style='.', ax=ax, markersize=14)\nax.legend(data_new_clus.groups.keys())\n\n"

In [9]:
# Overwrite the 'cluster' column of the main table with the cluster the fitted
# LabelSpreading model assigned to every sample (its transduction_ attribute).
data['cluster'] = label_spread.transduction_

# Часть 2

In [10]:
# Group the full table by assigned cluster; the nearest-neighbour search
# below iterates over these groups.
data_cluster = data.groupby(by='cluster')

In [11]:
# Working copy of the table used for the nearest-point search. For each
# parameter a '<p>_name' column is added, initialised to 0.0; it later
# receives the name of the nearest labeled sample.
nearest_name_columns = ['p0_name', 'p1_name', 'p2_name', 'p3_name']
data1 = data.copy()
data1[nearest_name_columns] = 0.0

In [12]:
# Nearest labeled neighbour search.
# Within every cluster and for each parameter p0..p3, every sample whose
# parameter is missing (value -1) is paired with the closest sample (by its
# x, y coordinates) of the same cluster that has the parameter. The name of
# that neighbour is stored in data1['<p>_name'].
for cluster_num, group_cluster in data_cluster:
    for p in ['p0', 'p1', 'p2', 'p3']:
        labeled = group_cluster[group_cluster[p] != -1]
        non_labeled = group_cluster[group_cluster[p] == -1]

        # Without labeled samples there is nothing to match against (argmin
        # over an empty axis would raise ValueError); without unlabeled
        # samples there is nothing to fill in. Either way the '<p>_name'
        # placeholder of this group is left untouched.
        if labeled.empty or non_labeled.empty:
            continue

        # distance_matrix[i, j] is the Euclidean distance (cdist default)
        # between labeled sample i and unlabeled sample j.
        distance_matrix = cdist(labeled[['x', 'y']].values,
                                non_labeled[['x', 'y']].values)
        # For every unlabeled sample (column): row position of the closest
        # labeled sample.
        nearest = distance_matrix.argmin(axis=0)

        data1.loc[non_labeled.index, p + '_name'] = labeled.index[nearest]

In [13]:
def cluster_work(data1, p):
    """
    Transfer parameter ``p`` from labeled samples to unlabeled ones using DTW.

    Every row of ``data1`` whose ``p + '_name'`` entry is non-zero is treated
    as unlabeled; that entry is the index label of the labeled reference
    sample. The signals (columns ``'v0'`` .. ``'v4999'``) of the reference
    and of the row are aligned with ``dtw.warping_path_fast``. The reference's
    value of ``p`` is looked up among the reference-side indices of the
    warping path, and the row-side index of the first matching path step is
    taken as the new value.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Table containing the signal columns, the column ``p`` and the column
        ``p + '_name'``.
    p : str
        Parameter column name (the notebook uses 'p0' .. 'p3').

    Returns
    -------
    list of tuple
        One ``(value, index, p)`` tuple per processed row, where ``index`` is
        the row's index label.

    Raises
    ------
    IndexError
        If the reference value of ``p`` does not occur among the
        reference-side indices of the warping path.
    """
    name_col = p + '_name'
    list_value = []

    for index, row in data1.iterrows():
        ref_name = row[name_col]
        if ref_name == 0:
            # 0 is the placeholder of the name column: no reference sample
            # was assigned, so there is nothing to compute for this row.
            continue

        # x belongs to the labeled (reference) sample, y to the unlabeled one.
        # Both are passed as double arrays — the dtaidistance docs describe
        # the fast C routines as operating on arrays of doubles.
        x = np.asarray(data1.loc[ref_name, 'v0':'v4999'].values, dtype=np.double)
        y = np.asarray(row['v0':'v4999'].values, dtype=np.double)

        path = dtw.warping_path_fast(x, y)  # the dominant cost of this function
        a, b = zip(*path)

        # Explicit array conversion: comparing the plain tuple with a scalar
        # only works element-wise when the scalar happens to be a NumPy type.
        ref_value = data1.loc[ref_name, p]
        matches = np.where(np.asarray(a) == ref_value)[0]
        if matches.size == 0:
            raise IndexError(
                f"value {ref_value!r} of {p!r} for reference sample {ref_name!r} "
                f"is not an index on the warping path"
            )
        x_labeled_new = matches[0]

        list_value.append((b[x_labeled_new], index, p))

    return list_value

In [14]:
# Compute the parameters p0, p1, p2, p3 in parallel, one task per parameter
# (reduces the run time from ~8.5 minutes to ~2.5 minutes).
parameter_names = ['p0', 'p1', 'p2', 'p3']
run_cluster_work = delayed(cluster_work)
result_lists = Parallel(n_jobs=5)(
    run_cluster_work(data1, p) for p in parameter_names
)

In [15]:
# Inspect the raw worker output: one list per parameter, each holding the
# (value, sample name, parameter) tuples returned by cluster_work.
result_lists

[[(0, 168.0, 'p0'),
  (0, 307.0, 'p0'),
  (0, 668.0, 'p0'),
  (382, 229.0, 'p0'),
  (633, 316.0, 'p0'),
  (0, 711.0, 'p0'),
  (1405, 324.0, 'p0'),
  (0, 97.0, 'p0'),
  (1389, 730.0, 'p0'),
  (0, 190.0, 'p0'),
  (0, 583.0, 'p0'),
  (2604, 725.0, 'p0'),
  (596, 60.0, 'p0'),
  (0, 390.0, 'p0'),
  (3427, 362.0, 'p0'),
  (563, 154.0, 'p0'),
  (0, 480.0, 'p0'),
  (141, 314.0, 'p0'),
  (745, 347.0, 'p0'),
  (237, 343.0, 'p0'),
  (2674, 570.0, 'p0'),
  (810, 726.0, 'p0'),
  (816, 9.0, 'p0'),
  (0, 404.0, 'p0'),
  (629, 244.0, 'p0'),
  (1825, 415.0, 'p0'),
  (253, 408.0, 'p0'),
  (4931, 723.0, 'p0'),
  (1264, 492.0, 'p0'),
  (2107, 155.0, 'p0'),
  (3377, 24.0, 'p0'),
  (0, 323.0, 'p0'),
  (0, 641.0, 'p0'),
  (2701, 39.0, 'p0'),
  (0, 146.0, 'p0'),
  (2005, 76.0, 'p0'),
  (27, 211.0, 'p0'),
  (1101, 513.0, 'p0'),
  (759, 728.0, 'p0'),
  (110, 584.0, 'p0'),
  (0, 444.0, 'p0'),
  (2652, 690.0, 'p0'),
  (0, 419.0, 'p0'),
  (0, 214.0, 'p0'),
  (0, 378.0, 'p0'),
  (4824, 149.0, 'p0'),
  (1340, 546.0,

In [16]:
# Print every cluster id together with the number of samples assigned to it.
cluster_column = data1['cluster']
for clus in cluster_column.unique():
    members = data1[cluster_column == clus]
    print(clus, members.shape[0])

7.0 113
2.0 39
6.0 62
3.0 104
0.0 169
5.0 126
8.0 87
1.0 31
4.0 29


In [17]:
# Write the values computed by the workers back into the main table:
# each tuple is (value, sample name, parameter column).
for parameter_results in result_lists:
    for new_value, sample_name, parameter in parameter_results:
        row_mask = data.index == sample_name
        data.loc[row_mask, parameter] = new_value

In [18]:
# Show the samples whose p0 equals 0.
data.loc[data['p0'] == 0]

Unnamed: 0_level_0,x,y,v0,v1,v2,v3,v4,v5,v6,v7,...,v4995,v4996,v4997,v4998,v4999,cluster,p0,p1,p2,p3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
168.0,3642.316895,-107.175003,0.558,0.595,0.519,0.742,0.664,0.467,0.426,0.448,...,0.606,0.523,0.607,0.629,0.664,7.0,0.0,2494.0,2773.0,4999.0
307.0,4034.298096,-668.197998,0.650,0.457,0.572,0.644,0.581,0.525,0.645,0.515,...,0.397,0.579,0.636,0.570,0.530,7.0,0.0,2321.0,2505.0,4999.0
668.0,4358.000977,1703.198975,0.638,0.621,0.646,0.495,0.510,0.611,0.574,0.601,...,0.569,0.587,0.541,0.412,0.519,7.0,0.0,2348.0,2526.0,4999.0
711.0,4382.296875,99.473000,0.468,0.525,0.529,0.503,0.642,0.538,0.515,0.606,...,0.429,0.587,0.557,0.587,0.504,7.0,0.0,2412.0,2620.0,4999.0
97.0,3526.093018,-54.159000,0.601,0.482,0.543,0.533,0.621,0.662,0.499,0.505,...,0.569,0.439,0.442,0.390,0.423,7.0,0.0,2439.0,2628.0,4999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198.0,4303.292969,-1085.218018,0.489,0.605,0.575,0.431,0.636,0.511,0.442,0.626,...,0.465,0.483,0.530,0.351,0.492,8.0,0.0,2608.0,2745.0,4573.0
220.0,3383.717041,-1597.626953,0.633,0.573,0.532,0.607,0.638,0.503,0.533,0.468,...,0.447,0.467,0.508,0.451,0.366,8.0,0.0,2450.0,2615.0,4682.0
273.0,4229.293945,1247.447021,0.474,0.559,0.625,0.515,0.446,0.506,0.387,0.487,...,0.530,0.777,0.662,0.604,0.468,7.0,0.0,2270.0,2495.0,4999.0
498.0,2648.039062,-1611.817993,0.633,0.550,0.548,0.531,0.621,0.565,0.628,0.466,...,0.515,0.642,0.373,0.517,0.520,8.0,0.0,2334.0,2627.0,4721.0


In [19]:
# Show the samples assigned to cluster 0.
data.loc[data['cluster'] == 0]

Unnamed: 0_level_0,x,y,v0,v1,v2,v3,v4,v5,v6,v7,...,v4995,v4996,v4997,v4998,v4999,cluster,p0,p1,p2,p3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
583.0,2753.613037,879.413025,0.893,0.898,0.903,0.908,0.912,0.916,0.918,0.920,...,0.980,0.981,0.982,0.982,0.982,0.0,0.0,0.0,2592.0,3395.0
728.0,1031.917969,2098.483887,0.704,0.707,0.710,0.715,0.719,0.714,0.709,0.701,...,0.860,0.857,0.855,0.851,0.847,0.0,759.0,3711.0,4398.0,4512.0
584.0,2729.747070,683.119019,0.592,0.580,0.565,0.548,0.531,0.515,0.502,0.491,...,0.979,0.979,0.978,0.979,0.984,0.0,110.0,467.0,3605.0,4255.0
3.0,1945.437012,1039.363037,0.720,0.725,0.730,0.735,0.741,0.748,0.749,0.751,...,0.998,1.000,1.000,1.000,0.999,0.0,1162.0,2838.0,3573.0,4195.0
113.0,1745.366943,1135.827026,0.811,0.811,0.815,0.824,0.837,0.852,0.862,0.865,...,0.758,0.753,0.745,0.749,0.751,0.0,0.0,173.0,62.0,4991.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261.0,2442.956055,449.592987,0.561,0.568,0.575,0.581,0.586,0.590,0.594,0.598,...,0.994,0.996,0.998,0.999,0.998,0.0,469.0,3447.0,3808.0,4098.0
27.0,1492.500977,1804.232056,0.600,0.612,0.625,0.637,0.647,0.658,0.667,0.674,...,0.983,0.984,0.984,0.984,0.985,0.0,1699.0,4565.0,4845.0,4882.0
557.0,1379.959961,1532.691040,0.583,0.591,0.602,0.614,0.627,0.637,0.647,0.656,...,0.991,0.990,0.990,0.991,0.993,0.0,683.0,3739.0,4356.0,4070.0
550.0,1183.859009,1250.806030,0.692,0.705,0.713,0.716,0.718,0.720,0.721,0.722,...,0.982,0.983,0.983,0.984,0.984,0.0,866.0,3813.0,4500.0,4371.0


In [20]:
# Export the resulting table to a ';'-separated CSV file in cp1251 encoding.
data.to_csv(
    'results.csv',
    sep=";",
    encoding='cp1251',
)