7/24

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

import seaborn as sns
import missingno as msno

from sklearn.preprocessing import StandardScaler

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler

In [None]:
# Nominal (categorical) signal groups used in this notebook:
#   [UPPER group]    -> classified as 1
#       VVVF_SDR_ENPOW, VVVF_SDR_PoweringMode, VVVF_SDR_P, VVVF_SD_P
#   [Constant group] -> classified as 0
#       VVVF_SD_GATEON, VVVF_BLD_CUR_Valid
#   [Lower group]    -> classified as 1
#       VVVF_SDR_BrakingMode, VVVF_SD_CDR, VVVF_SD_B
upper_cols = ['VVVF_SDR_ENPOW', 'VVVF_SDR_PoweringMode', 'VVVF_SDR_P', 'VVVF_SD_P']
constant_cols = ['VVVF_SD_GATEON', 'VVVF_BLD_CUR_Valid']
lower_cols = ['VVVF_SDR_BrakingMode', 'VVVF_SD_CDR', 'VVVF_SD_B']

# Columns kept for the analysis: the three signal groups, the speed signal
# and the timestamp (same order as before).
sel_cols = [*upper_cols, *constant_cols, *lower_cols, 'VVVF_SDR_ATS_SPEED', 'dDate']

# Data for the 2nd week of April 2023.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
path = 'c:\\Users\\jaeju\\vscode\\Onepredict\\rotem\\Data\\train3_1.csv'
scaler = StandardScaler()

df = pd.read_csv(path)
df['dDate'] = pd.to_datetime(df['dDate'])
# Speed scaling (currently disabled):
# df['VVVF_SDR_ATS_SPEED'] = scaler.fit_transform(df[['VVVF_SDR_ATS_SPEED']])

# Working frame restricted to the selected columns.
df2 = df.loc[:, sel_cols]

In [None]:
# Locate the rows where the speed signal starts and stops being non-zero.
speed = df2['VVVF_SDR_ATS_SPEED']
prev_speed = speed.shift()  # previous row's speed; NaN for the first row

# Indices where the value changes from 0 to a non-zero value (a run starts).
change_to_nonzero = speed.ne(0) & prev_speed.eq(0)
index_to_nonzero = df2.index[change_to_nonzero].tolist()

# Indices where the value changes from a non-zero value to 0 (a run stops).
change_to_zero = speed.eq(0) & prev_speed.ne(0)
# The first detected stop never belongs to a detected start: if the data
# begins at speed 0 it is the first row itself (NaN != 0 is True), and if the
# data begins mid-run it closes a run whose start was not recorded.
# Slicing (instead of `del lst[0]`) also keeps this safe when no stop exists.
index_to_zero = df2.index[change_to_zero].tolist()[1:]

In [None]:
# Bounds on (stop index - start index) for a run to be kept.
MIN_RUN_LEN = 60
MAX_RUN_LEN = 200

# One DataFrame per run, covering the rows from the start index up to and
# including the stop index. `.copy()` detaches each segment from df2 so that
# columns can be added to it later without SettingWithCopyWarning.
run_segments = [
    df2[start : end + 1].copy()
    for start, end in zip(index_to_nonzero, index_to_zero)
    if MIN_RUN_LEN <= end - start < MAX_RUN_LEN
]

# Keyed by consecutive integers 0..n-1 (no gaps for the filtered-out runs).
pca_df = dict(enumerate(run_segments))

def set_label(row):
    """Return the label for one row (upper group -> lower group relabelling).

    Parameters
    ----------
    row : mapping-like
        Must provide 'VVVF_SDR_PoweringMode'; 'VVVF_SD_CDR' is only read
        when the powering-mode flag is not 1.

    Returns
    -------
    int
        1 if 'VVVF_SDR_PoweringMode' equals 1, otherwise 3 if 'VVVF_SD_CDR'
        equals 1, otherwise 2.
    """
    # The powering-mode flag takes precedence over the CDR flag.
    if row['VVVF_SDR_PoweringMode'] == 1:
        return 1
    return 3 if row['VVVF_SD_CDR'] == 1 else 2
    
# Add a per-row 'label' column (computed by set_label) to every segment.
# `assign` returns a new frame that is stored back in the dict, so nothing is
# written into a slice of df2 (no SettingWithCopyWarning) and the former
# placeholder assignment `label = 0` is no longer needed.
for i in range(len(pca_df)):
    pca_df[i] = pca_df[i].assign(label=pca_df[i].apply(set_label, axis=1))
    

In [None]:
SERIES_LEN = 200             # fixed length every speed profile is brought to
HALF_LEN = SERIES_LEN // 2   # zeros added on each side before windowing


def center_to_fixed_length(values):
    """Return a SERIES_LEN-sample, zero-padded, centred copy of ``values``.

    ``values`` (1-D array) is padded with HALF_LEN zeros on both sides and the
    SERIES_LEN-sample window around the middle of the padded array is returned.
    For inputs shorter than SERIES_LEN this centres the data between zeros
    (an odd-length input gets the extra zero on the right); longer inputs are
    centre-cropped.
    """
    padded = np.pad(values, HALF_LEN)
    # ceil() picks the window start for odd padded lengths.
    start = int(np.ceil(padded.shape[0] / 2)) - HALF_LEN
    return padded[start:start + SERIES_LEN]


# One (1, SERIES_LEN, 1) array per segment: (series, time step, feature).
padded_speed_lst = [
    center_to_fixed_length(pca_df[i]['VVVF_SDR_ATS_SPEED'].values).reshape(1, -1, 1)
    for i in range(len(pca_df))
]

# np.concatenate is available in every NumPy release; the `np.concat` alias
# only exists from NumPy 2.0 on.
X_train = np.concatenate(padded_speed_lst, axis=0)


In [None]:
seed = 0
n_clusters = 4   # clusters fitted by every model below (one subplot column each)
n_rows = 3       # one subplot row per metric: Euclidean, DBA, Soft-DTW

# Shuffle the sample order in place with a seeded generator so the order is
# reproducible; previously `seed` was only passed to the estimators.
np.random.default_rng(seed).shuffle(X_train)
# Rescale every series with TimeSeriesScalerMeanVariance (default settings)
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)
# Make time series shorter
X_train = TimeSeriesResampler(sz=100).fit_transform(X_train)
sz = X_train.shape[1]


def plot_cluster_row(model, labels, row, title):
    """Draw one row of subplots: member series (grey) and centre (red) per cluster.

    Parameters
    ----------
    model : fitted estimator exposing ``cluster_centers_``
    labels : array of cluster indices, one per series in ``X_train``
    row : int
        Zero-based row of the subplot grid to draw into.
    title : str
        Title placed above the second subplot of the row.
    """
    for yi in range(n_clusters):
        plt.subplot(n_rows, n_clusters, row * n_clusters + yi + 1)
        for xx in X_train[labels == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(model.cluster_centers_[yi].ravel(), "r-")
        plt.xlim(0, sz)
        plt.ylim(-4, 4)
        plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
        if yi == 1:
            plt.title(title)


# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=n_clusters, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

# The grid has exactly n_rows rows, so no blank row is left at the bottom.
plt.figure(figsize = (30,12))
plot_cluster_row(km, y_pred, 0, "Euclidean $k$-means")

# DBA-k-means
print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=n_clusters,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
y_pred = dba_km.fit_predict(X_train)
plot_cluster_row(dba_km, y_pred, 1, "DBA $k$-means")

# Soft-DTW-k-means
print("Soft-DTW k-means")
sdtw_km = TimeSeriesKMeans(n_clusters=n_clusters,
                           metric="softdtw",
                           metric_params={"gamma": .01},
                           verbose=True,
                           random_state=seed)
# After this cell `y_pred` holds the Soft-DTW assignments.
y_pred = sdtw_km.fit_predict(X_train)
plot_cluster_row(sdtw_km, y_pred, 2, "Soft-DTW $k$-means")

plt.tight_layout()
plt.show()


In [None]:
# Overlay every series currently assigned to cluster 0 (first feature channel).
cluster_0_series = X_train[y_pred == 0]
for series in cluster_0_series:
    plt.plot(series[:, 0])

In [None]:
# Overlay every series currently assigned to cluster 1 (first feature channel).
cluster_1_series = X_train[y_pred == 1]
for series in cluster_1_series:
    plt.plot(series[:, 0])

In [None]:
# Overlay every series currently assigned to cluster 2 (first feature channel).
# NOTE(review): the models above are fitted with 4 clusters, but cluster
# index 3 is never plotted on its own - confirm whether that is intended.
cluster_2_series = X_train[y_pred == 2]
for series in cluster_2_series:
    plt.plot(series[:, 0])