In [None]:
# DTW clustering
# Inspect the scale of each variable that will be used for clustering.
stock_info.loc[:, 'daily_mov'].describe()

In [None]:
# Summary statistics of the total holdings quantity column.
trade_info.loc[:, 'tot_hld_qty'].describe()

In [None]:
# Summary statistics of the pft_ivo_rt column.
trade_info.loc[:, 'pft_ivo_rt'].describe()

In [None]:
# The total holdings quantity is on a much larger scale than the other
# variables, so apply log(1 + x) to bring it to a comparable magnitude.
trade_info['tot_hld_qty_lg'] = np.log1p(trade_info.loc[:, 'tot_hld_qty'])

In [None]:
# Reshape to wide pivot tables for the cluster analysis:
# one row per date (bse_dt), one column per ticker (tck_iem_cd).
stock1, stock3 = (
    trade_info.pivot(index='bse_dt', columns='tck_iem_cd', values=value_col)
    for value_col in ('pft_ivo_rt', 'tot_hld_qty_lg')
)
stock2 = stock_info.pivot(index='bse_dt', columns='tck_iem_cd', values='daily_mov')

In [None]:
# Collect the three pivot tables so they can be processed together.
stocks = [
    stock1,  # pft_ivo_rt
    stock2,  # daily_mov
    stock3,  # tot_hld_qty_lg
]

In [None]:
# Smooth out noise by replacing every series with its 5-day moving average.
rolling_window = 5

# Tickers to cluster: those that have rows in both trade_info and stock_info
# for the same (bse_dt, tck_iem_cd) pair.
stock = trade_info[['tck_iem_cd', 'bse_dt', 'tot_hld_qty', 'pft_ivo_rt']]
stock = pd.merge(
    stock,
    stock_info[['tck_iem_cd', 'bse_dt', 'daily_mov']],
    on=['bse_dt', 'tck_iem_cd'],
    how='inner',
)

x_list = stock['tck_iem_cd'].unique()

for var in stocks:
    # Drop tickers outside x_list so every pivot ends up with the same columns;
    # otherwise they would stay unsmoothed and break the later stacking step.
    var.drop(columns=var.columns.difference(x_list), inplace=True)
    # A distinct loop name keeps the merged `stock` frame from being clobbered.
    for ticker in x_list:
        var[ticker] = var[ticker].rolling(window=rolling_window).mean()
    # The first (rolling_window - 1) rows are NaN after smoothing; rows with any
    # other missing value are removed as well.
    var.dropna(inplace=True)

# Keep only the dates that survived in all pivots so the frames stay aligned
# row-for-row when they are stacked into a single array.
common_dates = stocks[0].index
for var in stocks[1:]:
    common_dates = common_dates.intersection(var.index)
for var in stocks:
    var.drop(index=var.index.difference(common_dates), inplace=True)

In [None]:
# Check for remaining missing values in each pivot table.
stocks = [stock1, stock2, stock3]

for position, frame in enumerate(stocks, start=1):
    missing_total = frame.isna().sum().sum()
    # One print call emits the header, the count and a trailing blank line.
    print(f"Missing values in stock{position}:", missing_total, "", sep="\n")

In [None]:
# Normalisation
from sklearn.preprocessing import MinMaxScaler

# The same scaler instance is re-fitted on each transposed pivot table
# (rows become tickers, columns become dates).
# NOTE(review): MinMaxScaler scales each column independently, so after the
# transpose every date is scaled across tickers — confirm this is intended
# rather than scaling each ticker's own series.
scaler = MinMaxScaler()
scaled1, scaled2, scaled3 = (
    scaler.fit_transform(frame.T) for frame in (stock1, stock2, stock3)
)

In [None]:
# Build the combined dataset: the three scaled 2-D arrays are stacked along a
# new third axis, giving one feature channel per variable.
scaled_total = np.dstack((scaled1, scaled2, scaled3))

In [None]:
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import silhouette_score

In [None]:
# Compute WCSS and silhouette scores for a range of cluster counts.
def calculate_dtw_clusters(data, min_clusters, max_clusters):
    """Fit DTW k-means for each cluster count and collect quality metrics.

    Args:
        data: Time-series dataset passed to ``TimeSeriesKMeans.fit_predict``
            (presumably shaped (n_series, n_timestamps, n_features) — confirm
            against the caller).
        min_clusters: Smallest number of clusters to try (inclusive).
        max_clusters: Largest number of clusters to try (inclusive).

    Returns:
        A ``(wcss, silhouette_scores)`` tuple of lists, one entry per cluster
        count in ascending order. ``wcss`` holds the fitted model's
        ``inertia_``; ``silhouette_scores`` holds the silhouette coefficient
        computed on DTW distances.
    """
    from tslearn.metrics import cdist_dtw

    # The clusters are formed with DTW, so the silhouette must be measured with
    # DTW as well (not Euclidean distance on flattened series). The pairwise
    # matrix does not depend on the cluster count, so compute it only once.
    dtw_distances = cdist_dtw(data, n_jobs=-1)

    wcss = []
    silhouette_scores = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        km_dtw = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=31, n_jobs=-1)
        cluster_labels = km_dtw.fit_predict(data)
        wcss.append(km_dtw.inertia_)
        sil_score = silhouette_score(dtw_distances, cluster_labels, metric="precomputed")
        silhouette_scores.append(sil_score)
    return wcss, silhouette_scores

In [None]:
# Elbow-point plot based on WCSS.
def plot_elbow_silhouette(wcss, min_clusters, max_clusters):
    """Plot WCSS against the number of clusters and show the figure.

    Args:
        wcss: WCSS values, one per cluster count.
        min_clusters: First cluster count on the x-axis (inclusive).
        max_clusters: Last cluster count on the x-axis (inclusive).
    """
    cluster_counts = range(min_clusters, max_clusters + 1)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(cluster_counts, wcss, marker='o')
    ax.set_title('Elbow Method (WCSS)')
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel('WCSS')
    fig.tight_layout()
    plt.show()

In [None]:
# Smallest and largest number of clusters to evaluate.
min_clusters, max_clusters = 2, 10

# Run the clustering sweep over that range.
wcss, silhouette_scores = calculate_dtw_clusters(
    data=scaled_total,
    min_clusters=min_clusters,
    max_clusters=max_clusters,
)

In [None]:
# Inspect the silhouette coefficients (one per evaluated cluster count).
silhouette_scores

In [None]:
# Inspect the WCSS values (one per evaluated cluster count).
wcss

In [None]:
# Inspect the elbow point. Reuse the sweep bounds so the x-axis always matches
# the range that `wcss` was actually computed for.
plot_elbow_silhouette(wcss, min_clusters, max_clusters)

In [None]:
# Run the final clustering with 5 clusters.
n_clusters = 5
km_dtw = TimeSeriesKMeans(
    metric="dtw",
    n_clusters=n_clusters,
    n_jobs=-1,
    random_state=31,
)
cluster_labels = km_dtw.fit_predict(scaled_total)

In [None]:
# Report how many stocks fall into each cluster.
for cluster_id in range(n_clusters):
    member_count = int((cluster_labels == cluster_id).sum())
    print(f"Cluster {cluster_id}: {member_count}")

In [None]:
# Build the clustering result dataset: a single 'cluster' column of labels.
result = pd.DataFrame({'cluster': cluster_labels})

In [None]:
# Ticker codes in the same order as the rows of the clustered array: the scaled
# arrays are built from the transposed pivot tables, so row i corresponds to
# stock1.columns[i]. Taking unique() from stock_info instead follows order of
# first appearance, which need not match the pivot's column order and would
# attach cluster labels to the wrong tickers.
stock_nm = pd.DataFrame({'tck_iem_cd': stock1.columns.to_numpy()})

In [None]:
# Put the ticker codes and their cluster labels side by side.
result = pd.concat((stock_nm, result), axis='columns')

In [None]:
# Extract the data used by the dashboard (visualisation).
stock_info = stock_info[['tck_iem_cd', 'bse_dt', 'daily_mov']]
trade_info = trade_info[['tck_iem_cd', 'bse_dt', 'tot_hld_qty', 'pft_ivo_rt']]
info_df = pd.merge(stock_info, trade_info, how='inner', on=['tck_iem_cd', 'bse_dt'])

# The `encoding` keyword of DataFrame.to_excel was never used by the writer,
# was deprecated in pandas 1.5 and removed in 2.0 (where it raises TypeError),
# so it is omitted here.
result.to_excel('result of clusters.xlsx', index=False)
info_df.to_excel('information of stocks.xlsx', index=False)