## 하이퍼파라미터 정리
1. test data에 있는 정상데이터 수
2. batch_size
3. latent_size, cluster 수

In [105]:
import torch
import torchvision
import torch.nn as nn
from torchvision import transforms
import torchvision.transforms as tr
import numpy as np
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment as linear_assignment

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'{device} is available')

cpu is available


In [106]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [107]:
from scipy import io
cardio = io.loadmat('/content/drive/MyDrive/캡스톤/dataset/cardio.mat')
cardio_X= pd.DataFrame(cardio['X'])
cardio_label = cardio['y']
print(cardio_X.shape)
print(type(cardio_label))
print(cardio_label.shape)

(1831, 21)
<class 'numpy.ndarray'>
(1831, 1)


In [108]:
# 정상/비정상 데이터 나누기
cardio_normal = cardio_X[cardio_label == 0] # dataframe
cardio_abnormal = cardio_X[cardio_label == 1] # 비정상인 data들 (dataframe 형태)

print(cardio_normal.shape) 
print(cardio_abnormal.shape)


(1655, 21)
(176, 21)


In [123]:
# train/test data 나누기
norm_idx = np.random.choice(1655,1479,replace=False) # 비복원추출, 정상에서 1479개 뽑기 -> train
test_normal_index = np.delete(np.arange(1655),norm_idx) # 정상에서 나머지 176개 뽑기 -> test

train_data = cardio_normal.iloc[norm_idx] # dataframe
test_data = pd.concat([cardio_normal.iloc[test_normal_index],cardio_abnormal]) # 정상인 data 절반, 비정상인 data 절반, dataframe
test_normal_data = cardio_normal.iloc[test_normal_index] # train 과정에서 clustering 잘 되는지 확인해보기 위함, dataframe
test_label = np.concatenate([np.zeros(176),np.ones(176)]) # numpy array 

print(train_data.shape)
print(test_data.shape)
print(test_label.shape)

(1479, 21)
(352, 21)
(352,)


In [110]:
# train/test data 정규화 -> pandas가 numpy 로 바뀜
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_data=scaler.fit_transform(train_data)

scaler2 = StandardScaler()
test_data = scaler2.fit_transform(test_data)

In [111]:
print(type(train_data))
print(train_data.shape)

<class 'numpy.ndarray'>
(1479, 21)


## padas data를 tensor로 바꾸기 (Autoencoder를 위해서)

In [112]:
class toTensorTrain(Dataset):

  def __init__(self,x_data):
    self.x_data = torch.FloatTensor(x_data)
    self.len = len(self.x_data)

  def __getitem__(self,index):
    return self.x_data[index]

  def __len__(self):
    return self.len



In [113]:
class toTensorTest(Dataset):

  def __init__(self,x_data,y_data):
    self.x_data = torch.FloatTensor(x_data)
    self.y_data = torch.LongTensor(y_data)
    self.len = len(self.x_data)

  def __getitem__(self,index):
    return self.x_data[index], self.y_data[index]

  def __len__(self):
    return self.len



In [114]:
print(type(train_data))
print(type(test_data))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [115]:
# hyperprameter : batch_size
# train data : numpy -> tensor 바뀜
# test data : numpy -> tensor 바뀜
train_data = toTensorTrain(train_data)
test_data = toTensorTest(test_data,test_label)

train_loader = DataLoader(train_data,batch_size=32,shuffle=True)
test_loader = DataLoader(test_data,batch_size=32,shuffle=True)

train_batch = iter(train_loader).next()
print(train_batch.shape)
test_batch, test_batch_labels = iter(test_loader).next()
print(test_batch.shape, test_batch_labels.shape)

torch.Size([32, 21])
torch.Size([32, 21]) torch.Size([32])


#Deep K-means
21차원 data를 2차원으로 줄이기


In [117]:
class Encoder(nn.Module):
  def __init__(self,latent_size):
    super(Encoder,self).__init__()

    self.encoder = nn.Sequential(
                      nn.Linear(21,16),
                      nn.ReLU(),
                      nn.Linear(16,8),
                      nn.ReLU(),
                      nn.Linear(8,4),
                      nn.ReLU(),
                      nn.Linear(4,latent_size),
                      nn.ReLU())
    
  def forward(self, x):
    return self.encoder(x)

class Decoder(nn.Module):
  def __init__(self, latent_size):
    super(Decoder,self).__init__()

    self.decoder = nn.Sequential(
                      nn.Linear(latent_size,4),
                      nn.ReLU(),
                      nn.Linear(4,8),
                      nn.ReLU(),
                      nn.Linear(8,16),
                      nn.ReLU(),
                      nn.Linear(16,21),
                      nn.Sigmoid())
    
  def forward(self,x):
    return self.decoder(x)
    

In [118]:
class Kmeans(nn.Module):
  def __init__(self,num_clusters, latent_size):
    super(Kmeans,self).__init__()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.num_clusters = num_clusters
    self.centroids = nn.Parameter(torch.rand((self.num_clusters,latent_size)).to(device))

  def argminl2distance(self,a,b):
    return torch.argmin(torch.sum((a-b)**2,dim=1),dim=0)

  def forward(self,x):
    y_assign = []
    for m in range(x.size(0)):
      h = x[m].expand(self.num_clusters,-1)
      assign = self.argminl2distance(h, self.centroids)
      y_assign.append(assign.item())
    return y_assign, self.centroids[y_assign]

In [119]:
def cluster_acc(y_true,y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  D = max(y_pred.max(), y_true.max()) + 1
  w = np.zeros((D,D), dtype = np.int64)
  for i in range(y_pred.size):
    w[y_pred[i],y_true[i]] +=1
  ind = linear_assignment(w.max() - w)
  return sum([w[i,j] for i,j in zip(ind[0],ind[1])]) * 1.0 / y_pred.size

In [120]:
def evaluation(testloader, encoder, kmeans, device):
  predictions = []
  actual = []

  with torch.no_grad():
    for images, labels in testloader:
      inputs = images.to(device)
      labels = labels.to(device)
      latent_var = encoder(inputs)
      y_pred,_ = kmeans(latent_var)

      predictions+=y_pred
      actual+=labels.cpu().tolist()

  return cluster_acc(actual,predictions)

## 모델 학습

In [121]:
latent_size=2
num_clusters=2

In [122]:
encoder = Encoder(latent_size).to(device)
decoder = Decoder(latent_size).to(device)
kmeans = Kmeans(num_clusters, latent_size).to(device)
criterion1 = torch.nn.MSELoss()
criterion2 = torch.nn.MSELoss()
optimizer = torch.optim.Adam(list(encoder.parameters())+list(decoder.parameters())+list(kmeans.parameters()), lr=0.001)


In [124]:
T1=100
T2=200
lam = 1e-3
ls = 0.05

In [125]:
for ep in range(300):
  if ep <= T1:
    alpha = lam/(T2-T1)
  else:
    alpha = lam
  
  # 정상 data(label=0)에 대해서만 학습
  running_loss = 0.0
  for batch in train_loader:
    inputs = batch.to(device)
    optimizer.zero_grad()
    latent_var = encoder(inputs)
    _, centroids = kmeans(latent_var.detach())
    outputs = decoder(latent_var)

    l_rec = criterion1(inputs,outputs)
    l_clt = criterion2(latent_var,centroids)
    loss = l_rec + alpha*l_clt

    loss.backward()
    optimizer.step()
    running_loss+=loss.item()
  
  ###
  avg_loss = running_loss/len(train_loader)

  if(ep%10==0):
    print('[%d] Train loss:%.4f' %(ep,avg_loss) )

  # if(ep%10==0):
  #   ###
  #   testacc=evaluation(clu_tes_loader,encoder,kmeans,device)
  #   print('[%d] Train loss: %.4f, Test Accuracy:%.3f' %(ep,avg_loss,testacc)) # 10의 배수인 epoch마다 avg_loss 를 print

  # # ls update -> 기존의 loss(hyper parameter) 보다 작게되면 그때의 신경망, kmeans의 매개변수들을 save
  # # 굳이 메 epoch마다 하는 이유는? 다 끝나고 나서 저장해도 되지 않는가?
  # if avg_loss < ls:
  #   ls=avg_loss

  #   torch.save(encoder.state_dict(),'./models/dkm_en.pth')
  #   torch.save(decoder.state_dict(),'./models/dkm_de.pth')
  #   torch.save(kmeans.state_dict(),'./models/dkm_clt.pth') # kmeans 매개변수 : 클러스터 개수와 중심들

[0] Train loss:1.2281
[10] Train loss:0.9238
[20] Train loss:0.8836
[30] Train loss:0.8641
[40] Train loss:0.8543
[50] Train loss:0.8415
[60] Train loss:0.8312
[70] Train loss:0.8159
[80] Train loss:0.8030
[90] Train loss:0.7989
[100] Train loss:0.7963
[110] Train loss:0.7993
[120] Train loss:0.7967
[130] Train loss:0.7940
[140] Train loss:0.7931
[150] Train loss:0.7884
[160] Train loss:0.7884
[170] Train loss:0.7861
[180] Train loss:0.7862
[190] Train loss:0.7930
[200] Train loss:0.7897
[210] Train loss:0.7859
[220] Train loss:0.7854
[230] Train loss:0.7854
[240] Train loss:0.7803
[250] Train loss:0.7833
[260] Train loss:0.7852
[270] Train loss:0.7783
[280] Train loss:0.7784
[290] Train loss:0.7880
