In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
transform = transforms.ToTensor() 
# 이미지를 pytorch tensor 로 변환
mnist_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

data_loader = torch.utils.data.DataLoader(dataset=mnist_data, batch_size=64, shuffle=True) # 60000장의 data를 64 장씩 random 으로 뽑아서 총 937개가 있는 data -> data와 label 로 구성되어 있음

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [3]:
dataiter = iter(data_loader)
images, labels = dataiter.next() # batch size가 64인 data [64,1,28,28]
print(images.shape) # torch.Size([64, 1, 28, 28])
# 1개 image의 pixel값이 어떤 범위에 있는지 ,# output 출력할 때 활성화 함수 정하기 위해 필요
print(torch.min(images), torch.max(images)) 

torch.Size([64, 1, 28, 28])
tensor(0.) tensor(1.)


In [4]:
# nn.Sequential 짤 때, 콤마(,) 점(.) 으로 찍는 것 조심
class Autoencoder(nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder=nn.Sequential(
        nn.Linear(28*28, 250),
        nn.ReLU(),
        nn.Linear(250,50),
        nn.ReLU(),
        nn.Linear(50,10)  # (N,10)
    )

    self.decoder=nn.Sequential(
        nn.Linear(10,50),
        nn.ReLU(),
        nn.Linear(50,250),
        nn.ReLU(),
        nn.Linear(250,784),
        nn.Sigmoid()  # 입력이 0-1 사이 이므로 출력도 0-1 사이가 되도록
    )

  # 복원된 image 구하기
  def forward(self,x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded # encoded 로 바꾸면 latent vector 출력 가능

  # embedding -> latent vector 구하기
  def encoding(self,x):
    encoded = self.encoder(x)
    return encoded

In [5]:
sample_data=images[0].view(-1,784)
# print(sample_data) # orginal image (1,784)

model = Autoencoder()
# print(model(sample_data)) # 생성자에 바로 data 붙어서 forward 함수 사용 가능한 듯, 재구성된 image (1,784)

encoded_data=model.encoding(sample_data) # encode된 data
print(encoded_data.shape) # torch.Size([1, 10]) 
print(encoded_data) # tensor([[-0.1001,  0.0028, -0.0790, -0.1277,  0.1153,  0.0227,  0.0345,  0.1823, 0.0391, -0.0742]], grad_fn=<AddmmBackward0>)

print(torch.min(encoded_data), torch.max(encoded_data))

print(mnist_data[0][0].shape)

torch.Size([1, 10])
tensor([[ 0.0263,  0.0992,  0.0767,  0.1316, -0.0487, -0.0814, -0.1255,  0.0209,
         -0.0517,  0.0976]], grad_fn=<AddmmBackward0>)
tensor(-0.1255, grad_fn=<MinBackward1>) tensor(0.1316, grad_fn=<MaxBackward1>)
torch.Size([1, 28, 28])


In [6]:
# heuristic 한 방식으로 label 값 찾기 -> mnist_data는 순서가 정해져 있음 
print(mnist_data[1][1]) # 0
print(mnist_data[3][1]) # 1
print(mnist_data[5][1]) # 2
print(mnist_data[7][1]) # 3
print(mnist_data[2][1]) # 4
print(mnist_data[0][1]) # 5
print(mnist_data[13][1]) # 6
print(mnist_data[15][1]) # 7
print(mnist_data[17][1]) # 8
print(mnist_data[4][1]) # 9

0
1
2
3
4
5
6
7
8
9


In [7]:
# heuristic 한 방식으로 초기 중심점 잡기
c_0 = model.encoder(mnist_data[1][0].view(-1,28*28)) #0
c_1 = model.encoder(mnist_data[3][0].view(-1,28*28)) #1
c_2 = model.encoder(mnist_data[5][0].view(-1,28*28)) #2
c_3 = model.encoder(mnist_data[7][0].view(-1,28*28)) #3
c_4 = model.encoder(mnist_data[2][0].view(-1,28*28)) #4
c_5 = model.encoder(mnist_data[0][0].view(-1,28*28)) #5
c_6 = model.encoder(mnist_data[13][0].view(-1,28*28)) #6
c_7 = model.encoder(mnist_data[15][0].view(-1,28*28)) #7
c_8 = model.encoder(mnist_data[17][0].view(-1,28*28)) #8
c_9 = model.encoder(mnist_data[4][0].view(-1,28*28)) #9

print(c_0)

tensor([[ 0.0218,  0.1035,  0.0608,  0.1057, -0.1053, -0.0573, -0.1347,  0.0132,
         -0.0673,  0.1021]], grad_fn=<AddmmBackward0>)


In [8]:
# 초기 중심값 선정방법 1 -> 0-9 까지의 label data 를 encoding 시킨 것 
init_centroids={}
for i in range(10):
  init_centroids['C{}'.format(i)]=0 # 딕셔너리 key 값 동적생성 
init_centroids['C0']=c_0
init_centroids['C1']=c_1
init_centroids['C2']=c_2
init_centroids['C3']=c_3
init_centroids['C4']=c_4
init_centroids['C5']=c_5
init_centroids['C6']=c_6
init_centroids['C7']=c_7
init_centroids['C8']=c_8
init_centroids['C9']=c_9
print(init_centroids)
# 초기 중심값 선정방법 2 -> 그냥 random 으로 중심점 선택

{'C0': tensor([[ 0.0218,  0.1035,  0.0608,  0.1057, -0.1053, -0.0573, -0.1347,  0.0132,
         -0.0673,  0.1021]], grad_fn=<AddmmBackward0>), 'C1': tensor([[-0.0063,  0.1273,  0.0814,  0.1232, -0.0574, -0.0529, -0.1086,  0.0151,
         -0.0774,  0.1027]], grad_fn=<AddmmBackward0>), 'C2': tensor([[-0.0178,  0.1639,  0.0729,  0.1073, -0.0927, -0.0580, -0.1164,  0.0575,
         -0.0828,  0.0947]], grad_fn=<AddmmBackward0>), 'C3': tensor([[ 0.0313,  0.1209,  0.0860,  0.1215, -0.0463, -0.0419, -0.1134,  0.0209,
         -0.0618,  0.1059]], grad_fn=<AddmmBackward0>), 'C4': tensor([[-0.0369,  0.1415,  0.0547,  0.1375, -0.0435, -0.0435, -0.0624,  0.0188,
         -0.0614,  0.1117]], grad_fn=<AddmmBackward0>), 'C5': tensor([[-0.0079,  0.1271,  0.0522,  0.0978, -0.0664, -0.0406, -0.1237,  0.0445,
         -0.0692,  0.0983]], grad_fn=<AddmmBackward0>), 'C6': tensor([[ 0.0154,  0.1441,  0.0553,  0.1037, -0.0744, -0.0661, -0.1160,  0.0383,
         -0.0881,  0.1060]], grad_fn=<AddmmBackward0>)

In [9]:
# 가장 가까운 key 값 return 하는 함수
# dictinary에 'C0' , 'C1' ... 이 KEY로 들어가고 value 가 tensor 로 만들고 정렬해서 최소값에 해당하는 key 뽑기
def closest_center(centroids, encoded_x):
  distances={}
  for key in centroids.keys():
    distance=torch.sum((centroids[key] - encoded_x)**2).item()
    distances[key]=distance
  # print(distances) # {'C0': 0.0, 'C1': 0.012900883331894875, 'C2': 0.006580686662346125 .... }
  return sorted(distances.items(), key=lambda x:x[1])[0][0]  # [('C3', 0.0001), ('CO', 0.01), ('C1', 0.2), ... ] 에서 첫 번째 튜플의 첫번 째 원소

print(closest_center(init_centroids, model.encoding(mnist_data[1][0].view(-1,28*28)))) # 'str' type

C0


In [10]:
print(mnist_data[200][1]) #4
A=model.encoding(mnist_data[200][0].view(-1,784))
B=init_centroids['C6']
print(A)
print(B)
print(A-B)
print(torch.sum((A-B)**2)) # 매번 갱신되는 cluster_loss 값
closest_center(init_centroids,model.encoding(mnist_data[200][0].view(-1,784)) )

1
tensor([[-0.0250,  0.1262,  0.0672,  0.1025, -0.0624, -0.0896, -0.1001,  0.0307,
         -0.0714,  0.0829]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0154,  0.1441,  0.0553,  0.1037, -0.0744, -0.0661, -0.1160,  0.0383,
         -0.0881,  0.1060]], grad_fn=<AddmmBackward0>)
tensor([[-0.0403, -0.0179,  0.0119, -0.0013,  0.0119, -0.0235,  0.0160, -0.0076,
          0.0166, -0.0231]], grad_fn=<SubBackward0>)
tensor(0.0039, grad_fn=<SumBackward0>)


'C9'

In [11]:
# 할당 dictionary 초기화
# label 당 개수로 확인하는 코드 -> 29초 걸림
# 차라리 index를 저장하는 방향으로 하면 더 나을것 같기도
init_assignments={}
for i in range(10):
  init_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

for key in init_assignments.keys():
  for i in range(10):
    init_assignments[key][i]=0 # 해당 키의 개수

print(init_assignments)

# 초기 x값들을 초기 중심에 할당 , 60000 개 학습하는데 30초
for img,label in mnist_data:
  new_img = img.view(-1,28*28)
  encoded_img=model.encoding(new_img)
  key = closest_center(init_centroids, encoded_img)

  dict = init_assignments[key] # 해당 key에 맞는 딕셔너리 불러옴
  dict[label]=dict[label]+1



{'C0': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C1': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C2': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C4': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C5': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C6': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C7': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C8': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C9': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}}


In [12]:
print(init_assignments)

{'C0': {0: 644, 1: 624, 2: 109, 3: 376, 4: 160, 5: 256, 6: 99, 7: 232, 8: 196, 9: 269}, 'C1': {0: 605, 1: 1688, 2: 1538, 3: 1052, 4: 1015, 5: 644, 6: 736, 7: 272, 8: 1049, 9: 1066}, 'C2': {0: 170, 1: 25, 2: 383, 3: 260, 4: 271, 5: 357, 6: 337, 7: 893, 8: 725, 9: 982}, 'C3': {0: 158, 1: 46, 2: 309, 3: 1504, 4: 117, 5: 332, 6: 108, 7: 30, 8: 441, 9: 141}, 'C4': {0: 153, 1: 1, 2: 65, 3: 15, 4: 893, 5: 104, 6: 138, 7: 82, 8: 139, 9: 132}, 'C5': {0: 322, 1: 345, 2: 1411, 3: 440, 4: 529, 5: 657, 6: 936, 7: 1854, 8: 245, 9: 995}, 'C6': {0: 2132, 1: 210, 2: 396, 3: 1303, 4: 967, 5: 1598, 6: 2712, 7: 514, 8: 1198, 9: 1153}, 'C7': {0: 45, 1: 1005, 2: 72, 3: 41, 4: 116, 5: 135, 6: 58, 7: 1471, 8: 32, 9: 314}, 'C8': {0: 219, 1: 227, 2: 865, 3: 377, 4: 487, 5: 402, 6: 246, 7: 484, 8: 1079, 9: 265}, 'C9': {0: 1475, 1: 2571, 2: 810, 3: 763, 4: 1287, 5: 936, 6: 548, 7: 433, 8: 747, 9: 632}}


In [None]:
# 할당 dictionary 초기화
# label 당 X 값들 (1,784) 을 저장하는 코드 -> 31초
init_assignments={}
for i in range(10):
  init_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

for key in init_assignments.keys():
  for i in range(10):
    init_assignments[key][i]=[]

print(init_assignments)

# 초기 x값들을 초기 중심에 할당 , 60000 개 학습하는데 30초
for img,label in mnist_data:
  new_img = img.view(-1,28*28)
  encoded_img=model.encoding(new_img)
  key = closest_center(init_centroids, encoded_img)

  dict = init_assignments[key] # 해당 key에 맞는 딕셔너리 불러옴
  dict[label].append(new_img)


{'C0': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C1': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C2': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C3': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C4': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C5': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C6': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C7': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C8': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C9': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}}


In [None]:
# 너무 data 많아서 출력 오류가 나는 듯함 -> 해결 필요
# print(init_assignments['C7'][7]) # len(init_assignments['C7'][7])=421

In [None]:
a=torch.tensor([[0.0,0.0,0.0,0.0]])
b=torch.tensor([[1.0,1.0,1.0,1.0]])
print((a+b)/2)

tensor([[0.5000, 0.5000, 0.5000, 0.5000]])


In [None]:
# 중심 update 하는 함수 -> 6초 걸림
def update_centroid(cluster_assignments, model):
  dict={}
  for key in cluster_assignments.keys():
    sum = torch.tensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    len=0
    for i in cluster_assignments[key].keys():
      for x in cluster_assignments[key][i]:
        sum = sum + model.encoding(x)
        len = len + 1
    dict[key]= sum / len
  return dict

# print(update_centroid(init_assignments,model))

In [None]:
# 새로운 중심에 x 들을 할당하는 함수
def assign_cluster(cluster_centroids, model):
  cluster_assignments={}
  for i in range(10):
    cluster_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

  for key in cluster_assignments.keys():
    for i in range(10):
      cluster_assignments[key][i]=[]

  for img,label in mnist_data:
    new_img = img.view(-1,28*28)
    encoded_data = model.encoding(new_img)
    key = closest_center(cluster_centroids, encoded_data)
    dict = cluster_assignments[key]
    dict[label].append(new_img)
  
  return cluster_assignments

In [None]:
# cluster loss update 하는 함수
def update_cluster_loss(cluster_centroids, cluster_assignments, model):
  cluster_loss=0.0
  for key in cluster_assignments.keys():
    centroid = cluster_centroids[key]
    for i in cluster_assignments[key].keys():
      for x in cluster_assignments[key][i]:
        encoded_data = model.encoding(x)
        cluster_loss = cluster_loss + torch.sum((encoded_data-centroid)**2).item()       
  
  return cluster_loss


  

In [None]:
# 각 중심마다 얼마나 잘 모여있는지 확인하기 위한 함수
def print_accuracy(cluster_assignments):
  cluster_accuracy = {}
  index=0
  for key in cluster_assignments.keys():
    sum=0
    for i in cluster_assignments[key].keys():
      sum += len(cluster_assignments[key][i])
    print(sum)
    cluster_accuracy[key] = len(cluster_assignments[key][index]) / sum
    index += 1
  for key,value in cluster_accuracy.items():
    print(f'{key}:{value:.4f}', end='  ')
  print("\n")

In [None]:
print_accuracy(init_assignments)
sum2=len(init_assignments['C1'][0]) + len(init_assignments['C1'][1]) +len(init_assignments['C1'][2])+len(init_assignments['C1'][3])+len(init_assignments['C1'][4])+len(init_assignments['C1'][5])+len(init_assignments['C1'][6])+len(init_assignments['C1'][7])+len(init_assignments['C1'][8]) + len(init_assignments['C1'][9])
print(len(init_assignments['C1'][1]))
print(sum2)

3456
12066
14186
8566
2217
1695
3003
10701
2844
1266
C0:0.4514  C1:0.1935  C2:0.1200  C3:0.2782  C4:0.2233  C5:0.1546  C6:0.2118  C7:0.1354  C8:0.1639  C9:0.2077  

2335
12066


In [None]:
 # Training 과정 -> 함수화
 # loss = reconstruct_error + cluster_loss 로 일단 함
 # 1. 1epoch 동안 신경망의 매개변수 update
 # 2. 전체 훈련 data 60000 장에 대해서 clustering 진행 -> 중심 좌표들을 update
 # 3. 전체 훈련 data 에 대해서 가장 가까운 중심에 할당시킴
 # 4. 1,2,3 과정을 T (hyperparameter) 동안 반복

# autoencoder 객체 , Loss 종류, Optimizer 종류, lambda
def train(model, criterion, optimizer, T, cluster_loss, cluster_centroids, cluster_assignments):
    for t in range(T): # T번 반복
        # 1. 1epoch 동안 신경망의 매개변수 update
        for img,labels in data_loader:
            img = img.view(-1,28*28) # torch.Size([64, 784])
            recon = model.forward(img)
            MSELoss= criterion(img,recon)
            loss =  MSELoss + cluster_loss

            optimizer.zero_grad() # 배치마다 gradient 를 0으로 초기화
            loss.backward()
            optimizer.step()
      
        # 2. 전체 훈련 data 60000 장에 대해서 clustering 진행 -> 중심 좌표들을 update
        cluster_centroids = update_centroid(cluster_assignments, model) 

        # 3. update 된 중심 좌표에 x 값들 할당
        cluster_assignments = assign_cluster(cluster_centroids, model)

        # 4. clustering loss update
        cluster_loss = update_cluster_loss(cluster_centroids, cluster_assignments, model)

        # MSE loss 와 cluster loss (오토인코더에 의한 loss 와 clustering에 의한 loss)
        print(f'Iteration:{t+1}, Loss:{MSELoss:.4f}, cluster_loss:{cluster_loss}\n')

        # 각 label에 클러스터링이 잘 되는지 확인
        print_accuracy(cluster_assignments)
  
    return cluster_assignments


In [None]:
# T=3 일 때 3분 10초 걸림
criterion = nn.MSELoss() # Mean Squared Error 가 Loss function이 되도록 함 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3) # parameter를 갱신하는 것을 optimizer라고 함
assign = train(model,criterion, optimizer, 10, 0, init_centroids, init_assignments)


Iteration:1, Loss:0.0710, cluster_loss:296903.55489623704

C0:0.0856  C1:0.1151  C2:0.1194  C3:0.1127  C4:0.1033  C5:0.0347  C6:0.0836  C7:0.1163  C8:0.0994  C9:0.1060  

Iteration:2, Loss:0.0737, cluster_loss:137938.15799771808

C0:0.1036  C1:0.1386  C2:0.0891  C3:0.1209  C4:0.0880  C5:0.0421  C6:0.0808  C7:0.0992  C8:0.0901  C9:0.1137  

Iteration:3, Loss:0.0662, cluster_loss:139595.04762582207

C0:0.0988  C1:0.1286  C2:0.1134  C3:0.1078  C4:0.0996  C5:0.0375  C6:0.0828  C7:0.1008  C8:0.0979  C9:0.1112  

Iteration:4, Loss:0.0674, cluster_loss:113058.23913255916

C0:0.1092  C1:0.1283  C2:0.1149  C3:0.1129  C4:0.0844  C5:0.0349  C6:0.0878  C7:0.1102  C8:0.0955  C9:0.1196  

Iteration:5, Loss:0.0693, cluster_loss:77391.69517755575

C0:0.1304  C1:0.1246  C2:0.1091  C3:0.1196  C4:0.0876  C5:0.0348  C6:0.0740  C7:0.0983  C8:0.0936  C9:0.1234  



KeyboardInterrupt: ignored

In [None]:
# 현재 상황
# 중심좌표에 할당되는 값들이 전혀 update가 되지 않고 있다
# 더불어 cluster_loss 가 오히려 계속 증가하고 있는 상황
