In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
transform = transforms.ToTensor() 
# 이미지를 pytorch tensor 로 변환
mnist_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

data_loader = torch.utils.data.DataLoader(dataset=mnist_data, batch_size=64, shuffle=True) # 60000장의 data를 64 장씩 random 으로 뽑아서 총 937개가 있는 data -> data와 label 로 구성되어 있음

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [3]:
dataiter = iter(data_loader)
images, labels = dataiter.next() # batch size가 64인 data [64,1,28,28]
print(images.shape) # torch.Size([64, 1, 28, 28])
# 1개 image의 pixel값이 어떤 범위에 있는지 ,# output 출력할 때 활성화 함수 정하기 위해 필요
print(torch.min(images), torch.max(images)) 

torch.Size([64, 1, 28, 28])
tensor(0.) tensor(1.)


In [4]:
# nn.Sequential 짤 때, 콤마(,) 점(.) 으로 찍는 것 조심
class Autoencoder(nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder=nn.Sequential(
        nn.Linear(28*28, 250),
        nn.ReLU(),
        nn.Linear(250,50),
        nn.ReLU(),
        nn.Linear(50,10)  # (N,10)
    )

    self.decoder=nn.Sequential(
        nn.Linear(10,50),
        nn.ReLU(),
        nn.Linear(50,250),
        nn.ReLU(),
        nn.Linear(250,784),
        nn.Sigmoid()  # 입력이 0-1 사이 이므로 출력도 0-1 사이가 되도록
    )

  # 복원된 image 구하기
  def forward(self,x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded # encoded 로 바꾸면 latent vector 출력 가능

  # embedding -> latent vector 구하기
  def encoding(self,x):
    encoded = self.encoder(x)
    return encoded

In [5]:
sample_data=images[0].view(-1,784)
# print(sample_data) # orginal image (1,784)

model = Autoencoder()
# print(model(sample_data)) # 생성자에 바로 data 붙어서 forward 함수 사용 가능한 듯, 재구성된 image (1,784)

encoded_data=model.encoding(sample_data) # encode된 data
print(encoded_data.shape) # torch.Size([1, 10]) 
print(encoded_data) # tensor([[-0.1001,  0.0028, -0.0790, -0.1277,  0.1153,  0.0227,  0.0345,  0.1823, 0.0391, -0.0742]], grad_fn=<AddmmBackward0>)

print(torch.min(encoded_data), torch.max(encoded_data))

print(mnist_data[0][0].shape)

torch.Size([1, 10])
tensor([[-0.1005,  0.0969, -0.0490, -0.0658,  0.1513, -0.0756, -0.0687,  0.0382,
          0.1279,  0.0677]], grad_fn=<AddmmBackward0>)
tensor(-0.1005, grad_fn=<MinBackward1>) tensor(0.1513, grad_fn=<MaxBackward1>)
torch.Size([1, 28, 28])


In [6]:
# heuristic 한 방식으로 label 값 찾기 -> mnist_data는 순서가 정해져 있음 
print(mnist_data[1][1]) # 0
print(mnist_data[3][1]) # 1
print(mnist_data[5][1]) # 2
print(mnist_data[7][1]) # 3
print(mnist_data[2][1]) # 4
print(mnist_data[0][1]) # 5
print(mnist_data[13][1]) # 6
print(mnist_data[15][1]) # 7
print(mnist_data[17][1]) # 8
print(mnist_data[4][1]) # 9

0
1
2
3
4
5
6
7
8
9


In [7]:
# heuristic 한 방식으로 초기 중심점 잡기
c_0 = model.encoder(mnist_data[1][0].view(-1,28*28)) #0
c_1 = model.encoder(mnist_data[3][0].view(-1,28*28)) #1
c_2 = model.encoder(mnist_data[5][0].view(-1,28*28)) #2
c_3 = model.encoder(mnist_data[7][0].view(-1,28*28)) #3
c_4 = model.encoder(mnist_data[2][0].view(-1,28*28)) #4
c_5 = model.encoder(mnist_data[0][0].view(-1,28*28)) #5
c_6 = model.encoder(mnist_data[13][0].view(-1,28*28)) #6
c_7 = model.encoder(mnist_data[15][0].view(-1,28*28)) #7
c_8 = model.encoder(mnist_data[17][0].view(-1,28*28)) #8
c_9 = model.encoder(mnist_data[4][0].view(-1,28*28)) #9

print(c_0)

tensor([[-0.0695,  0.1015, -0.0556, -0.1002,  0.1420, -0.0953, -0.0768, -0.0020,
          0.1777,  0.0482]], grad_fn=<AddmmBackward0>)


In [8]:
# 초기 중심값 선정방법 1 -> 0-9 까지의 label data 를 encoding 시킨 것 
init_centroids={}
for i in range(10):
  init_centroids['C{}'.format(i)]=0 # 딕셔너리 key 값 동적생성 
init_centroids['C0']=c_0
init_centroids['C1']=c_1
init_centroids['C2']=c_2
init_centroids['C3']=c_3
init_centroids['C4']=c_4
init_centroids['C5']=c_5
init_centroids['C6']=c_6
init_centroids['C7']=c_7
init_centroids['C8']=c_8
init_centroids['C9']=c_9
print(init_centroids)
# 초기 중심값 선정방법 2 -> 그냥 random 으로 중심점 선택

{'C0': tensor([[-0.0695,  0.1015, -0.0556, -0.1002,  0.1420, -0.0953, -0.0768, -0.0020,
          0.1777,  0.0482]], grad_fn=<AddmmBackward0>), 'C1': tensor([[-0.1131,  0.1101, -0.0389, -0.0649,  0.1785, -0.0929, -0.0736, -0.0308,
          0.1507,  0.0132]], grad_fn=<AddmmBackward0>), 'C2': tensor([[-0.1033,  0.0404, -0.0079, -0.0621,  0.1508, -0.0980, -0.1022,  0.0094,
          0.1416,  0.0501]], grad_fn=<AddmmBackward0>), 'C3': tensor([[-0.1176,  0.1069, -0.0400, -0.0330,  0.1310, -0.0994, -0.1153, -0.0226,
          0.1732,  0.0363]], grad_fn=<AddmmBackward0>), 'C4': tensor([[-0.0706,  0.1019, -0.0239, -0.0270,  0.1195, -0.0892, -0.1271, -0.0310,
          0.1878,  0.0314]], grad_fn=<AddmmBackward0>), 'C5': tensor([[-0.0994,  0.1029, -0.0283, -0.0485,  0.1715, -0.1305, -0.1163, -0.0481,
          0.1695,  0.0313]], grad_fn=<AddmmBackward0>), 'C6': tensor([[-0.0826,  0.1012, -0.0687, -0.0754,  0.1566, -0.0916, -0.0856,  0.0036,
          0.1964,  0.0615]], grad_fn=<AddmmBackward0>)

In [9]:
# 가장 가까운 key 값 return 하는 함수
# dictinary에 'C0' , 'C1' ... 이 KEY로 들어가고 value 가 tensor 로 만들고 정렬해서 최소값에 해당하는 key 뽑기
def closest_center(centroids, encoded_x):
  distances={}
  for key in centroids.keys():
    distance=torch.sum((centroids[key] - encoded_x)**2).item()
    distances[key]=distance
  # print(distances) # {'C0': 0.0, 'C1': 0.012900883331894875, 'C2': 0.006580686662346125 .... }
  return sorted(distances.items(), key=lambda x:x[1])[0][0]  # [('C3', 0.0001), ('CO', 0.01), ('C1', 0.2), ... ] 에서 첫 번째 튜플의 첫번 째 원소

print(closest_center(init_centroids, model.encoding(mnist_data[1][0].view(-1,28*28)))) # 'str' type

C0


In [10]:
print(mnist_data[200][1]) #4
A=model.encoding(mnist_data[200][0].view(-1,784))
B=init_centroids['C6']
print(A)
print(B)
print(A-B)
print(torch.sum((A-B)**2)) # 매번 갱신되는 cluster_loss 값
closest_center(init_centroids,model.encoding(mnist_data[200][0].view(-1,784)) )

1
tensor([[-0.0736,  0.1058, -0.0678, -0.0738,  0.1715, -0.0827, -0.1000,  0.0182,
          0.1520,  0.0402]], grad_fn=<AddmmBackward0>)
tensor([[-0.0826,  0.1012, -0.0687, -0.0754,  0.1566, -0.0916, -0.0856,  0.0036,
          0.1964,  0.0615]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0090,  0.0046,  0.0009,  0.0016,  0.0149,  0.0089, -0.0143,  0.0146,
         -0.0444, -0.0213]], grad_fn=<SubBackward0>)
tensor(0.0033, grad_fn=<SumBackward0>)


'C6'

In [11]:
# 할당 dictionary 초기화
# label 당 개수로 확인하는 코드 -> 29초 걸림
# 차라리 index를 저장하는 방향으로 하면 더 나을것 같기도
init_assignments={}
for i in range(10):
  init_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

for key in init_assignments.keys():
  for i in range(10):
    init_assignments[key][i]=0 # 해당 키의 개수

print(init_assignments)

# 초기 x값들을 초기 중심에 할당 , 60000 개 학습하는데 30초
for img,label in mnist_data:
  new_img = img.view(-1,28*28)
  encoded_img=model.encoding(new_img)
  key = closest_center(init_centroids, encoded_img)

  dict = init_assignments[key] # 해당 key에 맞는 딕셔너리 불러옴
  dict[label]=dict[label]+1



{'C0': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C1': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C2': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C3': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C4': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C5': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C6': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C7': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C8': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 'C9': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}}


In [12]:
print(init_assignments)

{'C0': {0: 704, 1: 946, 2: 136, 3: 894, 4: 650, 5: 943, 6: 677, 7: 1157, 8: 456, 9: 1250}, 'C1': {0: 149, 1: 2228, 2: 2001, 3: 1110, 4: 1020, 5: 372, 6: 470, 7: 1383, 8: 1497, 9: 626}, 'C2': {0: 568, 1: 326, 2: 292, 3: 267, 4: 316, 5: 490, 6: 271, 7: 279, 8: 196, 9: 469}, 'C3': {0: 1017, 1: 286, 2: 1412, 3: 1412, 4: 1634, 5: 763, 6: 1028, 7: 457, 8: 991, 9: 767}, 'C4': {0: 770, 1: 7, 2: 142, 3: 255, 4: 91, 5: 229, 6: 431, 7: 417, 8: 160, 9: 153}, 'C5': {0: 144, 1: 47, 2: 787, 3: 686, 4: 35, 5: 262, 6: 173, 7: 17, 8: 390, 9: 22}, 'C6': {0: 1319, 1: 864, 2: 275, 3: 286, 4: 338, 5: 583, 6: 1694, 7: 337, 8: 416, 9: 715}, 'C7': {0: 389, 1: 1786, 2: 677, 3: 437, 4: 1053, 5: 439, 6: 594, 7: 1258, 8: 419, 9: 896}, 'C8': {0: 465, 1: 132, 2: 176, 3: 564, 4: 467, 5: 930, 6: 540, 7: 735, 8: 1154, 9: 486}, 'C9': {0: 398, 1: 120, 2: 60, 3: 220, 4: 238, 5: 410, 6: 40, 7: 225, 8: 172, 9: 565}}


In [13]:
# 할당 dictionary 초기화
# label 당 X 값들 (1,784) 을 저장하는 코드 -> 31초
init_assignments={}
for i in range(10):
  init_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

for key in init_assignments.keys():
  for i in range(10):
    init_assignments[key][i]=[]

print(init_assignments)

# 초기 x값들을 초기 중심에 할당 , 60000 개 학습하는데 30초
for img,label in mnist_data:
  new_img = img.view(-1,28*28)
  encoded_img=model.encoding(new_img)
  key = closest_center(init_centroids, encoded_img)

  dict = init_assignments[key] # 해당 key에 맞는 딕셔너리 불러옴
  dict[label].append(new_img)


{'C0': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C1': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C2': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C3': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C4': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C5': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C6': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C7': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C8': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}, 'C9': {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}}


In [14]:
# 너무 data 많아서 출력 오류가 나는 듯함 -> 해결 필요
# print(init_assignments['C7'][7]) # len(init_assignments['C7'][7])=421

In [15]:
a=torch.tensor([[0.0,0.0,0.0,0.0]])
b=torch.tensor([[1.0,1.0,1.0,1.0]])
print((a+b)/2)

tensor([[0.5000, 0.5000, 0.5000, 0.5000]])


In [16]:
# 중심 update 하는 함수 -> 6초 걸림
def update_centroid(cluster_assignments, model):
  dict={}
  for key in cluster_assignments.keys():
    sum = torch.tensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    len=0
    for i in cluster_assignments[key].keys():
      for x in cluster_assignments[key][i]:
        sum = sum + model.encoding(x)
        len = len + 1
    dict[key]= sum / len
  return dict

# print(update_centroid(init_assignments,model))

In [17]:
# 새로운 중심에 x 들을 할당하는 함수
def assign_cluster(cluster_centroids, model):
  cluster_assignments={}
  for i in range(10):
    cluster_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 , value는 딕셔너리 형태로 들어감

  for key in cluster_assignments.keys():
    for i in range(10):
      cluster_assignments[key][i]=[]

  for img,label in mnist_data:
    new_img = img.view(-1,28*28)
    encoded_data = model.encoding(new_img)
    key = closest_center(cluster_centroids, encoded_data)
    dict = cluster_assignments[key]
    dict[label].append(new_img)
  
  return cluster_assignments

In [18]:
# cluster loss update 하는 함수
def update_cluster_loss(cluster_centroids, cluster_assignments, model):
  cluster_loss=0.0
  for key in cluster_assignments.keys():
    centroid = cluster_centroids[key]
    for i in cluster_assignments[key].keys():
      for x in cluster_assignments[key][i]:
        encoded_data = model.encoding(x)
        cluster_loss = cluster_loss + torch.sum((encoded_data-centroid)**2).item()       
  
  return cluster_loss


  

In [19]:
# 각 중심마다 얼마나 잘 모여있는지 확인하기 위한 함수
def print_accuracy(cluster_assignments):
  cluster_accuracy = {}
  index=0
  for key in cluster_assignments.keys():
    sum=0
    for i in cluster_assignments[key].keys():
      sum += len(cluster_assignments[key][i])
    print(sum)
    cluster_accuracy[key] = len(cluster_assignments[key][index]) / sum
    index += 1
  for key,value in cluster_accuracy.items():
    print(f'{key}:{value:.4f}', end='  ')
  print("\n")

In [20]:
print_accuracy(init_assignments)
sum2=len(init_assignments['C1'][0]) + len(init_assignments['C1'][1]) +len(init_assignments['C1'][2])+len(init_assignments['C1'][3])+len(init_assignments['C1'][4])+len(init_assignments['C1'][5])+len(init_assignments['C1'][6])+len(init_assignments['C1'][7])+len(init_assignments['C1'][8]) + len(init_assignments['C1'][9])
print(len(init_assignments['C1'][1]))
print(sum2)

7813
10856
3474
9767
2655
2563
6827
7948
5649
2448
C0:0.0901  C1:0.2052  C2:0.0841  C3:0.1446  C4:0.0343  C5:0.1022  C6:0.2481  C7:0.1583  C8:0.2043  C9:0.2308  

2228
10856


In [21]:
 # Training 과정 -> 함수화
 # loss = reconstruct_error + cluster_loss 로 일단 함
 # 1. 1epoch 동안 신경망의 매개변수 update
 # 2. 전체 훈련 data 60000 장에 대해서 clustering 진행 -> 중심 좌표들을 update
 # 3. 전체 훈련 data 에 대해서 가장 가까운 중심에 할당시킴
 # 4. 1,2,3 과정을 T (hyperparameter) 동안 반복

# autoencoder 객체 , Loss 종류, Optimizer 종류, lambda
def train(model, criterion, optimizer, T, cluster_loss, cluster_centroids, cluster_assignments):
    for t in range(T): # T번 반복
        # 1. 1epoch 동안 신경망의 매개변수 update
        for img,labels in data_loader:
            img = img.view(-1,28*28) # torch.Size([64, 784])
            recon = model.forward(img)
            MSELoss= criterion(img,recon)
            loss =  MSELoss + cluster_loss

            optimizer.zero_grad() # 배치마다 gradient 를 0으로 초기화
            loss.backward()
            optimizer.step()
      
        # 2. 전체 훈련 data 60000 장에 대해서 clustering 진행 -> 중심 좌표들을 update
        cluster_centroids = update_centroid(cluster_assignments, model) 

        # 3. update 된 중심 좌표에 x 값들 할당
        cluster_assignments = assign_cluster(cluster_centroids, model)

        # 4. clustering loss update
        cluster_loss = update_cluster_loss(cluster_centroids, cluster_assignments, model)

        # MSE loss 와 cluster loss (오토인코더에 의한 loss 와 clustering에 의한 loss)
        print(f'Iteration:{t+1}, Loss:{MSELoss:.4f}, cluster_loss:{cluster_loss}\n')

        # cluster centroid print
        print(cluster_centroids)

        # 각 label에 클러스터링이 잘 되는지 확인
        print_accuracy(cluster_assignments)
  
    return cluster_assignments


In [22]:
# T=3 일 때 3분 10초 걸림
criterion = nn.MSELoss() # Mean Squared Error 가 Loss function이 되도록 함 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3) # parameter를 갱신하는 것을 optimizer라고 함
assign = train(model,criterion, optimizer, 10, 0, init_centroids, init_assignments)


Iteration:1, Loss:0.0679, cluster_loss:465810.276199285

{'C0': tensor([[-3.2321,  4.2476, -3.0945, -7.2882,  7.7032,  5.5498,  6.3061,  4.6528,
          5.9775,  6.3612]], grad_fn=<DivBackward0>), 'C1': tensor([[-3.6106,  4.7457, -3.4574, -8.1438,  8.6056,  6.2041,  7.0513,  5.1987,
          6.6764,  7.1084]], grad_fn=<DivBackward0>), 'C2': tensor([[-2.6160,  3.4368, -2.5037, -5.8953,  6.2340,  4.4845,  5.0929,  3.7642,
          4.8398,  5.1447]], grad_fn=<DivBackward0>), 'C3': tensor([[-3.0477,  4.0049, -2.9177, -6.8712,  7.2634,  5.2309,  5.9429,  4.3868,
          5.6370,  5.9970]], grad_fn=<DivBackward0>), 'C4': tensor([[-2.6731,  3.5120, -2.5584, -6.0243,  6.3701,  4.5832,  5.2053,  3.8465,
          4.9452,  5.2575]], grad_fn=<DivBackward0>), 'C5': tensor([[-2.9882,  3.9266, -2.8606, -6.7367,  7.1215,  5.1280,  5.8257,  4.3010,
          5.5271,  5.8796]], grad_fn=<DivBackward0>), 'C6': tensor([[-3.2499,  4.2710, -3.1115, -7.3283,  7.7455,  5.5804,  6.3410,  4.6784,
         

KeyboardInterrupt: ignored

In [None]:
# 현재 상황
# 중심좌표에 할당되는 값들이 전혀 update가 되지 않고 있다
# 더불어 cluster_loss 가 오히려 계속 증가하고 있는 상황
