In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
transform = transforms.ToTensor() 
# 이미지를 pytorch tensor 로 변환
mnist_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
print(len(mnist_data)) # 60000
# tensor들로 변환된 mnist_data를 64크기의 batch로 random하게 뽑기
# shuffle 은 매 epoch마다 data가 shuffle 되도록
data_loader = torch.utils.data.DataLoader(dataset=mnist_data, batch_size=64, shuffle=True) # 60000장의 data를 64 장씩 random 으로 뽑아서 총 937개가 있는 data -> data와 label 로 구성되어 있음

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

60000


In [3]:
dataiter = iter(data_loader)
images, labels = dataiter.next() # batch size가 64인 data [64,1,28,28]
print(images.shape) # torch.Size([64, 1, 28, 28])
print(labels.shape) 
# 1개 image의 pixel값이 어떤 범위에 있는지 ,# output 출력할 때 활성화 함수 정하기 위해 필요
print(torch.min(images), torch.max(images)) 

torch.Size([64, 1, 28, 28])
torch.Size([64])
tensor(0.) tensor(1.)


In [4]:
# nn.Sequential 짤 때, 콤마(,) 점(.) 으로 찍는 것 조심
class Autoencoder(nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder=nn.Sequential(
        nn.Linear(784, 250),
        nn.ReLU(),
        nn.Linear(250,50),
        nn.ReLU(),
        nn.Linear(50,10)  # (N,10) 
    )

    self.decoder=nn.Sequential(
        nn.Linear(10,50),
        nn.ReLU(),
        nn.Linear(50,250),
        nn.ReLU(),
        nn.Linear(250,784),
        nn.Sigmoid()  # 입력이 0-1 사이 이므로 출력도 0-1 사이가 되도록
    )

  # 복원된 image 구하기
  def forward(self,x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return encoded, decoded # (encoded, decoded) 나란히 튜플로 나오게끔

  # embedding -> latent vector 구하기
  def encoding(self,x):
    encoded = self.encoder(x)
    return encoded

  # forwarding -> recon 구하기
  def forwarding(self,x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

In [5]:
# Autoencdoer 모델 생성
model=Autoencoder()

In [6]:
# 가장 가까운 key 값 return 하는 함수
# dictinary에 'C0' , 'C1' ... 이 KEY로 들어가고 value 가 tensor 로 만들고 정렬해서 최소값에 해당하는 key 뽑기
def closest_center(centroids, encoded_x):
  distances={}
  for key in centroids.keys():
    distance=torch.sum((centroids[key] - encoded_x)**2).item()
    distances[key]=distance
  # print(distances) # {'C0': 0.0, 'C1': 0.012900883331894875, 'C2': 0.006580686662346125 .... }
  return sorted(distances.items(), key=lambda x:x[1])[0][0]  # [('C3', 0.0001), ('CO', 0.01), ('C1', 0.2), ... ] 에서 첫 번째 튜플의 첫번 째 원소

# print(closest_center(init_centroids, model.encoding(mnist_data[1][0].view(-1,28*28)))) # 'str' type

In [7]:
# Batch 로 넘어온 data (64,10) 을 하나하나 중심들과의 거리를 계산하고 최솟값을 더해간다
def clustering_loss(centroids, encoded_x):
  temp_cluster_loss=0
  for i in range(encoded_x.shape[0]):
    distances=[]
    for key in centroids.keys():
      distance=torch.sum((centroids[key]-encoded_x[i])**2).item()
      distances.append(distance)
    temp_cluster_loss+=min(distances)
  return temp_cluster_loss

In [8]:
# 가장 가까운 key 값 return 하는 함수
# dictinary에 'C0' , 'C1' ... 이 KEY로 들어가고 value 가 tensor 로 만들고 정렬해서 최소값에 해당하는 key 뽑기
def closest_center(centroids, encoded_x):
  distances={}
  for key in centroids.keys():
    distance=torch.sum((centroids[key] - encoded_x)**2).item()
    distances[key]=distance
  # print(distances) # {'C0': 0.0, 'C1': 0.012900883331894875, 'C2': 0.006580686662346125 .... }
  return sorted(distances.items(), key=lambda x:x[1])[0][0]  # [('C3', 0.0001), ('CO', 0.01), ('C1', 0.2), ... ] 에서 첫 번째 튜플의 첫번 째 원소

In [9]:
# accuracy 뽑아내는 함수
# 일단은 각 중심에 속한 data 개수 뽑아내는 함수
def print_accuracy(assignments):
  cluster_accuracy={}
  index=0
  for key in assignments.keys():
    cluster_accuracy[key]={}
    sum=0
    for i in assignments[key].keys():
      sum+=len(assignments[key][i])
    cluster_accuracy[key]= len(assignments[key][index])/sum
    index+=1
  
  print(cluster_accuracy)
  print("\n")
    
  
 

In [10]:
# 1 epoch 만 먼저 돌려 놓기
def pretrain(criterion,optimizer):
  for img,label in data_loader:
    img=img.view(-1,28*28)
    recon=model.forwarding(img)
    loss=criterion(img,recon) 

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [11]:
criterion = nn.MSELoss() # Mean Squared Error 가 Loss function이 되도록 함 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3) # parameter를 갱신하는 것을 optimizer라고 함
# 미리 1 epoch 돌려 놓기
pretrain(criterion, optimizer)

In [12]:
# heuristic 한 방식으로 label 값 찾기 -> mnist_data는 순서가 정해져 있음 
print(mnist_data[1][1]) # 0
print(mnist_data[3][1]) # 1
print(mnist_data[5][1]) # 2
print(mnist_data[7][1]) # 3
print(mnist_data[2][1]) # 4
print(mnist_data[0][1]) # 5
print(mnist_data[13][1]) # 6
print(mnist_data[15][1]) # 7
print(mnist_data[17][1]) # 8
print(mnist_data[4][1]) # 9

0
1
2
3
4
5
6
7
8
9


In [13]:
# heuristic 한 방식으로 초기 중심점 잡기
c_0 = model.encoder(mnist_data[1][0].view(-1,28*28)) #0
c_1 = model.encoder(mnist_data[3][0].view(-1,28*28)) #1
c_2 = model.encoder(mnist_data[5][0].view(-1,28*28)) #2
c_3 = model.encoder(mnist_data[7][0].view(-1,28*28)) #3
c_4 = model.encoder(mnist_data[2][0].view(-1,28*28)) #4
c_5 = model.encoder(mnist_data[0][0].view(-1,28*28)) #5
c_6 = model.encoder(mnist_data[13][0].view(-1,28*28)) #6
c_7 = model.encoder(mnist_data[15][0].view(-1,28*28)) #7
c_8 = model.encoder(mnist_data[17][0].view(-1,28*28)) #8
c_9 = model.encoder(mnist_data[4][0].view(-1,28*28)) #9

print(c_0)

tensor([[ 4.7659, -5.2743, -3.5442, -1.8255,  4.6099,  5.4175,  3.4552, -3.4780,
          2.6078, -3.7726]], grad_fn=<AddmmBackward0>)


In [14]:
# 초기 중심값 선정방법 1 -> 0-9 까지의 label data 를 encoding 시킨 것 
init_centroids={}
for i in range(10):
  init_centroids['C{}'.format(i)]=0 # 딕셔너리 key 값 동적생성 
init_centroids['C0']=c_0
init_centroids['C1']=c_1
init_centroids['C2']=c_2
init_centroids['C3']=c_3
init_centroids['C4']=c_4
init_centroids['C5']=c_5
init_centroids['C6']=c_6
init_centroids['C7']=c_7
init_centroids['C8']=c_8
init_centroids['C9']=c_9
print(init_centroids)
# 초기 중심값 선정방법 2 -> 그냥 random 으로 중심점 선택

{'C0': tensor([[ 4.7659, -5.2743, -3.5442, -1.8255,  4.6099,  5.4175,  3.4552, -3.4780,
          2.6078, -3.7726]], grad_fn=<AddmmBackward0>), 'C1': tensor([[ 6.2032, -6.8691, -4.6147, -2.3770,  6.0026,  7.0740,  4.4989, -4.5326,
          3.3956, -4.9115]], grad_fn=<AddmmBackward0>), 'C2': tensor([[ 7.1669, -7.9384, -5.3324, -2.7467,  6.9362,  8.1845,  5.1986, -5.2396,
          3.9237, -5.6751]], grad_fn=<AddmmBackward0>), 'C3': tensor([[ 8.1198, -8.9957, -6.0421, -3.1123,  7.8594,  9.2827,  5.8905, -5.9387,
          4.4460, -6.4301]], grad_fn=<AddmmBackward0>), 'C4': tensor([[ 3.0669, -3.3891, -2.2788, -1.1737,  2.9639,  3.4595,  2.2215, -2.2315,
          1.6766, -2.4264]], grad_fn=<AddmmBackward0>), 'C5': tensor([[ 6.9599, -7.7087, -5.1782, -2.6673,  6.7356,  7.9460,  5.0483, -5.0877,
          3.8103, -5.5110]], grad_fn=<AddmmBackward0>), 'C6': tensor([[ 7.0914, -7.8546, -5.2762, -2.7177,  6.8631,  8.0975,  5.1438, -5.1842,
          3.8824, -5.6153]], grad_fn=<AddmmBackward0>)

In [16]:
# Training 과정
# 1 epoch 동안 batch 단위로 w,b값이 update 되면서 동시에 cluster loss 또한 update 되어감
def training(criterion, optimizer, T, cluster_centroids):
  for t in range(T):
    for img,label in data_loader:
      cluster_loss=0
      img=img.view(-1,784) # torch.Size([64, 784])
      encoded_data, recon =model.forward(img)
      MSE = criterion(img,recon)
      cluster_loss += clustering_loss(cluster_centroids,encoded_data)
      loss = MSE + 0.001*cluster_loss # 일단 MSE + cluster_loss

      optimizer.zero_grad() # 배치마다 gradient 를 0으로 초기화
      loss.backward()
      optimizer.step()

    # 이전 중심에 할당하는 작업
    # 10 은 label 개수
    # 할당되는 값은 index를 넣어서 좀 더 효율적으로 계산할 수 있게끔 한다
    cluster_assignments={}
    for i in range(10):
      cluster_assignments['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 
    for key in cluster_assignments.keys():
      for i in range(10):
        cluster_assignments[key][i]=[]

    for index,(img,label) in enumerate(mnist_data):
      new_img=img.view(-1,784)
      encoded_data=model.encoding(new_img)
      key = closest_center(cluster_centroids, encoded_data)
      dictionary = cluster_assignments[key]
      dictionary[label].append(index)
    
    # 중심 update -> 60000 장에 대해서 중심
    new_cluster={}
    for i in range(10):
      new_cluster['C{}'.format(i)]={} # 딕셔너리 key 값 동적생성 
    
    for key in cluster_assignments.keys():
      sum = torch.tensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
      count=0
      for i in cluster_assignments[key].keys():
        count+=len(cluster_assignments[key][i]) # 해당 label 의 개수
        for index in cluster_assignments[key][i]: # list
          sum+=model.encoding(mnist_data[index][0].view(-1,784)) # 해당 index의 image를 encoding 시킨 값을 계속 더함
      
      new_cluster[key]=sum/count
    
    cluster_centroids=new_cluster # 기존의 중심을 새로운 중심으로 update

    # 1epoch 끝난 후 MSE loss 와 cluster loss 
    print(f'Iteration:{t+1}, Loss:{MSE:.4f}, cluster_loss:{cluster_loss}\n')
    print(new_cluster)
    print("\n")

    print_accuracy(cluster_assignments)

In [17]:
criterion = nn.MSELoss() # Mean Squared Error 가 Loss function이 되도록 함 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3) # parameter를 갱신하는 것을 optimizer라고 함
training(criterion,optimizer, 10, init_centroids)

Iteration:1, Loss:0.0665, cluster_loss:38.14616673439741

{'C0': tensor([[ 4.8326, -5.4162, -3.5983, -1.8118,  4.6833,  5.4490,  3.5277, -3.5563,
          2.6370, -3.7819]], grad_fn=<DivBackward0>), 'C1': tensor([[ 6.0560, -6.7873, -4.5092, -2.2705,  5.8688,  6.8283,  4.4208, -4.4566,
          3.3046, -4.7393]], grad_fn=<DivBackward0>), 'C2': tensor([[ 7.2808, -8.1600, -5.4213, -2.7298,  7.0559,  8.2094,  5.3150, -5.3580,
          3.9730, -5.6979]], grad_fn=<DivBackward0>), 'C3': tensor([[ 8.0847, -9.0610, -6.0199, -3.0312,  7.8349,  9.1158,  5.9018, -5.9496,
          4.4117, -6.3270]], grad_fn=<DivBackward0>), 'C4': tensor([[ 3.3548, -3.7599, -2.4979, -1.2577,  3.2511,  3.7827,  2.4489, -2.4687,
          1.8306, -2.6254]], grad_fn=<DivBackward0>), 'C5': tensor([[ 6.8073, -7.6293, -5.0687, -2.5522,  6.5970,  7.6755,  4.9693, -5.0095,
          3.7147, -5.3273]], grad_fn=<DivBackward0>), 'C6': tensor([[ 7.0833, -7.9387, -5.2742, -2.6557,  6.8645,  7.9867,  5.1708, -5.2127,
        

In [15]:
import numpy as np
batch_mask=np.random.choice(100,10)
batch_mask

array([41, 85, 19, 70, 45, 58, 80, 30, 91, 96])