In [1]:
#드라이브에 저장되어있는 데이터 불러오기
cd /content/drive/My Drive/Colab Notebooks/kaggle_dataset

/content/drive/My Drive/Colab Notebooks/kaggle_dataset


In [2]:
ls -ltr

total 102857
-rw------- 1 root root   158651 Mar 13 04:02 sample_submission.csv
-rw------- 1 root root 31527613 Mar 13 04:02 test_df.csv
-rw------- 1 root root 73356919 Mar 13 04:03 train_df.csv
-rw------- 1 root root   140650 Mar 16 09:12 relu_adam_submission.csv
-rw------- 1 root root   140650 Mar 16 09:17 nobatchnorm_relu_adam_submission.csv


In [0]:
import numpy as np
import pandas as pd

sample_submission = pd.read_csv("sample_submission.csv")
train = pd.read_csv("train_df.csv")
test = pd.read_csv("test_df.csv")

In [0]:
#데이터 스케일링
X = train.iloc[:,1:].values / 255
y = train.iloc[:,0].values
X_test = test.iloc[:,1:].values / 255

In [5]:
#검증을 위해 validation dataset을 추가해보자
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state=2020)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(33600, 784) (8400, 784) (33600,) (8400,)


In [6]:
y_train

array([1, 8, 8, ..., 6, 7, 8])

In [70]:
y_val

array([4, 9, 5, ..., 3, 4, 6])

## DataLoader
모든 데이터를 나누고, 일일히 그 데이터를 나눠서 forward와 backward를 돌리는 식으로 진행해야 하는데, 파이토치에서 제공해주는 DataLoader을 쓴다면 그럴 필요가 없다. 그냥 DataLoader로 부터 batch_size만큼의 데이터를 받아오면 된다.

출처: https://wingnim.tistory.com/33 [jinyo의 뇌]

우리가 직접 만드는 custom dataloader은 다음과 같은 세 파트로 이루어져 있다.

1. __init__(self)

download, read data 등등을 하는 파트.


2. __getitem__(self,index)

인덱스에 해당하는 아이템을 넘겨주는 파트.


3. __len__(self)

data size를 넘겨주는 파트


In [0]:
#Dataset과 Loader 정의

from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    def __init__(self, X, y, transform=None): #1. download, read data, etc
        self.X = X
        self.y = y
        self.transform = transform # 만약 이미지 데이터셋이라면 transform에 rotation, shear, crop 등등이 들어갈 수 있음
    
    def __getitem__(self, idx): #2. return one item on the index
        X = self.X[idx]
        y = self.y[idx]
        return X, y

    def __len__(self): #3. return the data length
        return len(self.X) 

class ValidDataset(Dataset):
    def __init__(self, X, y, transform=None): #1. download, read data, etc
        self.X = X
        self.y = y
        self.transform = transform
    
    def __getitem__(self, idx): #2. return one item on the index
        X = self.X[idx]
        y = self.y[idx]
        return X,y

    def __len__(self): #3. return the data length
        return len(self.X)    

class TestDataset(Dataset):
    def __init__(self, X, y, transform=None): #1. download, read data, etc
        self.X = X
        self.y = y
        self.transform = transform
    
    def __getitem__(self, idx): #2. return one item on the index
        X = self.X[idx]
        return X

    def __len__(self): #3. return the data length
        return len(self.X)        

In [0]:
# train, valid, test dataloader 생성
#dataloader에서 batch_size 지정해주고 shuffle = True (train일 경우)
#test의 경우는 shuffle=False
traindataloader = DataLoader(TrainDataset(x_train, y_train), batch_size=128, shuffle=True, num_workers=4) #num_workers는 멀티쓰레딩을 지원하여 빠르게 데이터를 가져올 수 있다
validdataloader = DataLoader(ValidDataset(x_val, y_val), batch_size=64, shuffle=True, num_workers=4)
testdataloader = DataLoader(TestDataset(X_test, y=None), batch_size=4, shuffle=False, num_workers=4)

In [0]:
# NN 모델 생성
# 각각 weight initializer(he), batch normalization, dropout(0.4 or 0.5) 에 변화를 줘가면서 3 layers or 4 layers로 구성했다. activation func은 relu로 통일
class Net(nn.Module):
  def __init__(self): 
        super(Net, self).__init__()
        self.l1 = nn.Linear(784,512)
        #nn.init.kaiming_normal(self.l1.weight, nonlinearity='relu')
        self.bn1 = nn.BatchNorm1d(512)
        self.relu1 = nn.ReLU(inplace=True)
        #self.drop1 = torch.nn.Dropout(p=0.4)

        self.l2 = nn.Linear(512,256)
        #nn.init.kaiming_normal(self.l2.weight, nonlinearity='relu')
        self.bn2 = nn.BatchNorm1d(256)
        self.relu2 = nn.ReLU(inplace=True)
        #self.drop2 = torch.nn.Dropout(p=0.4)

        self.l3 = nn.Linear(256,128)
        #nn.init.kaiming_normal(self.l3.weight, nonlinearity='relu')
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU(inplace=True)
        #self.drop3 = torch.nn.Dropout(p=0.4)

        self.l4 = nn.Linear(128,10)
        #nn.init.kaiming_normal(self.l4.weight, nonlinearity='relu')

  def forward(self, x):
        x = self.l1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        #x = self.drop1(x)
        x = self.l2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        #x = self.drop2(x)
        x = self.l3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        #x = self.drop3(x)
        x = self.l4(x)
        ##소프트맥스 안걸어줘도된다
        #뒤에서 crossentropyloss 걸어주면 소프트맥스가 자동으로 적용된다고 한다
        return x

net = Net()



In [112]:
net

Net(
  (l1): Linear(in_features=784, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (l2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (l3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace=True)
  (l4): Linear(in_features=128, out_features=10, bias=True)
)

In [0]:
#Adam optimizer로 설정했다
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=1e-3)

In [114]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [115]:

net.to(device)

Net(
  (l1): Linear(in_features=784, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (l2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (l3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace=True)
  (l4): Linear(in_features=128, out_features=10, bias=True)
)

In [78]:
# 3 layers, weight initialization(he), batch normalization, dropout(0.5)
trn_loss_list = []
val_loss_list = []
num_epochs = 100
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/100 | step: 100/263 | trn loss: 2.4427 | val loss: 2.0571
epoch: 1/100 | step: 200/263 | trn loss: 1.8218 | val loss: 1.6573
epoch: 2/100 | step: 100/263 | trn loss: 1.3588 | val loss: 1.2812
epoch: 2/100 | step: 200/263 | trn loss: 1.1904 | val loss: 1.1574
epoch: 3/100 | step: 100/263 | trn loss: 1.0123 | val loss: 0.9711
epoch: 3/100 | step: 200/263 | trn loss: 0.9342 | val loss: 0.8999
epoch: 4/100 | step: 100/263 | trn loss: 0.8163 | val loss: 0.8030
epoch: 4/100 | step: 200/263 | trn loss: 0.7588 | val loss: 0.7489
epoch: 5/100 | step: 100/263 | trn loss: 0.6932 | val loss: 0.6835
epoch: 5/100 | step: 200/263 | trn loss: 0.6555 | val loss: 0.6604
epoch: 6/100 | step: 100/263 | trn loss: 0.5969 | val loss: 0.5976
epoch: 6/100 | step: 200/263 | trn loss: 0.5728 | val loss: 0.5704
epoch: 7/100 | step: 100/263 | trn loss: 0.5413 | val loss: 0.5292
epoch: 7/100 | step: 200/263 | trn loss: 0.5110 | val loss: 0.5075
epoch: 8/100 | step: 100/263 | trn loss: 0.4818 | val loss: 0.

3층, weight initialization, batch norm, dropout 셋 다 쓰는건 성능 so bad 

In [84]:
# 4 layers, weight initialization(he), dropout(0.5)
trn_loss_list = []
val_loss_list = []
num_epochs = 100
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/100 | step: 100/263 | trn loss: 2.2025 | val loss: 1.9593
epoch: 1/100 | step: 200/263 | trn loss: 1.7052 | val loss: 1.4076
epoch: 2/100 | step: 100/263 | trn loss: 0.9714 | val loss: 0.8318
epoch: 2/100 | step: 200/263 | trn loss: 0.7378 | val loss: 0.6511
epoch: 3/100 | step: 100/263 | trn loss: 0.5450 | val loss: 0.4995
epoch: 3/100 | step: 200/263 | trn loss: 0.4802 | val loss: 0.4459
epoch: 4/100 | step: 100/263 | trn loss: 0.3976 | val loss: 0.3801
epoch: 4/100 | step: 200/263 | trn loss: 0.3783 | val loss: 0.3560
epoch: 5/100 | step: 100/263 | trn loss: 0.3339 | val loss: 0.3242
epoch: 5/100 | step: 200/263 | trn loss: 0.3133 | val loss: 0.3112
epoch: 6/100 | step: 100/263 | trn loss: 0.2931 | val loss: 0.2814
epoch: 6/100 | step: 200/263 | trn loss: 0.2716 | val loss: 0.2614
epoch: 7/100 | step: 100/263 | trn loss: 0.2563 | val loss: 0.2530
epoch: 7/100 | step: 200/263 | trn loss: 0.2385 | val loss: 0.2557
epoch: 8/100 | step: 100/263 | trn loss: 0.2337 | val loss: 0.

4층, weight initialization, dropout만 쓴 얘도 별로... 

In [92]:
# 4 layers, weight initialization(he), batch normalization
trn_loss_list = []
val_loss_list = []
num_epochs = 100
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/100 | step: 100/263 | trn loss: 1.1602 | val loss: 0.5769
epoch: 1/100 | step: 200/263 | trn loss: 0.4292 | val loss: 0.3414
epoch: 2/100 | step: 100/263 | trn loss: 0.2189 | val loss: 0.2274
epoch: 2/100 | step: 200/263 | trn loss: 0.1855 | val loss: 0.1915
epoch: 3/100 | step: 100/263 | trn loss: 0.1263 | val loss: 0.1569
epoch: 3/100 | step: 200/263 | trn loss: 0.1238 | val loss: 0.1444
epoch: 4/100 | step: 100/263 | trn loss: 0.0917 | val loss: 0.1276
epoch: 4/100 | step: 200/263 | trn loss: 0.0905 | val loss: 0.1197
epoch: 5/100 | step: 100/263 | trn loss: 0.0690 | val loss: 0.1069
epoch: 5/100 | step: 200/263 | trn loss: 0.0690 | val loss: 0.1011
epoch: 6/100 | step: 100/263 | trn loss: 0.0499 | val loss: 0.0986
epoch: 6/100 | step: 200/263 | trn loss: 0.0558 | val loss: 0.0929
epoch: 7/100 | step: 100/263 | trn loss: 0.0436 | val loss: 0.0906
epoch: 7/100 | step: 200/263 | trn loss: 0.0396 | val loss: 0.0847
epoch: 8/100 | step: 100/263 | trn loss: 0.0337 | val loss: 0.

dropout 대신 batchnorm만 적용했더니 val loss 0.05로 올랐다

In [116]:
# 4 layers, batch normalization
trn_loss_list = []
val_loss_list = []
num_epochs = 100
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/100 | step: 100/263 | trn loss: 1.0334 | val loss: 0.5659
epoch: 1/100 | step: 200/263 | trn loss: 0.4274 | val loss: 0.3389
epoch: 2/100 | step: 100/263 | trn loss: 0.2208 | val loss: 0.2136
epoch: 2/100 | step: 200/263 | trn loss: 0.1674 | val loss: 0.1776
epoch: 3/100 | step: 100/263 | trn loss: 0.1196 | val loss: 0.1377
epoch: 3/100 | step: 200/263 | trn loss: 0.1014 | val loss: 0.1211
epoch: 4/100 | step: 100/263 | trn loss: 0.0756 | val loss: 0.1062
epoch: 4/100 | step: 200/263 | trn loss: 0.0689 | val loss: 0.1028
epoch: 5/100 | step: 100/263 | trn loss: 0.0505 | val loss: 0.0872
epoch: 5/100 | step: 200/263 | trn loss: 0.0475 | val loss: 0.0855
epoch: 6/100 | step: 100/263 | trn loss: 0.0331 | val loss: 0.0755
epoch: 6/100 | step: 200/263 | trn loss: 0.0331 | val loss: 0.0781
epoch: 7/100 | step: 100/263 | trn loss: 0.0250 | val loss: 0.0751
epoch: 7/100 | step: 200/263 | trn loss: 0.0246 | val loss: 0.0711
epoch: 8/100 | step: 100/263 | trn loss: 0.0174 | val loss: 0.

가중치 초기화를 없애고 batch norm만 적용했더니 오히려 성능이 더 올랐다

In [104]:
# 4 layers, dropout(0.5)
trn_loss_list = []
val_loss_list = []
num_epochs = 80
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/80 | step: 100/263 | trn loss: 2.2212 | val loss: 1.9678
epoch: 1/80 | step: 200/263 | trn loss: 1.3725 | val loss: 0.8850
epoch: 2/80 | step: 100/263 | trn loss: 0.5078 | val loss: 0.4254
epoch: 2/80 | step: 200/263 | trn loss: 0.4014 | val loss: 0.3510
epoch: 3/80 | step: 100/263 | trn loss: 0.3136 | val loss: 0.2901
epoch: 3/80 | step: 200/263 | trn loss: 0.2822 | val loss: 0.2763
epoch: 4/80 | step: 100/263 | trn loss: 0.2444 | val loss: 0.2400
epoch: 4/80 | step: 200/263 | trn loss: 0.2415 | val loss: 0.2250
epoch: 5/80 | step: 100/263 | trn loss: 0.2154 | val loss: 0.2153
epoch: 5/80 | step: 200/263 | trn loss: 0.2093 | val loss: 0.2093
epoch: 6/80 | step: 100/263 | trn loss: 0.1975 | val loss: 0.1921
epoch: 6/80 | step: 200/263 | trn loss: 0.1866 | val loss: 0.1845
epoch: 7/80 | step: 100/263 | trn loss: 0.1733 | val loss: 0.1796
epoch: 7/80 | step: 200/263 | trn loss: 0.1746 | val loss: 0.1726
epoch: 8/80 | step: 100/263 | trn loss: 0.1537 | val loss: 0.1714
epoch: 8/8

가중치 초기화 없애고 dropout만 적용했더니 성능이 batch norm만 적용했을때보다 낮다

In [110]:
# 4 layers, batch normalization, dropout(0.4)
trn_loss_list = []
val_loss_list = []
num_epochs = 80
num_batches = len(traindataloader)
for epoch in range(num_epochs):  # loop over the dataset multiple times

    trn_loss = 0.0
    for i, data in enumerate(traindataloader):
        inputs, labels = data
        inputs = torch.tensor(inputs, device=device).float()
        labels = torch.tensor(labels, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # print(type(inputs), type(labels), type(outputs))
        # forward + backward + optimize
        outputs = net(inputs)
        # print(type(inputs), type(labels), type(outputs))
        loss = criterion(outputs, labels)
        # back propagation
        loss.backward()
        # weight update
        optimizer.step()
        # print statistics
        trn_loss += loss.item()

        # del(memory issue)
        del loss
        del outputs

        # 학습과정 출력
        if (i+1) % 100 == 0: # every 100 mini-batches
            with torch.no_grad(): # very very very very important!!!
                val_loss = 0.0
                for j, val in enumerate(validdataloader):
                    val_x, val_label = val
                    val_x = torch.tensor(val_x, device=device).float()
                    val_label = torch.tensor(val_label, device=device)
                    val_output = net(val_x)
                    v_loss = criterion(val_output, val_label)
                    val_loss += v_loss
                       
            print("epoch: {}/{} | step: {}/{} | trn loss: {:.4f} | val loss: {:.4f}".format(
                epoch+1, num_epochs, i+1, num_batches, trn_loss / 100, val_loss / len(validdataloader)
            ))            
            
            trn_loss_list.append(trn_loss/100)
            val_loss_list.append(val_loss/len(validdataloader))
            trn_loss = 0.0        

       

    #print("Epoch : {} loss: {}".format(epoch, running_loss))
print('Finished Training')

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


epoch: 1/80 | step: 100/263 | trn loss: 1.7695 | val loss: 1.2485
epoch: 1/80 | step: 200/263 | trn loss: 0.9898 | val loss: 0.7872
epoch: 2/80 | step: 100/263 | trn loss: 0.5503 | val loss: 0.4886
epoch: 2/80 | step: 200/263 | trn loss: 0.4247 | val loss: 0.3937
epoch: 3/80 | step: 100/263 | trn loss: 0.3079 | val loss: 0.3058
epoch: 3/80 | step: 200/263 | trn loss: 0.2758 | val loss: 0.2787
epoch: 4/80 | step: 100/263 | trn loss: 0.2224 | val loss: 0.2370
epoch: 4/80 | step: 200/263 | trn loss: 0.2135 | val loss: 0.2218
epoch: 5/80 | step: 100/263 | trn loss: 0.1796 | val loss: 0.2001
epoch: 5/80 | step: 200/263 | trn loss: 0.1674 | val loss: 0.1980
epoch: 6/80 | step: 100/263 | trn loss: 0.1436 | val loss: 0.1823
epoch: 6/80 | step: 200/263 | trn loss: 0.1457 | val loss: 0.1771
epoch: 7/80 | step: 100/263 | trn loss: 0.1235 | val loss: 0.1707
epoch: 7/80 | step: 200/263 | trn loss: 0.1249 | val loss: 0.1532
epoch: 8/80 | step: 100/263 | trn loss: 0.1109 | val loss: 0.1492
epoch: 8/8

batch norm과 dropout을 동시에 쓰면 보통 성능이 더 올라간다고 구글링에서 배웠는데 직접 여러번 실험해본 결과 이 경우엔 dropout말고 batch norm을 단독으로 썼을때가 성능이 더 높았다

따라서, 가장 성능이 괜찮았던 4 layers(input-linear-relu-linear-relu-linear-relu-linear-softmax), batch normalization, adamoptimizer로 제출하기로 했다

In [117]:
correct = 0
total = 0
net.eval()
preds = []
with torch.no_grad():
    for data in testdataloader:
        inputs = data
        inputs = torch.tensor(inputs, device=device).float()
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        for pred in predicted:
          preds.append(pred.cpu().numpy())

  


In [118]:
preds = np.array(preds)
print(preds.shape)

(18000,)


In [119]:
sample_submission["Category"] = pd.Series(preds)
sample_submission.head()

Unnamed: 0,Id,Category
0,57808,8
1,4960,0
2,35755,5
3,15543,3
4,48968,8


In [0]:
sample_submission.to_csv("torch2_submission.csv", index=False)

도대체...CNN안쓰고 성능을 0.99까지 어떻게 끌어올린것인지들,,, 우수과제 코드를 보고 공부해야겠다