In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

import torch
import torchvision
from tqdm.notebook import tqdm

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
#code to read the data into one dataframe
df = pd.DataFrame()
for i in range(5):
    df_temp = pd.read_csv(f'../data/processed_chinese_mnist_part_{i}.csv')
    df = df.append(df_temp)
df = df.reset_index(drop = True)

In [3]:
df.head()

Unnamed: 0,label,cn_label,value,0,1,2,3,4,5,6,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,1,零,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,九,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11,十,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12,百,100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13,千,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Train test split, load everything in pytorch tensor

In [4]:
train_test_ratio = 0.8
df_train = df.iloc[:int(len(df) * train_test_ratio)]
df_test = df.iloc[int(len(df) * train_test_ratio):]
df_train.shape, df_test.shape

((12000, 4099), (3000, 4099))

In [5]:
## Alternatively (no longer used)

# train_arr_data = df_train.iloc[:, 3:].values.reshape(12000, 64, 64)
# test_arr_data = df_test.iloc[:, 3:].values.reshape(3000, 64, 64)

# train_arr_data.shape, test_arr_data.shape

# train_arr_target = df_train.iloc[:, 0].values
# test_arr_target = df_test.iloc[:, 0].values

# train_arr_target.shape, test_arr_target.shape

# device = 'cpu'

# training_data = torch.FloatTensor(train_arr_data).to(device)
# training_data.shape

# training_target = torch.LongTensor(train_arr_target).to(device)
# train_arr_target.shape

# testing_data = torch.FloatTensor(test_arr_data).to(device)
# testing_target = torch.LongTensor(test_arr_target).to(device)

In [6]:
class ImgDataset():
    """
    Dataset that contain the target, data as instance variables to feed 
    into torch dataloader. In order to properly load these properties, we
    need it to have at least have the len() method and make sure that it is 
    subscriptable.
    """
    def __init__(self, input_df):
        
        input_df = input_df.reset_index(drop = True)
        
        self.target = input_df['label']
        
        self.data = (input_df.iloc[:, 3:].values
                    ).reshape(len(input_df), 1, 64, 64).astype(np.float32)
        
    def __len__(self):
        
        return len(self.target)
    
    def __getitem__(self, i):
        
        return self.data[i], self.target[i]

In [7]:
trainDataSet = ImgDataset(df_train)
testDataSet = ImgDataset(df_test)

# Define batch size, which is how many samples you use for training in one iteration
batch_size_train = 32 

batch_size_test = 1024 

# Create data loaders
train_loader = torch.utils.data.DataLoader(trainDataSet,
                                           batch_size=batch_size_train, 
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(testDataSet,
                                          batch_size=batch_size_test, 
                                          shuffle=True)

In [8]:
iterator = tqdm(train_loader, total=int(len(train_loader)))
counter = 0
for batch_idx, (data, target) in enumerate(iterator):
    dataa, target = data.to('cpu'), target.to('cpu')
    break
dataa.shape

  0%|          | 0/375 [00:00<?, ?it/s]

torch.Size([32, 1, 64, 64])

In [79]:
test = nn.MaxPool2d(2, 2)(F.relu(nn.Conv2d(1, 10, (5, 5))(dataa)))
print(test.shape) #32, 20, 20, 20
test = nn.MaxPool2d(2, 2)(F.relu(nn.Conv2d(10, 30, (5, 5))(test)))
print(test.shape) #32, 50, 8, 8
test = test.view(-1, 30 * 13 * 13)
print(test.shape) #32, 3200
test = F.relu(nn.Linear(5070, 300)(test))
print(test.shape)

torch.Size([32, 10, 30, 30])
torch.Size([32, 30, 13, 13])
torch.Size([32, 5070])
torch.Size([32, 300])


In [80]:
50 * 8 * 8

3200

DL model

In [81]:
def train(model, device, train_loader, optimizer, epoch, log_interval= 10000):
    model.train()
    counter = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        counter += 1
        
def test(model, device, test_loader, num_epoch = None, print_ = False):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    if print_: print('Epoch {}: Test set Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        num_epoch,
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [82]:
class CNN(nn.Module):
    """
    This class defines the deep learning model that extends a Module class
      The constructor of  class defines the layers of the model. 
      The forward() function defines how to forward propagate 
      input through the defined layers of the model.
      Many layers are available, such as Linear for fully connected layers, 
      Conv2d for convolutional layers, and MaxPool2d for pooling layers.
    """
    def __init__(self):
        super().__init__()
        
        #initialize drop out function
        self.dropout = nn.Dropout(p = 0.4)
        #kernel size 5 and maxPool2d (2, 2)
        self.pool = nn.MaxPool2d(2, 2)
        
        #starting shape: 32 (batch size), 1, 30, 30
        self.conv1 = nn.Conv2d(1, 10, (5, 5)) 
        #1 channel to 10 channel, kernel size 5
        self.conv2 = nn.Conv2d(10, 30, (5, 5)) 
        #5 channel to 30 channel, kernel size 5
        
        #similar to the coding tutorial, using 3 linear layers to reduce to 10 final output
        #automatically cut down to 10 units
        self.fc1 = nn.Linear(5070, 300)
        self.fc2 = nn.Linear(300, 50)
        self.fc3 = nn.Linear(50, 20)
        self.activation = nn.LogSoftmax(dim = 1)
        
    def forward(self, x):
        x = self.pool(F.relu(self.dropout(self.conv1(x))))
        #shape: 32, 20, 30, 30
        x = self.pool(F.relu(self.dropout(self.conv2(x))))
        #shape: #32, 100, 13, 13
        x = x.view(-1, 5070) #shape 32, 5070
        x = F.relu(self.fc1(x)) #shape 32, 300
        x = F.relu(self.fc2(x)) #shape 32, 50
        x = self.activation(self.fc3(x)) #shape 32, 20
        return x
     

In [83]:
learning_rate = 0.01
momentum = 0.8
device = "cuda"
model = CNN().to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      momentum=momentum)
num_epoch = 100

for epoch in range(1, num_epoch + 1):
    train(model, device, train_loader, optimizer, epoch)
    if epoch % 5 == 0:
        test(model, device, test_loader, epoch, print_ = True)
    else:
        test(model, device, test_loader, epoch)

Epoch 5: Test set Average loss: 1.3122, Accuracy: 1734/3000 (58%)
Epoch 10: Test set Average loss: 0.7892, Accuracy: 2206/3000 (74%)
Epoch 15: Test set Average loss: 0.7181, Accuracy: 2324/3000 (77%)
Epoch 20: Test set Average loss: 0.6487, Accuracy: 2392/3000 (80%)
Epoch 25: Test set Average loss: 0.6395, Accuracy: 2440/3000 (81%)
Epoch 30: Test set Average loss: 0.5480, Accuracy: 2527/3000 (84%)
Epoch 35: Test set Average loss: 0.4794, Accuracy: 2585/3000 (86%)
Epoch 40: Test set Average loss: 0.4948, Accuracy: 2574/3000 (86%)
Epoch 45: Test set Average loss: 0.6159, Accuracy: 2513/3000 (84%)
Epoch 50: Test set Average loss: 0.5978, Accuracy: 2518/3000 (84%)
Epoch 55: Test set Average loss: 0.5154, Accuracy: 2588/3000 (86%)
Epoch 60: Test set Average loss: 0.6380, Accuracy: 2499/3000 (83%)
Epoch 65: Test set Average loss: 0.6038, Accuracy: 2553/3000 (85%)
Epoch 70: Test set Average loss: 0.5155, Accuracy: 2603/3000 (87%)
Epoch 75: Test set Average loss: 0.6733, Accuracy: 2498/3000 (8

In [42]:
np.argmax(model(data).cpu().detach().numpy(), axis = 1)

array([4, 6, 9, ..., 7, 5, 4])

In [43]:
outputs = []
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        outputs.append(np.argmax(model(data).cpu().detach().numpy(), axis = 1))

In [70]:
model

CNN(
  (dropout): Dropout(p=0.4, inplace=False)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv1): Conv2d(1, 5, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(5, 30, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=5070, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=20, bias=True)
  (activation): LogSoftmax(dim=1)
)