In [None]:
"""
HW7的任務是模型壓縮 - Neural Network Compression。

Compression有很多種門派，在這裡我們會介紹上課出現過的其中四種，分別是:

知識蒸餾 Knowledge Distillation
網路剪枝 Network Pruning
用少量參數來做CNN Architecture Design
參數量化 Weight Quantization
在這個notebook中我們會介紹MobileNet v1的Architecture Design。
"""

# Architecture Design

## Depthwise & Pointwise Convolution
![](https://i.imgur.com/FBgcA0s.png)
> 藍色為上下層Channel的關係，綠色則為該Receptive Field的擴張。
> (圖片引用自arxiv:1810.04231)

(a) 就是一般的Convolution Layer，所以他的Weight連接方式會跟Fully Connected一樣，只差在原本在FC是用數字相乘後相加，Convolution Layer是圖片卷積後相加。

(b) DW(Depthwise Convolution Layer)你可以想像成一張feature map各自過**一個filter**處理後，再用PW(Pointwise Convolution Layer)把所有feature map的單個pixel資訊合在一起(就是1個pixel的Fully Connected Layer)。

(c) GC(Group Convolution Layer)就是把feature map分組，讓他們自己過Convolution Layer後再重新Concat起來。算是一般的Convolution和Depthwise Convolution的折衷版。**所以說，Group Convolution的Group=Input Features數就會是Depthwise Convolution(因為每個Channel都各自獨立)，Group=1就會是一般的Convolution(因為就等於沒有Group)。**

<img src="https://i.imgur.com/Hqhg0Q9.png" width="500px">


## 實作細節
```python
# 一般的Convolution, weight大小 = in_chs * out_chs * kernel_size^2
nn.Conv2d(in_chs, out_chs, kernel_size, stride, padding)

# Group Convolution, Group數目可以自行控制，表示要分成幾群。其中in_chs和out_chs必須要可以被groups整除。(不然沒辦法分群。)
nn.Conv2d(in_chs, out_chs, kernel_size, stride, padding, groups=groups)

# Depthwise Convolution, 輸入chs=輸出chs=Groups數目, weight大小 = in_chs * kernel_size^2
nn.Conv2d(in_chs, in_chs, kernel_size, stride, padding, groups=in_chs)

# Pointwise Convolution, 也就是1 by 1 convolution, weight大小 = in_chs * out_chs
nn.Conv2d(in_chs, out_chs, 1)
```



In [2]:
#Architecture Design
#model
import torch.nn as nn  
import torch.nn.functional as F
import torch 
import os 
import numpy as np 
import cv2

In [None]:
class StudentNet(nn.Module):
    """Small CNN classifier built from depthwise + pointwise convolutions.

    The network is a stem (regular 3x3 convolution) followed by seven
    depthwise/pointwise blocks, a global average pooling layer and a single
    linear layer producing 11 logits.

    The class is called StudentNet because it is meant to be used as the
    student model for knowledge distillation later on.

    Args:
        base: Number of output channels of the stem. The channel count of
            every later layer is ``base`` times a fixed multiplier
            (1, 2, 4, 8, 16, 16, 16, 16).
        width_mult: Scale factor applied to the channel counts at indices
            3..6 of that list (the result is truncated with ``int``). The
            first three entries and the last one are left untouched, so the
            input size of the final linear layer does not depend on it.

    Note:
        The first depthwise block previously used a full 3x3 convolution
        (``groups`` was missing). It is now depthwise like every other block,
        which changes the shape of ``cnn.1.0.weight``; checkpoints saved with
        the old definition will not load into this one.
    """

    def __init__(self, base=16, width_mult=1):
        super().__init__()
        multiplier = [1, 2, 4, 8, 16, 16, 16, 16]
        bandwidth = [base * m for m in multiplier]

        # Only the middle layers (indices 3..6) are scaled by width_mult.
        for i in range(3, 7):
            bandwidth[i] = int(bandwidth[i] * width_mult)

        self.cnn = nn.Sequential(
            # Stem: a regular convolution, since the input only has 3 channels.
            nn.Sequential(
                nn.Conv2d(3, bandwidth[0], 3, 1, 1),
                nn.BatchNorm2d(bandwidth[0]),
                nn.ReLU6(),
                nn.MaxPool2d(2, 2, 0),
            ),
            # Depthwise/pointwise blocks that also halve the spatial size.
            self._dw_pw_block(bandwidth[0], bandwidth[1], pool=True),
            self._dw_pw_block(bandwidth[1], bandwidth[2], pool=True),
            self._dw_pw_block(bandwidth[2], bandwidth[3], pool=True),
            # From here on the spatial size is kept (no more pooling).
            self._dw_pw_block(bandwidth[3], bandwidth[4]),
            self._dw_pw_block(bandwidth[4], bandwidth[5]),
            self._dw_pw_block(bandwidth[5], bandwidth[6]),
            self._dw_pw_block(bandwidth[6], bandwidth[7]),
            # Global average pooling: whatever the input image size is, the
            # feature map is reduced to 1x1 so the linear layer always fits.
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Sequential(
            nn.Linear(bandwidth[7], 11),
        )

    @staticmethod
    def _dw_pw_block(in_chs, out_chs, pool=False):
        """Build one depthwise(3x3) -> BN -> ReLU6 -> pointwise(1x1) block.

        The layer order matches the hand-written blocks this helper replaces,
        so sub-module indices (and therefore state-dict keys) are unchanged.
        """
        layers = [
            # Depthwise: one 3x3 filter per input channel (groups == in_chs).
            nn.Conv2d(in_chs, in_chs, 3, 1, 1, groups=in_chs),
            nn.BatchNorm2d(in_chs),
            nn.ReLU6(),
            # Pointwise: 1x1 convolution mixing the channels.
            nn.Conv2d(in_chs, out_chs, 1),
        ]
        if pool:
            layers.append(nn.MaxPool2d(2, 2, 0))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the 11 class logits for a batch of images ``x``."""
        out = self.cnn(x)
        # Flatten (N, C, 1, 1) to (N, C) for the linear layer.
        out = out.view(out.size(0), -1)
        return self.fc(out)


In [None]:
def readfile(path, label):
    """Load every image in a directory into one uint8 array of 128x128 images.

    Files are processed in sorted file-name order.

    Args:
        path: Directory containing the image files.
        label: If truthy, the integer before the first ``_`` in each file
            name is used as that image's class label.

    Returns:
        ``x`` of shape ``(num_files, 128, 128, 3)`` and dtype ``uint8`` when
        ``label`` is falsy, otherwise the tuple ``(x, y)`` where ``y`` has
        shape ``(num_files,)`` and dtype ``uint8``.

    Raises:
        ValueError: If a file cannot be read as an image, or (when ``label``
            is truthy) its name does not start with an integer.
    """
    file_names = sorted(os.listdir(path))
    x = np.zeros((len(file_names), 128, 128, 3), dtype=np.uint8)
    y = np.zeros(len(file_names), dtype=np.uint8)
    for i, file_name in enumerate(file_names):
        file_path = os.path.join(path, file_name)
        img = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        if img is None:
            raise ValueError("cannot read image file: {}".format(file_path))
        x[i] = cv2.resize(img, (128, 128))
        if label:
            y[i] = int(file_name.split("_")[0])
    if label:
        return x, y
    return x

In [None]:
# Root directory that holds the "training", "validation" and "testing" folders.
workspace_dir = '../HW_3/data'
print("reading data...")

# Training and validation files carry their label in the file name.
train_x, train_y = readfile(path=os.path.join(workspace_dir, "training"), label=True)
print("Size of training data  = {}".format(len(train_x)))

val_x, val_y = readfile(path=os.path.join(workspace_dir, "validation"), label=True)
print("Size of validation data  = {}".format(len(val_x)))

# The testing images are read without labels.
test_x = readfile(path=os.path.join(workspace_dir, "testing"), label=False)
print("Size of Testing data = {}".format(len(test_x)))

print("Over")

In [None]:
# training 時做 data augmentation
import torchvision.transforms as transforms
from torch.utils.data import DataLoader,Dataset
# No data augmentation at validation / test time: only convert to a tensor.
test_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.ToTensor(),
    ]
)

# Training pipeline with data augmentation.
train_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        # Randomly flip the image horizontally.
        transforms.RandomHorizontalFlip(),
        # Randomly rotate the image (argument: 15).
        transforms.RandomRotation(15),
        # Convert the image to a Tensor, scaling values to [0, 1].
        transforms.ToTensor(),
    ]
)
class ImgDataset(Dataset):
    """Dataset over an indexable collection of samples with optional labels.

    Args:
        x: Indexable collection of samples; ``len(x)`` is the dataset size.
        y: Optional labels. When given they are converted to a
            ``torch.LongTensor``; when omitted the dataset is unlabeled.
        transform: Optional callable applied to each sample on access.
    """

    def __init__(self, x, y=None, transform=None):
        self.x = x
        # Labels are required to be a LongTensor; keep None if unlabeled.
        self.y = None if y is None else torch.LongTensor(y)
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(sample, label)`` if labels exist, otherwise ``sample``."""
        sample = self.x[index]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.y is None:
            return sample
        return sample, self.y[index]

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 4

# Training samples go through the augmenting transform, validation samples
# through the plain one.
train_set = ImgDataset(train_x, y=train_y, transform=train_transform)
val_set = ImgDataset(val_x, y=val_y, transform=test_transform)

# Only the training data is shuffled.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StudentNet().to(device)
# Classification loss. `cirection` is the original (misspelled) name and is
# kept as an alias because other cells refer to it.
criterion = nn.CrossEntropyLoss()
cirection = criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train `model` on the training split; after every epoch evaluate it on the
# validation split and print accuracy / mean per-sample loss for both.
import time

epochs = 30
for epoch in range(epochs):
    epoch_start_time = time.time()
    train_acc = 0.0
    val_acc = 0.0
    train_loss = 0.0
    val_loss = 0.0

    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = cirection(y_pred, y.long())
        loss.backward()
        optimizer.step()
        # Number of correct predictions in this batch.
        train_acc += (y_pred.argmax(dim=1) == y).sum().item()
        # The loss is a batch mean; weight it by the batch size so that
        # dividing by the dataset size below yields a per-sample mean.
        train_loss += loss.item() * x.size(0)

    model.eval()
    with torch.no_grad():
        for valx, valy in val_loader:
            valx, valy = valx.to(device), valy.to(device)
            val_pred = model(valx)
            batch_loss = cirection(val_pred, valy.long())
            val_acc += (val_pred.argmax(dim=1) == valy).sum().item()
            val_loss += batch_loss.item() * valx.size(0)

    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' %
          (epoch + 1, epochs, time.time() - epoch_start_time,
           train_acc / len(train_set), train_loss / len(train_set),
           val_acc / len(val_set), val_loss / len(val_set)))


In [None]:
# Merge the training and validation splits so a model can be trained on both.
train_val_x = np.concatenate([train_x, val_x])
train_val_y = np.concatenate([train_y, val_y])

# The merged set uses the same augmenting transform as the training set.
train_val_set = ImgDataset(train_val_x, y=train_val_y, transform=train_transform)
train_val_loader = DataLoader(dataset=train_val_set, batch_size=batch_size, shuffle=True)

In [None]:
# Train a fresh StudentNet on the merged training + validation data, then
# save its weights to 'studentnet.pth'.
model_best = StudentNet().to(device)
# A separate optimizer is required: `optimizer` only holds the parameters of
# the earlier `model`, so stepping it would never update `model_best`.
optimizer_best = torch.optim.Adam(model_best.parameters(), lr=0.001)

epochs = 30
for epoch in range(epochs):
    epoch_start_time = time.time()
    train_acc = 0.0
    train_loss = 0.0

    model_best.train()
    for x, y in train_val_loader:
        x, y = x.to(device), y.to(device)
        optimizer_best.zero_grad()
        y_pred = model_best(x)
        loss = cirection(y_pred, y.long())
        loss.backward()
        optimizer_best.step()
        # Number of correct predictions in this batch.
        train_acc += (y_pred.argmax(dim=1) == y).sum().item()
        # .item() detaches the value so the autograd graph is not kept alive;
        # weighting by the batch size makes the printed loss a per-sample mean.
        train_loss += loss.item() * x.size(0)

    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f ' %
          (epoch + 1, epochs, time.time() - epoch_start_time,
           train_acc / len(train_val_set), train_loss / len(train_val_set)))

# Save only the learned parameters (state dict) of the final model.
torch.save(model_best.state_dict(), 'studentnet.pth')