In [7]:
import cv2
import numpy as np


將輸入圖片進行處理
- input_data 是Lenna的灰階照片
- filter_h 卷積核的高
- filter_w 卷積核的長
- stride 步幅
- pad 填充

In [8]:
# N = 檔案數量 ,C = 通道數 ,H = 高 ,W = 長
def im2col(input_data, filter_h, filter_w, stride=1, pad=1):
    N, C, H, W = input_data.shape  
    # 運算子 // 取商值
    # 因為卷積核上下左右都是圖片會被切掉的地方，所以原大小先除掉卷積核高與長
    # 再除1步長，最後再加卷積核中間的1
    out_h = (H + 2*pad - filter_h)//stride + 1  # 輸出資料的高
    out_w = (W + 2*pad - filter_w)//stride + 1  # 輸出資料的長
    # 填充 H,W
    # 想了解np.pad部分可以看 np_pad矩陣填充.ipynb
    img = np.pad(input_data, [(0,0), (0,0), (pad, pad), (pad, pad)], 'constant')
    # (N, C, filter_h, filter_w, out_h, out_w)的0矩陣
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))
    
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]
    # 按(0, 4, 5, 1, 2, 3)順序，交換col的列，然後改變形狀
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1)
    return col

In [9]:
def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_shape
    out_h = (H + 2*pad - filter_h)//stride + 1
    out_w = (W + 2*pad - filter_w)//stride + 1
    col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2)

    img = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1))
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :]

    return img[:, :, pad:H + pad, pad:W + pad]

在這裏學到了在矩陣中提取矩陣,再透由RGB 轉 YIQ的方式
- Y是提供黑白電視及彩色電視的亮度信號（Luminance），即亮度（Brightness)
- I代表In-phase，色彩從橙色到青色
- Q代表Quadrature-phase，色彩從紫色到黃綠色

In [10]:
Photo = cv2.imread(r'/Users/yeshiouwei/Desktop/影像辨識/Lenna.jpg')
a =[[0.299,0.587,0.114],[0.596,-0.275,-0.321],[0.212,-0.523,0.311]]
a = np.array(a)
for x in range(316):
    a = (Photo[x][0])[0]
    b = (Photo[x][0])[1]
    c = (Photo[x][0])[2]
    Photo[x][0] = 0.299 * a + 0.587 * b + 0.114 * c
    #print(Photo[x][0])
    for y in range(316):
        a = (Photo[x][y])[0]
        b = (Photo[x][y])[1]
        c = (Photo[x][y])[2]
        Photo[x][y] = 0.299 * a + 0.587 * b + 0.114 * c
cv2.imwrite(r'/Users/yeshiouwei/Desktop/影像辨識/Lenna_gray.jpg', Photo)
print(Photo.shape)

(316, 316, 3)


In [14]:
in_Photo = cv2.imread(r'/Users/yeshiouwei/Desktop/影像辨識/Lenna_gray.jpg')
in_Photo_4D = np.reshape(in_Photo,(1,3,316,316))
print(in_Photo_4D.shape)
print(im2col(in_Photo_4D,3,3,1,1).shape)
B = im2col(in_Photo_4D,3,3,1,1)
C = np.array([1/16,2/16,1/16,2/16,4/16,2/16,1/16,2/16,1/16,
              1/16,2/16,1/16,2/16,4/16,2/16,1/16,2/16,1/16,
              1/16,2/16,1/16,2/16,4/16,2/16,1/16,2/16,1/16])
print(C.shape)
D = np.dot(B,C)
print(B)
E = np.reshape(D,(316,316,1))
cv2.imwrite(r'/Users/yeshiouwei/Desktop/影像辨識/Lenna_Convolution.jpg', E)

(1, 3, 316, 316)
(99856, 27)
(27,)
[[  0.   0.   0. ...   0.  63.  63.]
 [  0.   0.   0. ...  63.  63.  63.]
 [  0.   0.   0. ...  63.  63.  59.]
 ...
 [141. 139. 139. ...   0.   0.   0.]
 [139. 139. 139. ...   0.   0.   0.]
 [139. 139.   0. ...   0.   0.   0.]]


True

In [None]:
class Convolution:
	# 初始化權重（卷積核4維）、偏置、步幅、填充
    def __init__(self, W, b, stride=1, pad=0):
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad
        
        # 中間資料（backward時使用）
        self.x = None   
        self.col = None
        self.col_W = None
        
        # 權重和偏置引數的梯度
        self.dW = None
        self.db = None

    def forward(self, x):
        # 卷積核大小
        FN, C, FH, FW = self.W.shape
        # 資料資料大小
        N, C, H, W = x.shape
        # 計算輸出資料大小
        out_h = 1 + int((H + 2*self.pad - FH) / self.stride)
        out_w = 1 + int((W + 2*self.pad - FW) / self.stride)
        # 利用im2col轉換為行
        col = im2col(x, FH, FW, self.stride, self.pad)
        # 卷積核轉換為列，展開為2維陣列
        col_W = self.W.reshape(FN, -1).T
        # 計算正向傳播
        out = np.dot(col, col_W) + self.b
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

        self.x = x
        self.col = col
        self.col_W = col_W

        return out

    def backward(self, dout):
        # 卷積核大小
        FN, C, FH, FW = self.W.shape
        dout = dout.transpose(0,2,3,1).reshape(-1, FN)

        self.db = np.sum(dout, axis=0)
        self.dW = np.dot(self.col.T, dout)
        self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW)

        dcol = np.dot(dout, self.col_W.T)
        # 逆轉換
        dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)

        return dx

In [None]:
class Pooling:
    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad
        
        self.x = None
        self.arg_max = None

    def forward(self, x):
        N, C, H, W = x.shape
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)
		# 展開
        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h*self.pool_w)
		# 最大值
        arg_max = np.argmax(col, axis=1)
        out = np.max(col, axis=1)
        # 轉換
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)

        self.x = x
        self.arg_max = arg_max

        return out

    def backward(self, dout):
        dout = dout.transpose(0, 2, 3, 1)
        
        pool_size = self.pool_h * self.pool_w
        dmax = np.zeros((dout.size, pool_size))
        dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,)) 
        
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)
        
        return dx