In [3]:
import pandas as pd
import numpy as np
import sys

In [4]:
def oneline_log(text):
    sys.stdout.write('\r')
    sys.stdout.write(text)

In [5]:
class LogisticRegression():
    def __init__(self):
        super(LogisticRegression, self).__init__()

    def linear(self, x, w, b):

        return w * x.T + b

    def sigmoid(self, x):

        x = np.clip(x, -709.78, 709.78)

        return 1 / (1 + np.exp(-x))

    def forward(self, x, w, b):
        x = np.mat(x)
        w = np.mat(w)
        # print(f'x shape: {np.shape(x)}')
        # print(f'w shape: {np.shape(w)}')
        net_input = self.linear(x, w, b)
        # print(f'net_input: {net_input}')
        y_estimate = self.sigmoid(net_input)

        return y_estimate


model = LogisticRegression()

In [6]:
# CrossEntropy
class BinaryCrossEntropy():
    def __init__(self):
        super(BinaryCrossEntropy, self).__init__()
    
    def cross_entropy(self, y_pred, target):
        x = target*np.log(y_pred) + (1-target)*np.log(1-y_pred)

        return -(np.mean(x))

    def forward(self, y_pred, target):

        return self.cross_entropy(y_pred, target)
criterion = BinaryCrossEntropy()

In [7]:
# Error Function: Cross-entropy loss
# used to calculate the loss of estimate
# a: estimate value of y
# y: true value of y
def calculate_cross_entropy(y, a):
    return -np.nan_to_num(np.multiply(y, np.log(a)) + np.multiply((1-y), np.log(1-a))).mean()

In [8]:
# generate random weight for each layer
# number of weight = input * neuron of next layer

# return value is 2D array of weight w_ji 
# w_ji means for the j neuron, the weight of input i
random_scalar = 100
def generate_layer_weight(seed, neuron, input):
    np.random.seed(seed) # set seed for weight random
    # w_ji 其中 j 对应 neuron, i 对应 input，所以 reshape 也同样按照如此进行
    weight = np.random.randn(neuron,input) / random_scalar
    return weight

In [9]:
# generate random bias for each layer
# number of bias = neuron of layer

# return value is vector of bias
def generate_layer_bias(seed, neuron):
    np.random.seed(seed) # set seed for bias random
    bias = np.random.randn(neuron, 1) / random_scalar
    return bias

In [10]:
def one_hot(value):
    if value == 9:
        return np.array([0,0,0,1])
    elif value == 8:
        return np.array([0,0,1,0])
    elif value == 3:
        return np.array([0,1,0,0])
    elif value == 0:
        return np.array([1,0,0,0])

In [11]:
# print('================== Start ==================')
pd_train_origin = pd.read_csv('data/lab3_train.csv')

In [12]:
# 保留 label 为 0、3、8、9 的 data
# 用 drop() 的方法，参见: https://www.cnblogs.com/everfight/p/pandas_condition_remove.html
pd_train_origin = pd_train_origin[(pd_train_origin.label == 0) 
                                | (pd_train_origin.label == 3) 
                                | (pd_train_origin.label == 8) 
                                | (pd_train_origin.label == 9) ]

In [13]:
pd_train = pd_train_origin.sample(frac=0.9, random_state=2)
pd_validate = pd_train_origin.drop(index=pd_train.index)

In [14]:
pd_validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 1 to 15994
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 19.2 MB


In [15]:
pd_train.info()
pd_train.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12800 entries, 464 to 13599
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 76.8 MB


Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
464,9,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,255,99,0,0,0
7199,3,0,0,0,0,0,0,0,0,0,...,202,19,0,5,0,0,0,0,0,0
3303,0,0,0,0,0,0,0,0,0,0,...,158,147,199,72,0,3,0,0,0,0


In [16]:
train_feature = pd_train.drop(['label'], axis=1)
train_feature = train_feature / 255
train_target = pd.DataFrame(pd_train.label)
train_feature = np.array(train_feature)
train_target = np.array(train_target)

validate_feature = pd_validate.drop(['label'], axis=1)
validate_feature = validate_feature / 255
validate_target = pd.DataFrame(pd_validate.label)
validate_feature = np.array(validate_feature)
validate_target = np.array(validate_target)

In [17]:
# 一共 len(train_feature) 笔 data
# 每一笔 data 有 784 个 pixel

w = {} # weight of layers
b = {} # bias of layers
learning_rate = 0.0001

In [18]:
# weight for hidden layers
layer_neuron = [32, 4]
for i, neuron in enumerate(layer_neuron):
    input_size = 784 if i == 0 else layer_neuron[i-1]
    w[i+1] = generate_layer_weight(seed=1, neuron=neuron, input=input_size)
    print(f'layer {i}, weight shape: {np.shape(w[i+1])}')
    
    b[i+1] = generate_layer_bias(seed=1, neuron=neuron)
    print(f'layer {i}, bias shape: {np.shape(b[i+1])}')

layer 0, weight shape: (32, 784)
layer 0, bias shape: (32, 1)
layer 1, weight shape: (4, 32)
layer 1, bias shape: (4, 1)


In [19]:
epoch_num = 130
validate_size = len(validate_target)
train_size = len(train_target)
best_w = {} # weight of layers
best_b = {} # bias of layers
best_epoch = 0
best_train_loss = 0
best_validate_loss = 0
max_acc = 0

for epoch in range(epoch_num):
    train_loss_sum = 0
    for i, feature_data in enumerate(train_feature):
        # 第 i 笔 data 的 feature
        a = {} # output of layers
        error = {} # error of layers
        a[0] = feature_data

        # Forward
        for layer, neuron in enumerate(layer_neuron):
            output = model.forward(a[layer], w[layer+1], b[layer+1])            
            a[layer+1] = np.array(output.reshape(1,-1))[0]

        # Backward
        y = one_hot(train_target[i][0])
        loss_estimate = a[len(layer_neuron)]
        train_loss_sum += criterion.forward(loss_estimate, y)
        
        error[len(layer_neuron)] = np.mat(loss_estimate - y).T

        for layer in range(len(layer_neuron) - 1, -1, -1):
            # print(f'layer: {layer}')
            left = np.mat(w[layer+1]).T
            right = error[layer+1] * np.dot( a[layer], 1-a[layer])
            error[layer] = np.dot(left , right)
            # print(f'error {layer}: {error[layer]}')

        # Update parameter
        for layer in range(1, len(layer_neuron)+1):
            dw = np.dot(error[layer] , np.mat(a[layer-1]))
            w[layer] -= learning_rate * dw
            b[layer] -= learning_rate * error[layer]

    train_loss = train_loss_sum / train_size

    if (epoch+1) % 10 == 0:
        acc_count = 0
        validate_loss_sum = 0
        
        for i, feature_data in enumerate(validate_feature):
            a = {} # output of layers
            error = {} # error of layers
            a[0] = feature_data

            # Forward
            for layer, neuron in enumerate(layer_neuron):
                output = model.forward(a[layer], w[layer+1], b[layer+1])            
                a[layer+1] = np.array(output.reshape(1,-1))[0]

            arr = a[len(layer_neuron)]

            y = one_hot(validate_target[i][0])
         
            for i, data in enumerate(y):
                if data == 1 and arr[i] == np.max(arr):
                    acc_count += 1
            # Backward
            loss_estimate = a[len(layer_neuron)]
            validate_loss_sum += criterion.forward(loss_estimate, y)
        
        validate_loss = validate_loss_sum / validate_size
        if(max_acc < acc_count):
            max_acc = acc_count
            best_b = b
            best_w = w
            best_epoch = epoch
            best_train_loss = train_loss
            best_validate_loss = validate_loss
        print(f'epoch {epoch + 1}: train_loss = {train_loss}, validate_loss = {validate_loss}, acc = {acc_count/validate_size}')
       

epoch 10: train_loss = 0.18353670922257798, validate_loss = 0.1749036753538387, acc = 0.9265625
epoch 20: train_loss = 0.10803005294737736, validate_loss = 0.10485830698539612, acc = 0.935625
epoch 30: train_loss = 0.0893320216368268, validate_loss = 0.08548196457166247, acc = 0.9475
epoch 40: train_loss = 0.08121930007980244, validate_loss = 0.07711239091319053, acc = 0.951875
epoch 50: train_loss = 0.07651736790516547, validate_loss = 0.07259174186739042, acc = 0.9540625
epoch 60: train_loss = 0.07329841873273554, validate_loss = 0.06971562050010391, acc = 0.95625
epoch 70: train_loss = 0.0708423265217568, validate_loss = 0.06769326875701244, acc = 0.9559375
epoch 80: train_loss = 0.06883384780876613, validate_loss = 0.06618677201152283, acc = 0.9565625
epoch 90: train_loss = 0.0671169417066069, validate_loss = 0.0650226784274705, acc = 0.95625
epoch 100: train_loss = 0.0656080679585435, validate_loss = 0.0641055792563055, acc = 0.955625
epoch 110: train_loss = 0.0642645718084193, va

In [20]:
print(f'setting: layer={len(layer_neuron)}, neuron={layer_neuron}')
print(f'best result at epoch {best_epoch + 1}: train_loss = {best_train_loss}, validate_loss = {best_validate_loss}, acc = {max_acc/validate_size}')

setting: layer=2, neuron=[32, 4]
best result at epoch 130: train_loss = 0.06198728100357578, validate_loss = 0.06236801608016579, acc = 0.9571875


In [36]:
pd_test_origin = pd.read_csv('data/lab3_test.csv')
pd_test_origin = pd_test_origin / 255
pd_test_origin = np.array(pd_test_origin)
pd_test_origin

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [40]:
ans = []
for i, feature_data in enumerate(pd_test_origin):
    a = {} # output of layers
    error = {} # error of layers
    a[0] = feature_data

    # Forward
    for layer, neuron in enumerate(layer_neuron):
        output = model.forward(a[layer], best_w[layer+1], best_b[layer+1])            
        a[layer+1] = np.array(output.reshape(1,-1))[0]

    arr = a[len(layer_neuron)]

    for i, data in enumerate(arr):
        if data == np.max(arr):
            if i == 0:
                ans.append(0)
            elif i == 1:
                ans.append(3)
            elif i == 2:
                ans.append(8)
            elif i == 3:
                ans.append(9)      

In [41]:
ans

[9,
 0,
 9,
 8,
 8,
 3,
 3,
 9,
 3,
 8,
 3,
 0,
 3,
 3,
 8,
 9,
 9,
 9,
 0,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 8,
 8,
 8,
 9,
 8,
 8,
 9,
 9,
 3,
 9,
 9,
 9,
 9,
 9,
 3,
 3,
 9,
 3,
 9,
 0,
 8,
 3,
 8,
 9,
 8,
 9,
 8,
 3,
 3,
 9,
 9,
 9,
 9,
 8,
 9,
 0,
 8,
 9,
 9,
 9,
 8,
 8,
 3,
 8,
 9,
 9,
 0,
 0,
 9,
 3,
 3,
 3,
 3,
 8,
 9,
 3,
 3,
 9,
 8,
 8,
 9,
 9,
 9,
 9,
 3,
 0,
 9,
 9,
 0,
 3,
 3,
 0,
 3,
 9,
 0,
 3,
 0,
 9,
 8,
 9,
 0,
 3,
 9,
 0,
 0,
 0,
 0,
 3,
 9,
 0,
 0,
 9,
 0,
 8,
 9,
 3,
 8,
 3,
 8,
 0,
 8,
 3,
 0,
 0,
 3,
 8,
 0,
 8,
 3,
 0,
 9,
 8,
 8,
 0,
 9,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 3,
 0,
 0,
 9,
 0,
 3,
 0,
 0,
 3,
 8,
 8,
 0,
 8,
 9,
 9,
 8,
 0,
 9,
 8,
 8,
 9,
 8,
 9,
 3,
 8,
 0,
 3,
 9,
 0,
 0,
 0,
 3,
 8,
 8,
 0,
 9,
 0,
 9,
 9,
 3,
 9,
 9,
 3,
 0,
 9,
 8,
 9,
 3,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 0,
 8,
 3,
 8,
 3,
 9,
 9,
 8,
 0,
 3,
 0,
 3,
 8,
 3,
 0,
 9,
 9,
 8,
 3,
 8,
 3,
 8,
 9,
 0,
 8,
 3,
 3,
 0,
 3,
 8,
 8,
 0,
 8,
 0,
 0,
 0,
 3,
 0,
 0,
 3,
 9,
 9,
 0,


In [43]:
test_ans = pd.DataFrame(ans)
test_ans = test_ans.rename({0:'ans'},axis=1)
test_ans.to_csv('test_ans.csv', index=None)