In [148]:
import pandas as pd
import numpy as np
import sys

In [149]:
def oneline_log(text):
    """Rewrite the current console line with ``text``.

    A carriage return is written first so the cursor goes back to the start of
    the line, then ``text`` is written without a trailing newline.
    """
    stream = sys.stdout
    for chunk in ('\r', text):
        stream.write(chunk)

In [150]:
class LogisticRegression():
    """One fully connected layer followed by a sigmoid activation.

    The layer holds no parameters itself: the weight matrix and bias are
    passed to ``forward`` on every call.
    """

    def __init__(self):
        super(LogisticRegression, self).__init__()

    def linear(self, x, w, b):
        """Return the pre-activation ``w * x.T + b``.

        ``forward`` passes ``x`` and ``w`` as ``np.matrix`` objects, for which
        ``*`` is the matrix product.
        """
        return w * x.T + b

    def sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        # Clamp the argument so that np.exp(-x) stays within float64 range.
        x = np.clip(x, -709.78, 709.78)

        return 1 / (1 + np.exp(-x))

    def forward(self, x, w, b):
        """Compute ``sigmoid(w * x.T + b)``.

        Parameters
        ----------
        x : array-like
            Input sample(s); converted to a matrix, so a 1-D input becomes a
            single row.
        w : array-like
            Weight matrix; converted to a matrix.
        b : array-like or scalar
            Bias added to the product (broadcast by NumPy).

        Returns
        -------
        np.matrix
            The layer output.
        """
        # np.mat was removed from the NumPy 2.0 namespace; np.asmatrix builds
        # the same np.matrix object and exists in old and new releases alike.
        x = np.asmatrix(x)
        w = np.asmatrix(w)
        net_input = self.linear(x, w, b)
        y_estimate = self.sigmoid(net_input)

        return y_estimate


model = LogisticRegression()

In [151]:
# CrossEntropy
class BinaryCrossEntropy():
    """Mean binary cross-entropy between predicted probabilities and targets."""

    def __init__(self):
        super(BinaryCrossEntropy, self).__init__()

    def cross_entropy(self, y_pred, target, eps=1e-12):
        """Return ``-mean(t*log(p) + (1-t)*log(1-p))``.

        Parameters
        ----------
        y_pred : array-like
            Predicted probabilities.
        target : array-like
            Target values (0/1), same shape as ``y_pred``.
        eps : float, optional
            Predictions are clamped to ``[eps, 1 - eps]`` before the logarithm.

        Returns
        -------
        float
            The mean loss over all elements; always finite.
        """
        # A saturated sigmoid yields exactly 0.0 or 1.0; without the clamp
        # log(0) produces -inf and 0 * -inf produces nan, which would poison
        # every loss sum this value is added to.
        y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
        target = np.asarray(target)
        x = target * np.log(y_pred) + (1 - target) * np.log(1 - y_pred)

        return -(np.mean(x))

    def forward(self, y_pred, target):
        """Return the cross-entropy loss of ``y_pred`` against ``target``."""
        return self.cross_entropy(y_pred, target)
criterion = BinaryCrossEntropy()

In [152]:
def calculate_cross_entropy(y, a):
    """Cross-entropy loss of an estimate against the true values.

    y: true value of y
    a: estimated value of y

    NaN entries of the element-wise loss are replaced by 0 and infinite
    entries by large finite numbers (``np.nan_to_num``) before averaging.
    """
    positive_part = np.multiply(y, np.log(a))
    negative_part = np.multiply(1 - y, np.log(1 - a))
    per_element = np.nan_to_num(positive_part + negative_part)
    return -per_element.mean()

In [153]:
# Divisor applied to the standard-normal draws so initial parameters start small.
random_scalar = 100


def generate_layer_weight(seed, neuron, input):
    """Create the random weight matrix of one layer.

    The matrix has ``neuron`` rows and ``input`` columns: entry ``w_ji`` is the
    weight that neuron ``j`` applies to input ``i``, so rows follow neurons and
    columns follow inputs.

    NumPy's global random state is re-seeded with ``seed`` on every call, so
    equal arguments always yield equal weights.
    """
    np.random.seed(seed)
    shape = (neuron, input)
    return np.random.randn(*shape) / random_scalar

In [154]:
def generate_layer_bias(seed, neuron):
    """Create the random bias column vector of one layer.

    The result has shape ``(neuron, 1)``: one bias per neuron. Values are
    standard-normal draws divided by ``random_scalar``.

    NumPy's global random state is re-seeded with ``seed`` on every call.
    """
    np.random.seed(seed)
    column_shape = (neuron, 1)
    return np.random.randn(*column_shape) / random_scalar

In [155]:
def one_hot(value):
    """Encode a class label as a length-4 one-hot integer vector.

    Position 0 stands for label 0, position 1 for label 3, position 2 for
    label 8 and position 3 for label 9.

    Raises
    ------
    ValueError
        If ``value`` is not one of the four supported labels. (Previously the
        function fell through and returned ``None``, which only failed later,
        far from the cause.)
    """
    class_labels = (0, 3, 8, 9)
    for position, label in enumerate(class_labels):
        if value == label:
            encoded = np.zeros(len(class_labels), dtype=int)
            encoded[position] = 1
            return encoded
    raise ValueError(f'unsupported label {value!r}; expected one of {class_labels}')

In [156]:
# Load the raw training set; later cells read its `label` column and treat
# every other column as a pixel feature.
pd_train_origin = pd.read_csv('data/lab3_train.csv')

In [157]:
# Keep only the samples whose label is 0, 3, 8 or 9.
kept_labels = [0, 3, 8, 9]
pd_train_origin = pd_train_origin[pd_train_origin.label.isin(kept_labels)]

In [158]:
# Randomly sample 80% of the rows for training; the fixed random_state makes
# the split repeatable.
pd_train = pd_train_origin.sample(frac=0.8, random_state=2)
# Rows whose index label was not sampled above form the validation set.
pd_validate = pd_train_origin.drop(index=pd_train.index)

In [159]:
pd_validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 1 to 15994
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 19.2 MB


In [160]:
pd_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12800 entries, 464 to 13599
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 76.8 MB


In [161]:
def _split_feature_target(frame):
    """Split a labelled frame into a scaled feature array and a target array.

    Features are all columns except ``label``, divided by 255. The target is
    the ``label`` column kept as a single-column 2-D array, so a label is read
    back as ``target[i][0]``.
    """
    scaled_pixels = frame.drop(['label'], axis=1) / 255
    label_column = pd.DataFrame(frame.label)
    return np.array(scaled_pixels), np.array(label_column)


train_feature, train_target = _split_feature_target(pd_train)

validate_feature, validate_target = _split_feature_target(pd_validate)

In [162]:
# There are len(train_feature) samples in total.
# Each sample has 784 pixels.

w = {} # weight matrix of each layer, filled in by the next cell under 1-based layer keys
b = {} # bias vector of each layer, filled in by the next cell under 1-based layer keys

In [163]:
# Network layout: `layer_neuron` lists the neuron count of every trainable
# layer, `all_layer_neuron` additionally starts with the 784-pixel input.
layer_neuron = [4]
all_layer_neuron = [784] + layer_neuron
print(all_layer_neuron)

# Pairing all_layer_neuron with layer_neuron gives, for each layer, the size
# of its input (the previous layer) together with its own neuron count.
for i, (input_size, neuron) in enumerate(zip(all_layer_neuron, layer_neuron)):
    w[i+1] = generate_layer_weight(seed=2, neuron=neuron, input=input_size)
    b[i+1] = generate_layer_bias(seed=2, neuron=neuron)
    print(f'layer {i}, weight shape: {np.shape(w[i+1])}')
    print(f'layer {i}, bias shape: {np.shape(b[i+1])}')

[784, 4]
layer 0, weight shape: (4, 784)
layer 0, bias shape: (4, 1)


In [164]:
# Training hyper-parameters and the bookkeeping variables updated by the training loop.
epoch_num = 1000
learning_rate = 0.002
validate_size = len(validate_target)
train_size = len(train_target)
best_w = {} # assigned from w whenever validation accuracy reaches a new maximum
best_b = {} # assigned from b whenever validation accuracy reaches a new maximum
best_epoch = 0
best_train_loss = 0
best_validate_loss = 0
max_train_acc = 0 # stored as a count of correct samples; divided by train_size when reported
max_validate_acc = 0 # stored as a count of correct samples; divided by validate_size when reported
overfit_threshold = 0.005 # training stops once validation accuracy falls more than this below its best value (only checked when epoch > 100)
is_overfit = False

In [165]:
def _forward_pass(feature_data, weights, biases):
    """Feed one sample through every layer.

    Returns a dict mapping layer number to that layer's output as a 1-D array;
    key 0 holds the input sample itself.
    """
    activations = {0: feature_data}
    for layer in range(len(layer_neuron)):
        output = model.forward(activations[layer], weights[layer + 1], biases[layer + 1])
        activations[layer + 1] = np.asarray(output).reshape(-1)
    return activations


def _is_correct(y_estimate, y):
    """Return True when the output of the true class equals the largest output."""
    return y_estimate[np.argmax(y)] == np.max(y_estimate)


# Per-sample (online) gradient descent. Training loss and accuracy are
# accumulated while the parameters are still changing inside the epoch;
# validation is evaluated once per epoch on the parameters reached at its end.
output_layer = len(layer_neuron)

for epoch in range(epoch_num):
    train_loss_sum = 0
    train_acc_count = 0
    oneline_log(f'epoch {epoch + 1}')

    for sample_idx, feature_data in enumerate(train_feature):
        # Forward
        a = _forward_pass(feature_data, w, b)
        y = one_hot(train_target[sample_idx][0])
        loss_estimate = a[output_layer]
        train_loss_sum += criterion.forward(loss_estimate, y)
        if _is_correct(loss_estimate, y):
            train_acc_count += 1

        # Backward
        # For a sigmoid output with cross-entropy loss the output-layer error
        # is simply (estimate - target); it is kept as a column vector.
        error = {output_layer: (loss_estimate - y).reshape(-1, 1)}

        # Hidden-layer error: propagate through the next layer's weights, then
        # scale element-wise by the sigmoid derivative a * (1 - a). The input
        # layer (layer 0) has no parameters, so no error is computed for it.
        for layer in range(output_layer - 1, 0, -1):
            back_propagated = np.dot(w[layer + 1].T, error[layer + 1])
            sigmoid_grad = (a[layer] * (1 - a[layer])).reshape(-1, 1)
            error[layer] = back_propagated * sigmoid_grad

        # Update parameter (all errors above were computed with the old weights)
        for layer in range(1, output_layer + 1):
            dw = np.dot(error[layer], a[layer - 1].reshape(1, -1))
            w[layer] -= learning_rate * dw
            b[layer] -= learning_rate * error[layer]

    train_loss = train_loss_sum / train_size

    # Validation: forward pass only.
    validate_acc_count = 0
    validate_loss_sum = 0
    for sample_idx, feature_data in enumerate(validate_feature):
        a = _forward_pass(feature_data, w, b)
        y = one_hot(validate_target[sample_idx][0])
        loss_estimate = a[output_layer]
        if _is_correct(loss_estimate, y):
            validate_acc_count += 1
        validate_loss_sum += criterion.forward(loss_estimate, y)

    validate_loss = validate_loss_sum / validate_size

    if max_train_acc < train_acc_count:
        max_train_acc = train_acc_count
        best_train_loss = train_loss
    if max_validate_acc < validate_acc_count:
        max_validate_acc = validate_acc_count
        best_validate_loss = validate_loss
        # w and b are updated in place, so a plain assignment would make the
        # "best" parameters follow every later update; store copies instead.
        best_w = {layer: weights.copy() for layer, weights in w.items()}
        best_b = {layer: bias.copy() for layer, bias in b.items()}
        best_epoch = epoch
    oneline_log('')
    print(f'epoch {epoch + 1}: max_validate_acc = {max_validate_acc/validate_size}, train_acc = {train_acc_count / train_size}, train_loss = {train_loss}, validate_loss = {validate_loss}, validate_acc = {validate_acc_count/validate_size}')

    # Early stop: validation accuracy fell noticeably below its best value.
    if max_validate_acc/validate_size - validate_acc_count/validate_size > overfit_threshold and epoch > 100:
        is_overfit = True
        print('========== stop because overfitting ==========')
        break

epoch 1: max_validate_acc = 0.945625, train_acc = 0.930234375, train_loss = 0.11835313905343266, validate_loss = 0.08244715217030026, validate_acc = 0.945625
epoch 2: max_validate_acc = 0.951875, train_acc = 0.945, train_loss = 0.08409085795343635, validate_loss = 0.07395611992852166, validate_acc = 0.951875
epoch 3: max_validate_acc = 0.9546875, train_acc = 0.95, train_loss = 0.07830884057306053, validate_loss = 0.07021348527760697, validate_acc = 0.9546875
epoch 4: max_validate_acc = 0.955, train_acc = 0.952578125, train_loss = 0.07503021777933609, validate_loss = 0.06794753592404668, validate_acc = 0.955
epoch 5: max_validate_acc = 0.9559375, train_acc = 0.953359375, train_loss = 0.07274725100260906, validate_loss = 0.06638738796176825, validate_acc = 0.9559375
epoch 6: max_validate_acc = 0.958125, train_acc = 0.954765625, train_loss = 0.07100631363778409, validate_loss = 0.06523646013336669, validate_acc = 0.958125
epoch 7: max_validate_acc = 0.9584375, train_acc = 0.955078125, tra

KeyboardInterrupt: 

In [None]:
# Summary of the run. best_epoch is the epoch of the best validation accuracy;
# max_train_acc is tracked independently of it, so the reported train_acc may
# come from a different epoch than validate_acc.
print(f'best result at epoch {best_epoch + 1}: learning_rate={learning_rate}, neuron={all_layer_neuron}, validate_acc={max_validate_acc/validate_size}, train_acc={max_train_acc/train_size}')

best result at epoch 16: learning_rate=0.002, neuron=[784, 36, 4], validate_acc=0.9621875, train_acc=0.97546875


In [None]:
# Load the test set and divide by 255, the same scaling applied to the
# training and validation features.
# NOTE(review): every column is scaled, so this assumes the file contains
# only pixel columns (no label column) - confirm.
pd_test_origin = pd.read_csv('data/lab3_test.csv')
pd_test_origin = pd_test_origin / 255
# From here on the name refers to a NumPy array rather than a DataFrame.
pd_test_origin = np.array(pd_test_origin)
pd_test_origin

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Predict a label for every test sample with the best parameters found during
# training. Output position k of the network corresponds to class_labels[k].
class_labels = (0, 3, 8, 9)
output_layer = len(layer_neuron)

ans = []
for feature_data in pd_test_origin:
    a = {0: feature_data} # output of layers, key 0 is the input sample

    # Forward
    for layer in range(output_layer):
        output = model.forward(a[layer], best_w[layer+1], best_b[layer+1])
        a[layer+1] = np.array(output.reshape(1,-1))[0]

    # argmax yields exactly one index per sample. Comparing every output with
    # the maximum appended several answers on a tie (and none for a NaN
    # output), which shifted all following rows of the answer file.
    ans.append(class_labels[int(np.argmax(a[output_layer]))])

In [None]:
# Put the predictions into a single column named 'ans', write them to disk
# without the row index, and display the frame.
test_ans = pd.DataFrame(ans).rename(columns={0: 'ans'})
test_ans.to_csv('test_ans.csv', index=None)
test_ans

Unnamed: 0,ans
0,9
1,0
2,9
3,8
4,8
...,...
3995,3
3996,0
3997,9
3998,0
