In [18]:
import pandas as pd
import numpy as np
import sys

In [19]:
def oneline_log(text):
    sys.stdout.write('\r')
    sys.stdout.write(text)

In [20]:
class LogisticRegression():
    def __init__(self):
        super(LogisticRegression, self).__init__()

    def linear(self, x, w, b):

        return w * x.T + b

    def sigmoid(self, x):

        x = np.clip(x, -709.78, 709.78)

        return 1 / (1 + np.exp(-x))

    def forward(self, x, w, b):
        x = np.mat(x)
        w = np.mat(w)
        # print(f'x shape: {np.shape(x)}')
        # print(f'w shape: {np.shape(w)}')
        net_input = self.linear(x, w, b)
        # print(f'net_input: {net_input}')
        y_estimate = self.sigmoid(net_input)

        return y_estimate


model = LogisticRegression()

In [21]:
# CrossEntropy
class BinaryCrossEntropy():
    def __init__(self):
        super(BinaryCrossEntropy, self).__init__()
    
    def cross_entropy(self, y_pred, target):
        x = target*np.log(y_pred) + (1-target)*np.log(1-y_pred)

        return -(np.mean(x))

    def forward(self, y_pred, target):

        return self.cross_entropy(y_pred, target)
criterion = BinaryCrossEntropy()

In [22]:
# Error Function: Cross-entropy loss
# used to calculate the loss of estimate
# a: estimate value of y
# y: true value of y
def calculate_cross_entropy(y, a):
    return -np.nan_to_num(np.multiply(y, np.log(a)) + np.multiply((1-y), np.log(1-a))).mean()

In [23]:
# generate random weight for each layer
# number of weight = input * neuron of next layer

# return value is 2D array of weight w_ji 
# w_ji means for the j neuron, the weight of input i
random_scalar = 100
def generate_layer_weight(seed, neuron, input):
    np.random.seed(seed) # set seed for weight random
    # w_ji 其中 j 对应 neuron, i 对应 input，所以 reshape 也同样按照如此进行
    weight = np.random.randn(neuron,input) / random_scalar
    # weight = np.zeros((neuron,input))
    return weight

In [24]:
# generate random bias for each layer
# number of bias = neuron of layer

# return value is vector of bias
def generate_layer_bias(seed, neuron):
    np.random.seed(seed) # set seed for bias random
    bias = np.random.randn(neuron, 1) / random_scalar
    # bias = np.zeros((neuron,1))
    return bias

In [25]:
def one_hot(value):
    if value == 9:
        return np.array([0,0,0,1])
    elif value == 8:
        return np.array([0,0,1,0])
    elif value == 3:
        return np.array([0,1,0,0])
    elif value == 0:
        return np.array([1,0,0,0])

In [26]:
# print('================== Start ==================')
pd_train_origin = pd.read_csv('data/lab3_train.csv')

In [27]:
# 保留 label 为 0、3、8、9 的 data
# 用 drop() 的方法，参见: https://www.cnblogs.com/everfight/p/pandas_condition_remove.html
pd_train_origin = pd_train_origin[(pd_train_origin.label == 0) 
                                | (pd_train_origin.label == 3) 
                                | (pd_train_origin.label == 8) 
                                | (pd_train_origin.label == 9) ]

In [28]:
pd_train = pd_train_origin.sample(frac=0.8, random_state=2)
pd_validate = pd_train_origin.drop(index=pd_train.index)

In [29]:
pd_validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 1 to 15994
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 19.2 MB


In [30]:
pd_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12800 entries, 464 to 13599
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 76.8 MB


In [31]:
train_feature = pd_train.drop(['label'], axis=1)
train_feature = train_feature / 255
train_target = pd.DataFrame(pd_train.label)
train_feature = np.array(train_feature)
train_target = np.array(train_target)

validate_feature = pd_validate.drop(['label'], axis=1)
validate_feature = validate_feature / 255
validate_target = pd.DataFrame(pd_validate.label)
validate_feature = np.array(validate_feature)
validate_target = np.array(validate_target)

In [32]:
# 一共 len(train_feature) 笔 data
# 每一笔 data 有 784 个 pixel

w = {} # weight of layers
b = {} # bias of layers
learning_rate = 0.001

In [33]:
# weight for hidden layers
layer_neuron = [12, 4]
for i, neuron in enumerate(layer_neuron):
    input_size = 784 if i == 0 else layer_neuron[i-1]
    w[i+1] = generate_layer_weight(seed=2, neuron=neuron, input=input_size)
    print(f'layer {i}, weight shape: {np.shape(w[i+1])}')
    print(w[i+1])
    b[i+1] = generate_layer_bias(seed=2, neuron=neuron)
    print(f'layer {i}, bias shape: {np.shape(b[i+1])}')

layer 0, weight shape: (12, 784)
[[-0.00416758 -0.00056267 -0.02136196 ... -0.00616844  0.00321336
  -0.00946447]
 [-0.00530139 -0.01259207  0.01677544 ... -0.00328425 -0.00562311
   0.00117914]
 [ 0.00738638 -0.01587296  0.001532   ... -0.00842856  0.01004047
   0.00054583]
 ...
 [-0.00545023 -0.00350528  0.01290577 ...  0.00489264  0.02207854
  -0.00779413]
 [ 0.00637325  0.00928663  0.00553019 ...  0.00933935 -0.01459427
   0.00129041]
 [ 0.00933665  0.00643812 -0.00766691 ...  0.01172525  0.00610519
  -0.0129887 ]]
layer 0, bias shape: (12, 1)
layer 1, weight shape: (4, 12)
[[-4.16757847e-03 -5.62668272e-04 -2.13619610e-02  1.64027081e-02
  -1.79343559e-02 -8.41747366e-03  5.02881417e-03 -1.24528809e-02
  -1.05795222e-02 -9.09007615e-03  5.51454045e-03  2.29220801e-02]
 [ 4.15393930e-04 -1.11792545e-02  5.39058321e-03 -5.96159700e-03
  -1.91304965e-04  1.17500122e-02 -7.47870949e-03  9.02525097e-05
  -8.78107893e-03 -1.56434170e-03  2.56570452e-03 -9.88779049e-03]
 [-3.38821966e-03

In [34]:
epoch_num = 1000
validate_size = len(validate_target)
train_size = len(train_target)
best_w = {} # weight of layers
best_b = {} # bias of layers
best_epoch = 0
best_train_loss = 0
best_validate_loss = 0
max_acc = 0
overfit_threshold = 0.015 # 如果 acc 比 max_acc 小 overfit_threshold 的话
is_overfit = False

for epoch in range(epoch_num):
    train_loss_sum = 0
    oneline_log(f'epoch {epoch + 1}')
    
    for i, feature_data in enumerate(train_feature):
        # 第 i 笔 data 的 feature
        a = {} # output of layers
        error = {} # error of layers
        a[0] = feature_data

        # Forward
        for layer, neuron in enumerate(layer_neuron):
            output = model.forward(a[layer], w[layer+1], b[layer+1])            
            a[layer+1] = np.array(output.reshape(1,-1))[0]

        # Backward
        y = one_hot(train_target[i][0])
        loss_estimate = a[len(layer_neuron)]
        train_loss_sum += criterion.forward(loss_estimate, y)
        
        error[len(layer_neuron)] = np.mat(loss_estimate - y).T

        for layer in range(len(layer_neuron) - 1, -1, -1):
            # print(f'layer: {layer}')
            left = np.mat(w[layer+1]).T
            right = error[layer+1] * np.dot( a[layer], 1-a[layer])
            error[layer] = np.dot(left , right)
            # print(f'error {layer}: {error[layer]}')

        # Update parameter
        for layer in range(1, len(layer_neuron)+1):
            dw = np.dot(error[layer] , np.mat(a[layer-1]))
            w[layer] -= learning_rate * dw
            b[layer] -= learning_rate * error[layer]

    train_loss = train_loss_sum / train_size

    if (epoch+1) % 1 == 0:
        acc_count = 0
        validate_loss_sum = 0
        
        for i, feature_data in enumerate(validate_feature):
            a = {} # output of layers
            error = {} # error of layers
            a[0] = feature_data

            # Forward
            for layer, neuron in enumerate(layer_neuron):
                output = model.forward(a[layer], w[layer+1], b[layer+1])            
                a[layer+1] = np.array(output.reshape(1,-1))[0]

            arr = a[len(layer_neuron)]

            y = one_hot(validate_target[i][0])
         
            for i, data in enumerate(y):
                if data == 1 and arr[i] == np.max(arr):
                    acc_count += 1
            # Backward
            loss_estimate = a[len(layer_neuron)]
            validate_loss_sum += criterion.forward(loss_estimate, y)
        
        validate_loss = validate_loss_sum / validate_size
        if(max_acc < acc_count):
            max_acc = acc_count
            best_b = b
            best_w = w
            best_epoch = epoch
            best_train_loss = train_loss
            best_validate_loss = validate_loss
        oneline_log('')
        print(f'epoch {epoch + 1}: train_loss = {train_loss}, validate_loss = {validate_loss}, max_acc = {max_acc/validate_size}, acc = {acc_count/validate_size}')
    
    if max_acc/validate_size - acc_count/validate_size > overfit_threshold and epoch > 100:
        is_overfit = True
        print('========== stop because overfitting ==========')
        break

epoch 1: train_loss = 0.5161905708996076, validate_loss = 0.42363786706345996, max_acc = 0.6925, acc = 0.6925
epoch 2: train_loss = 0.3536492805583085, validate_loss = 0.2904536367487461, max_acc = 0.85, acc = 0.85
epoch 3: train_loss = 0.2632947249952197, validate_loss = 0.23551573874595647, max_acc = 0.89875, acc = 0.89875
epoch 4: train_loss = 0.20203896356033976, validate_loss = 0.16842273891162104, max_acc = 0.90875, acc = 0.90875
epoch 5: train_loss = 0.15153446919180585, validate_loss = 0.1366400153021799, max_acc = 0.935, acc = 0.935
epoch 6: train_loss = 0.12832562262602015, validate_loss = 0.11787197619962758, max_acc = 0.9390625, acc = 0.9390625
epoch 7: train_loss = 0.1150746212638647, validate_loss = 0.10620711416570985, max_acc = 0.9390625, acc = 0.9384375
epoch 8: train_loss = 0.10603949515188102, validate_loss = 0.09853355514417433, max_acc = 0.9415625, acc = 0.9415625
epoch 9: train_loss = 0.09981405283076944, validate_loss = 0.09292449956071648, max_acc = 0.9421875, a

In [35]:
print(f'setting: layer={len(layer_neuron)}, neuron={layer_neuron}')
print(f'best result at epoch {best_epoch + 1}: train_loss = {best_train_loss}, validate_loss = {best_validate_loss}, acc = {max_acc/validate_size}')

setting: layer=2, neuron=[12, 4]
best result at epoch 292: train_loss = 0.046683675868383796, validate_loss = 0.06296122842971533, acc = 0.9634375


In [36]:
pd_test_origin = pd.read_csv('data/lab3_test.csv')
# pd_test_origin = pd_test_origin / 255
pd_test_origin = np.array(pd_test_origin)
pd_test_origin

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
ans = []
for i, feature_data in enumerate(pd_test_origin):
    a = {} # output of layers
    error = {} # error of layers
    a[0] = feature_data

    # Forward
    for layer, neuron in enumerate(layer_neuron):
        output = model.forward(a[layer], best_w[layer+1], best_b[layer+1])            
        a[layer+1] = np.array(output.reshape(1,-1))[0]

    arr = a[len(layer_neuron)]

    for i, data in enumerate(arr):
        if data == np.max(arr):
            if i == 0:
                ans.append(0)
            elif i == 1:
                ans.append(3)
            elif i == 2:
                ans.append(8)
            elif i == 3:
                ans.append(9)      

In [38]:
ans

[9,
 0,
 9,
 8,
 8,
 3,
 3,
 9,
 3,
 8,
 3,
 0,
 3,
 3,
 8,
 9,
 9,
 9,
 0,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 8,
 8,
 8,
 9,
 8,
 8,
 9,
 9,
 3,
 9,
 9,
 9,
 9,
 9,
 0,
 3,
 9,
 3,
 9,
 0,
 8,
 3,
 8,
 9,
 8,
 9,
 8,
 3,
 0,
 9,
 9,
 9,
 9,
 0,
 9,
 0,
 8,
 9,
 9,
 9,
 8,
 8,
 3,
 8,
 9,
 9,
 0,
 0,
 9,
 8,
 3,
 3,
 3,
 8,
 9,
 3,
 3,
 9,
 8,
 8,
 9,
 9,
 9,
 9,
 3,
 0,
 9,
 9,
 0,
 3,
 3,
 0,
 3,
 9,
 0,
 3,
 0,
 9,
 8,
 9,
 0,
 3,
 9,
 0,
 0,
 0,
 0,
 3,
 9,
 0,
 0,
 9,
 0,
 8,
 9,
 3,
 8,
 3,
 8,
 0,
 8,
 0,
 0,
 0,
 8,
 8,
 0,
 8,
 3,
 0,
 9,
 8,
 8,
 0,
 9,
 0,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 3,
 0,
 0,
 9,
 0,
 3,
 0,
 0,
 3,
 8,
 8,
 0,
 8,
 9,
 9,
 8,
 0,
 9,
 8,
 8,
 9,
 8,
 9,
 3,
 8,
 0,
 3,
 9,
 0,
 0,
 0,
 3,
 8,
 8,
 8,
 9,
 0,
 9,
 9,
 3,
 9,
 9,
 3,
 0,
 9,
 8,
 9,
 3,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 0,
 8,
 3,
 8,
 3,
 9,
 9,
 8,
 0,
 3,
 0,
 3,
 8,
 3,
 0,
 9,
 9,
 8,
 3,
 8,
 3,
 8,
 9,
 0,
 8,
 3,
 8,
 0,
 3,
 8,
 8,
 0,
 8,
 0,
 0,
 0,
 3,
 0,
 0,
 3,
 9,
 9,
 0,


In [39]:
test_ans = pd.DataFrame(ans)
test_ans = test_ans.rename({0:'ans'},axis=1)
test_ans.to_csv('test_ans.csv', index=None)