In [2653]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2654]:
# Load the labelled training data from disk and preview the first rows.
pd_train_origin = pd.read_csv('data/train.csv')
pd_train_origin.head(3)
# The training set includes the `label` column alongside the pixel columns.

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2655]:
# pd_test = pd.read_csv('data/test.csv')
# pd_test.head(3)
# the test set does not include a label column

In [2656]:
# Split the labelled rows 80% / 20% into training and validation sets;
# random_state=0 keeps the split identical across runs.
pd_train, pd_validate = train_test_split(pd_train_origin, train_size=0.8, random_state=0)

In [2657]:
# Preview the training split (row order is shuffled by train_test_split).
pd_train.head(3)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
1161,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2355,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1831,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2658]:
# Non-null count per column: confirms the size of the training split.
pd_train.count()

label       3200
pixel0      3200
pixel1      3200
pixel2      3200
pixel3      3200
            ... 
pixel779    3200
pixel780    3200
pixel781    3200
pixel782    3200
pixel783    3200
Length: 785, dtype: int64

In [2659]:
# Preview the validation split.
pd_validate.head(3)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
2230,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
668,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3616,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2660]:
# Non-null count per column: confirms the size of the validation split.
pd_validate.count()

label       800
pixel0      800
pixel1      800
pixel2      800
pixel3      800
           ... 
pixel779    800
pixel780    800
pixel781    800
pixel782    800
pixel783    800
Length: 785, dtype: int64

In [2661]:
def separate_label(data):
    """Split a labelled table into its feature columns and its label column.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``; must contain a ``label`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without ``label`` and ``y`` is a
        one-column DataFrame holding only ``label``.
    """
    frame = pd.DataFrame(data)
    features = frame.drop(columns=['label'])
    target = frame['label'].to_frame()
    return features, target

In [2662]:
# Error function: binary cross-entropy loss.
# Measures how far the estimate is from the truth, summed over all entries.
# a: estimated value of y
# y: true value of y
def cross_entropy(a, y):
    """Return the summed binary cross-entropy between estimate ``a`` and truth ``y``.

    NaN / infinite terms (e.g. from ``log(0)``) are passed through
    ``np.nan_to_num`` before summing, so the result is always finite.
    """
    positive_term = -y * np.log(a)
    negative_term = (1 - y) * np.log(1 - a)
    per_entry_loss = np.nan_to_num(positive_term - negative_term)
    return np.sum(per_entry_loss)


In [2663]:
# Activation function: sigmoid.
# 1 / (1 + e^(-n)), used to turn the linear score n into an estimate of y in [0, 1].
def sigmoid(n):
    """Return the logistic sigmoid ``1 / (1 + exp(-n))`` element-wise.

    Evaluated as ``exp(-log(1 + exp(-n)))`` through ``np.logaddexp`` so that
    large positive or negative scores saturate to 1 / 0 without the overflow
    and divide-by-zero warnings the naive ``1 / (1 + 1 / exp(n))`` form emits.

    Parameters
    ----------
    n : scalar, ndarray or np.matrix of linear scores.

    Returns
    -------
    Same container type/shape as ``n`` (ufuncs preserve ``np.matrix``), with
    values in [0, 1].
    """
    # log(1 + exp(-n)) computed stably, then exponentiated with a negative sign.
    return np.exp(-np.logaddexp(0, -n))


In [2664]:
def calculate_mse(y, a):
    """Return the mean squared error between truth ``y`` and estimate ``a``."""
    residual = np.subtract(y, a)
    squared_residual = np.square(residual)
    return squared_residual.mean()

In [2665]:
def generate_weight(seed, n_features=784):
    """Create a reproducible random column vector of initial weights.

    Parameters
    ----------
    seed : value passed to ``np.random.seed``; note this reseeds NumPy's
        global random state as a side effect.
    n_features : number of weights to draw. Defaults to 784, the number of
        pixel columns used throughout this notebook.

    Returns
    -------
    np.matrix
        Shape ``(n_features, 1)``, entries drawn from a standard normal.
    """
    np.random.seed(seed)  # fix the seed so the initial weights are reproducible
    weights = np.random.randn(n_features, 1)
    return np.matrix(weights)

In [2666]:
def generate_bias(seed):
    """Return a reproducible initial bias drawn uniformly from [0, 1).

    Reseeds NumPy's global random state with ``seed`` before drawing.
    """
    np.random.seed(seed)  # fix the seed so the initial bias is reproducible
    initial_bias = np.random.rand()
    return initial_bias

In [2667]:
def train_logistic_regression(data, epoch, stop_error, error_function, learning_rate, validate_data=None):
    """Train a binary (label 5 vs. label 2) logistic regression by full-batch gradient descent.

    Each epoch runs one forward pass over all of ``data``, checks the stop
    criterion, applies one gradient step, and scores the updated parameters
    with ``validate_logistic_regression``. The parameters that achieved the
    highest validation accuracy so far are remembered and returned alongside
    the final ones.

    Parameters
    ----------
    data : DataFrame with a ``label`` column (5 -> 1, 2 -> 0) and the pixel
        columns; the number of pixel columns must match the 784 weights
        produced by ``generate_weight``.
    epoch : maximum number of epochs to run.
    stop_error : training stops as soon as the training MSE is below this value.
    error_function : only echoed in the log output; the stop criterion is
        always the MSE regardless of this value.
    learning_rate : gradient-descent step size.
    validate_data : optional labelled DataFrame used for the per-epoch
        accuracy. Defaults to the notebook-level ``pd_validate``.

    Returns
    -------
    tuple
        ``(max_acc_weight, max_acc_bias, weight, bias)`` - best-validation
        parameters followed by the final parameters. If no epoch ever
        improved on an accuracy of 0, the "best" parameters are the initial ones.
    """
    if validate_data is None:
        # Backward-compatible default: the notebook-level validation split.
        validate_data = pd_validate

    train_X, train_y = separate_label(data)
    sample_count = len(data)

    weight = generate_weight(seed=3)
    bias = generate_bias(seed=1)
    print('|  LIMIT EPOCH: {}\n|  STOP ERROR: {}\n|  ERROR FUNCTION: {}\n|  LEARNING RATE: {}'.format(epoch, stop_error, error_function, learning_rate))
    print('|  TRAIN DATA LENGTH: {}'.format(sample_count))
    print('|  PIXELS NUMBERS: {}'.format(len(train_X.columns)))

    print('\n\n======================== START TRAINING ========================\n\n')
    # Scale pixels to [0, 1]; after the transpose each column of x is one picture.
    x = np.asmatrix(np.array(train_X / 255)).T
    # Map 5 to 1 and 2 to 0 for binary classification; y is a 1 x N row.
    y = np.asmatrix(train_y['label'].map({5: 1, 2: 0}))

    # Initialise everything the summary below reads, so that a run that stops
    # before the first update (or never improves) still reports and returns cleanly.
    max_acc = 0.0
    max_acc_epoch = None
    max_acc_weight = weight
    max_acc_bias = bias
    acc = None
    mse = None
    current_epoch = None
    stop_reason = 'reach max epoch'

    for current_epoch in range(epoch):  # one full-batch update per epoch
        n = weight.T * x + bias
        a = sigmoid(n)
        mse = calculate_mse(y, a)
        if mse < stop_error:
            stop_reason = 'stop-error'
            break

        # (a - y) is the gradient of the mean cross-entropy w.r.t. the linear score.
        error = a - y
        dw = (x * error.T) / sample_count
        # ones(1, N) * error.T is a matrix product, i.e. the sum of the errors.
        db = np.ones((1, sample_count)) * error.T / sample_count
        db = np.array(db)[0][0]
        weight = weight - learning_rate * dw
        bias = bias - learning_rate * db

        acc = validate_logistic_regression(validate_data, weight=weight, bias=bias)
        if acc > max_acc:
            max_acc = acc
            max_acc_epoch = current_epoch
            max_acc_weight = weight
            max_acc_bias = bias

    print('\n\n======================== STOP TRAINING ========================\n\n')
    # Report why the loop actually ended, then separately note whether an
    # earlier epoch scored better than the last one.
    print('|  STOP REASON: {}'.format(stop_reason))
    if acc is not None and max_acc > acc:
        print('|  BEST ACC: {}, at epoch: {} (last epoch acc: {})'.format(max_acc, max_acc_epoch, acc))
    print('|  STOP EPOCH: {}'.format(current_epoch))
    print('|  mse: {}'.format(mse))
    print('|  max_acc: {}'.format(max_acc))
    print('|  max_acc_epoch: {}'.format(max_acc_epoch))
    print('|  LIMIT EPOCH: {}\n|  STOP ERROR: {}\n|  ERROR FUNCTION: {}\n|  LEARNING RATE: {}'.format(epoch, stop_error, error_function, learning_rate))
    print('|  TRAIN DATA LENGTH: {}'.format(sample_count))
    return max_acc_weight, max_acc_bias, weight, bias

In [2668]:
# Train on the 80% split. NOTE: train_logistic_regression calls
# validate_logistic_regression (which in turn uses calculate_acc), and both are
# defined in later cells, so on a fresh kernel those cells must run before this one.
max_acc_weight, max_acc_bias, weight, bias = train_logistic_regression(data=pd_train, epoch=50000, stop_error=0.005, error_function='mse', learning_rate=0.2)

|  LIMIT EPOCH: 50000
|  STOP ERROR: 0.005
|  ERROR FUNCTION: mse
|  LEARNING RATE: 0.2
|  TRAIN DATA LENGTH: 3200
|  PIXELS NUMBERS: 784








|  STOP REASON: early-stop, exist best acc: 98.25, at epoch: 5866
|  STOP EPOCH: 9215
|  mse: 0.0049998223273404595
|  max_acc: 98.25
|  max_acc_epoch: 5866
|  LIMIT EPOCH: 50000
|  STOP ERROR: 0.005
|  ERROR FUNCTION: mse
|  LEARNING RATE: 0.2
|  TRAIN DATA LENGTH: 3200


In [2669]:
def calculate_acc(real, estimate):
    """Return the percentage of positions where ``estimate`` equals ``real``.

    The comparison is vectorized and positional, so lists, 1-D arrays and
    pandas Series (whatever their index) are all accepted.

    Parameters
    ----------
    real : 1-D sequence of true labels.
    estimate : 1-D sequence of predicted labels, at least as long as ``real``;
        entries beyond ``len(real)`` are ignored.

    Returns
    -------
    Accuracy in percent (0-100), rounded to 4 decimals.

    Raises
    ------
    IndexError
        If ``estimate`` has fewer entries than ``real``.
    ZeroDivisionError
        If ``real`` is empty.
    """
    real_values = np.asarray(real)
    estimate_values = np.asarray(estimate)
    sample_count = len(real_values)
    if len(estimate_values) < sample_count:
        raise IndexError('estimate has fewer entries than real')
    # Count matches in one vectorized pass instead of a Python-level loop.
    true_count = int(np.count_nonzero(real_values == estimate_values[:sample_count]))
    acc = np.round(true_count / sample_count * 100.0, decimals=4)
    return acc

In [2670]:
def validate_logistic_regression(data, weight, bias):
    """Score a trained (weight, bias) pair on labelled data.

    Parameters
    ----------
    data : DataFrame with a ``label`` column (5 -> 1, 2 -> 0) plus the pixel
        columns, one row per picture.
    weight : column vector of weights, one per pixel column.
    bias : scalar bias.

    Returns
    -------
    Accuracy in percent, as computed by ``calculate_acc``.
    """
    validate_X, validate_y = separate_label(data)

    # Scale pixels to [0, 1]; after the transpose each column of x is one picture.
    x = np.asmatrix(np.array(validate_X / 255)).T
    # Map 5 to 1 and 2 to 0 for binary classification; y is a 1 x N row.
    y = np.asmatrix(validate_y['label'].map({5: 1, 2: 0}))

    n = weight.T * x + bias
    # np.round rounds an exact .5 to the nearest even integer, so a probability
    # of exactly 0.5 is classified as 0.
    estimate = np.round(sigmoid(n)).astype(int)

    # Both are 1 x N matrices; take the single row as a flat array.
    y = np.array(y)[0]
    estimate = np.array(estimate)[0]
    return calculate_acc(y, estimate=estimate)
    

In [2671]:
# Validation accuracy (in percent) of the best-validation parameters returned by training.
acc = validate_logistic_regression(pd_validate, weight=max_acc_weight, bias=max_acc_bias)

In [2677]:
# NOTE(review): `bias` / `weight` are the final-epoch parameters, whereas `acc` was
# computed with the best-validation parameters (max_acc_weight / max_acc_bias) -
# confirm which set is meant to be reported here.
print('|  bias: {}'.format(bias))
print('|  weight: {}'.format(weight))
# '正确率' is Chinese for "accuracy"; `acc` is already expressed as a percentage.
print('正确率: {}%'.format(acc))


|  bias: 2.4682333049950143
|  weight: [[ 1.78862847e+00]
 [ 4.36509851e-01]
 [ 9.64974681e-02]
 [-1.86349270e+00]
 [-2.77388203e-01]
 [-3.54758979e-01]
 [-8.27414815e-02]
 [-6.27000677e-01]
 [-4.38181690e-02]
 [-4.77218030e-01]
 [-1.31386475e+00]
 [ 8.84622380e-01]
 [ 8.81315045e-01]
 [ 1.70950832e+00]
 [ 4.99689017e-02]
 [-4.04680112e-01]
 [-5.45359948e-01]
 [-1.54647732e+00]
 [ 9.82367434e-01]
 [-1.10106763e+00]
 [-1.18504653e+00]
 [-2.05649899e-01]
 [ 1.48614836e+00]
 [ 2.36716267e-01]
 [-1.02378514e+00]
 [-7.12993200e-01]
 [ 6.25244966e-01]
 [-1.60513363e-01]
 [-7.68836350e-01]
 [-2.30030722e-01]
 [ 7.45056266e-01]
 [ 1.97611078e+00]
 [-1.24412333e+00]
 [-6.26416911e-01]
 [-8.03766095e-01]
 [-2.41908317e+00]
 [-9.23792058e-01]
 [-1.02387580e+00]
 [ 1.12397793e+00]
 [-1.31914258e-01]
 [-1.62329131e+00]
 [ 6.46584188e-01]
 [-3.56369048e-01]
 [-1.74320308e+00]
 [-5.96779987e-01]
 [-5.88687582e-01]
 [-8.73892852e-01]
 [ 2.97138154e-02]
 [-2.24825777e+00]
 [-2.67761865e-01]
 [ 1.013183

In [2678]:
def ans_generate(data, weight, bias):
    """Predict binary classes (0 / 1) for unlabelled pixel data.

    Parameters
    ----------
    data : DataFrame (or array-like) holding only pixel columns, one row per
        picture; there must be one column per entry of ``weight``.
    weight : column vector of weights.
    bias : scalar bias.

    Returns
    -------
    pd.DataFrame
        A single column named ``0`` with one 0/1 prediction per input row.
    """
    # Scale pixels to [0, 1]; after the transpose each column of x is one picture.
    x = np.asmatrix(np.array(data / 255)).T

    n = weight.T * x + bias
    # np.round rounds an exact .5 to the nearest even integer, so a probability
    # of exactly 0.5 is classified as 0.
    estimate = np.round(sigmoid(n)).astype(int)
    # estimate is a 1 x N matrix; take its single row as a flat array.
    estimate = np.array(estimate)[0]
    ans = pd.DataFrame(estimate)

    return ans

In [2679]:
# Load the unlabelled test data (pixel columns only, per the preview below).
pd_test_origin = pd.read_csv('data/test.csv')
pd_test_origin

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2680]:
# Predict the test set with the best-validation parameters (0/1 classes in column 0).
ans = ans_generate(pd_test_origin, weight=max_acc_weight, bias=max_acc_bias)
ans = pd.DataFrame(ans[0].map({1:5, 0:2})) # map class 1 back to digit 5 and class 0 back to digit 2

# Name the single prediction column 'ans' for the output file.
ans = ans.rename({0:'ans'},axis=1)
ans

Unnamed: 0,ans
0,2
1,5
2,5
3,5
4,5
...,...
995,2
996,2
997,5
998,5


In [2681]:
# Write the predictions to disk; index=None is presumably meant to omit the
# row-index column - verify against the expected submission format.
ans.to_csv('test_ans.csv', index=None)