In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split

In [2]:
def loadData(filePath):
    data = pd.read_csv(filePath)
    if 'label' in data.columns:
        X = data.values[:,1:]
        y = data.values[:,0]
        return X, y
    else:
        X = data.values[:,:]
        return X

In [3]:
class Logistic_Regression:
    def __init__(self):
        pass
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    def fit(self, X, y, epochs = 200, lr = 0.01):
        self.X = np.concatenate((X,np.ones((X.shape[0],1))), axis = 1).T
        self.w = np.zeros((self.X.shape[0], 1))
        self.y = y.reshape((y.shape[0], 1))
        for _ in range(epochs):
            # /self.X.shape[1]，算平均，防止梯度过大(相当于减小学习率)
            self.w = self.w + lr * (self.X * (self.y.T - self.sigmoid(self.w.T.dot(self.X)))).sum(axis = 1).reshape(self.w.shape[0],1) \
            / self.X.shape[1]
    def predict(self, X):
        X_h = np.concatenate((X,np.ones((X.shape[0],1))), axis = 1).T
        prob = self.sigmoid(self.w.T.dot(X_h)).flatten()
#         y_pred = np.array(list(map(lambda x : 1 if x >= 0.5 else 0, prob)), dtype = np.int32)
#         return y_pred
        # 返回属于正例的概率
        return prob

In [4]:
def norm(X):
    mu = X.mean(axis = 0)
    sigma = X.std(axis = 0)
    sigma[sigma==0] = 1
    N_X = (X - mu) / sigma
    return N_X

In [5]:
trainFilePath = '../mnist/train.csv'
X, y = loadData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
X_train, X_valid = norm(X_train), norm(X_valid)

In [6]:
# 构造10个分类器，每个分类器用于对某一个类别的二分类
# 即第i个分类器用于计算样本属于第i个类别和不属于第i个类别的概率
# 对于每个测试样本，取二分类概率最大的类别标签为测试样本的标签

models = [Logistic_Regression() for _ in range(10)]
bin_y_train = [np.array(list(map(lambda x : 1 if x == i else 0,y_train))) for i in range(10)]
start = time.time()
for i in range(10):
    models[i].fit(X_train, bin_y_train[i], epochs = 800, lr = 0.03)
    end = time.time()
    print('Model %d has been trained. Cost %fs.' % (i, end - start))
    start = end
    

Model 0 has been trained. Cost 50.312291s.
Model 1 has been trained. Cost 53.259468s.
Model 2 has been trained. Cost 51.851577s.
Model 3 has been trained. Cost 49.838359s.
Model 4 has been trained. Cost 51.501688s.
Model 5 has been trained. Cost 54.060918s.
Model 6 has been trained. Cost 50.219145s.
Model 7 has been trained. Cost 50.706951s.
Model 8 has been trained. Cost 49.714122s.
Model 9 has been trained. Cost 50.509861s.


In [7]:
y_prob = []
for i in range(10):
    y_prob.append(models[i].predict(X_valid))
y_prob = np.array(y_prob)
y_pred = y_prob.argmax(axis = 0)
acc = (y_pred==y_valid).sum() / y_pred.shape[0]
print('acc = %f.' % (acc))

# epochs = 200, lr = 0.01, acc = 0.645476
# epochs = 400, lr = 0.01, acc = 0.759048
# epochs = 200, lr = 0.05, acc = 0.804881
# epochs = 400, lr = 0.05, acc = 0.814167
# epochs = 800, lr = 0.03, acc = 0.886429

acc = 0.886429.


In [12]:
testFilePath = '../mnist/test.csv'
X_test = loadData(testFilePath)
X_test = norm(X_test)
y_prob = []
for i in range(10):
    y_prob.append(models[i].predict(X_test))
y_prob = np.array(y_prob)
y_pred = y_prob.argmax(axis = 0)
ans = pd.DataFrame({'ImageId' : np.arange(1, y_pred.shape[0] + 1), 'Label' : y_pred.astype(np.int32)})
ans.to_csv('./result_mnist.csv', index = False)
# # epochs = 800, lr = 0.03, kaggle准确率 88.221%