# 加载数据集

In [6]:
import numpy as np
from linear_model import load_binary_dataset

In [None]:
# Unpack the four arrays returned by load_binary_dataset().
train_data, train_label, dev_data, dev_label = load_binary_dataset()

# Shape report as a quick sanity check. Note that the lines labelled
# "test" print the shapes of the dev_* arrays.
print(f"train data:  {train_data.shape}")
print(f"train label: {train_label.shape}")
print(f"test data:   {dev_data.shape}")
print(f"test label:  {dev_label.shape}")

# Logistic Regression

In [None]:
# Keep only the first 1000 training samples for now, to test the code quickly.
train_data, train_label = train_data[:1000], train_label[:1000]

print(f"train data:  {train_data.shape}")
print(f"train label: {train_label.shape}")

In [None]:
def sigmoid(z):
    """
    Apply the logistic function 1 / (1 + exp(-z)) element-wise.

    z may be a scalar or a NumPy array; the result has the same shape.
    """
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

In [None]:
def normalization(X, train = True, X_mean = None, X_std = None):
    """
    Standardize every column of X to zero mean and unit standard deviation.

    The mode is selected by ``train`` (not by whether X_mean is None):

    * train is true:  the per-column mean and standard deviation are computed
      from X itself (any X_mean / X_std passed in are ignored) and returned
      together with the standardized data.
    * train is false: the supplied X_mean / X_std (normally the statistics of
      the training set) are applied to X.

    Args:
        X:      2-D array, one sample per row.
        train:  whether X is the training set.
        X_mean: per-column means, shape (1, n); required when train is false.
        X_std:  per-column standard deviations, shape (1, n); required when
                train is false.

    Returns:
        (X_normalized, X_mean, X_std) when train is true, otherwise just
        X_normalized.

    Raises:
        ValueError: if train is false and X_mean or X_std is missing.
    """
    if train:
        X_mean = np.mean(X, 0).reshape(1, -1)
        X_std = np.std(X, 0).reshape(1, -1)
    elif X_mean is None or X_std is None:
        raise ValueError("X_mean and X_std must be provided when train is False")

    # The small constant keeps constant columns (std == 0) from dividing by zero.
    X = (X - X_mean) / (X_std + 1e-8)

    if train:
        return X, X_mean, X_std
    return X

In [None]:
def hypothese(X, parameters):
    """
    Logistic-regression hypothesis: sigmoid of the linear score X · parameters.
    """
    linear_score = np.dot(X, parameters)
    return sigmoid(linear_score)

In [None]:
def cal_gred(X, sub):
    """
    Return X^T · sub.

    The training loop passes sub = h - y, which makes the result the
    (summed, not averaged) gradient used for the parameter update.
    """
    return np.dot(X.T, sub)

In [None]:
def cal_loss(y, h):
    """
    Binary cross-entropy between labels y and predicted probabilities h,
    summed (not averaged) over the samples.

    A small constant is added inside each logarithm so that h == 0 or
    h == 1 does not produce log(0).
    """
    eps = 1e-8
    positive_term = np.dot(y.T, np.log(h + eps))
    negative_term = np.dot((1 - y).T, np.log(1 - h + eps))
    return -positive_term - negative_term

In [None]:
def shuffle(X, Y):
    """
    Return copies of X and Y with their rows reordered by one shared random
    permutation, so row i of X still corresponds to row i of Y.

    Randomness comes from NumPy's global random state.
    """
    order = np.arange(X.shape[0])
    np.random.shuffle(order)
    X_shuffled = X[order]
    Y_shuffled = Y[order]
    return (X_shuffled, Y_shuffled)

In [None]:
def split_to_miniBatch(X, Y, batchSize = 64):
    """
    Randomly shuffle the samples and cut them into mini-batches.

    X and Y are shuffled together (rows stay paired), then sliced into
    consecutive chunks of batchSize rows. When the number of samples is not
    a multiple of batchSize, the final batch holds the remaining rows; no
    empty batch is ever produced.

    Args:
        X:         sample array, one sample per row.
        Y:         label array with the same number of rows as X.
        batchSize: number of samples per batch; must be positive.

    Returns:
        A list of (batch_X, batch_Y) tuples covering every sample exactly once.

    Raises:
        ValueError: if batchSize is not positive.
    """
    if batchSize <= 0:
        raise ValueError("batchSize must be a positive integer")

    X_shuffled, Y_shuffled = shuffle(X, Y)    # shuffle rows of X and Y together
    m = X.shape[0]                            # number of samples

    # Stepping by batchSize yields every full batch plus, if m is not a
    # multiple of batchSize, one shorter trailing batch.
    return [
        (X_shuffled[start : start + batchSize], Y_shuffled[start : start + batchSize])
        for start in range(0, m, batchSize)
    ]

In [None]:
def logistic_regression_train(X, y, parameters, iteration = 1000, learning_rate = 0.0003, train_method = "batch", batchSize = 64):
    """
    Fit logistic-regression parameters by gradient descent.

    Each update is ``parameters -= learning_rate * X_b^T (h_b - y_b)`` where
    (X_b, y_b) is the data used for that update; the gradient is summed over
    the samples, not averaged.

    Args:
        X:             design matrix, one sample per row.
        y:             labels; assumed to be a column of shape (m, 1) so that
                       it lines up with the hypothesis output - TODO confirm
                       against load_binary_dataset.
        parameters:    initial parameter column; it is not modified in place.
        iteration:     number of passes over the data.
        learning_rate: step size of each update.
        train_method:  "miniBatch" - the data is split into batches once and
                       the same batches are reused on every pass;
                       "stochastic" - one sample per update, in row order;
                       anything else - full-batch gradient descent.
        batchSize:     batch size, used only by "miniBatch".

    Returns:
        The trained parameters.

    Side effects:
        Prints the loss on the full (X, y) every 100 updates.
    """
    m = X.shape[0]

    # Number of parameter updates that make up one pass over the data.
    if train_method == "miniBatch":
        miniBatchs = split_to_miniBatch(X, y, batchSize)
        subIteration = len(miniBatchs)
    elif train_method == "stochastic":
        subIteration = m
    else:
        subIteration = 1

    for ite in range(iteration):
        for subIte in range(subIteration):
            step = ite * subIteration + subIte    # global update counter

            # Select the data used for this update.
            if train_method == "miniBatch":
                batch_X, batch_y = miniBatchs[subIte]
            elif train_method == "stochastic":
                batch_X, batch_y = (X[subIte, : ].reshape(1, -1), y[subIte].reshape(-1, 1))
            else:
                batch_X, batch_y = (X, y)

            h = hypothese(batch_X, parameters)
            gred = cal_gred(batch_X, h - batch_y)

            if step % 100 == 0:
                # Loss over the whole data set with the pre-update parameters.
                # Kept as a 1-element float array so the printed text has the
                # same "[value]" form as before.
                loss = np.asarray(cal_loss(y, hypothese(X, parameters)), dtype=np.float64).reshape(1)
                print("iteration "+ str(step) +" loss: " +  str(loss))
            parameters = parameters - learning_rate * gred

    return parameters

In [None]:
def predict(h):
    """
    Turn predicted probabilities into integer class labels.

    Each value of h is rounded with np.round (values exactly halfway, such
    as 0.5, round to the nearest even integer, i.e. 0) and cast to int.
    """
    # The builtin int is what the removed np.int alias referred to.
    return np.round(h).astype(int)

In [None]:
def accuracy(Y_pred, Y_label):
    """
    Return 1 minus the mean absolute difference between predictions and labels.

    When both arrays contain only 0/1 values this is the fraction of
    predictions that match their label.
    """
    mismatch = np.abs(Y_pred - Y_label)
    error_rate = np.mean(mismatch)
    return 1 - error_rate

In [None]:
def add_col(X):
    """
    Prepend a column of ones to the 2-D array X.

    The extra column multiplies the first parameter, giving the model a
    bias (intercept) term.
    """
    row_count, _ = X.shape
    bias_column = np.ones((row_count, 1))
    return np.concatenate((bias_column, X), axis=1)

In [None]:
# Standardize the training features and keep the training mean / std so the
# same statistics can be reused on other data.
train_data, train_mean, train_std = normalization(train_data)
# The dev set is normalized with these statistics in a later cell.
# Prepend the column of ones (bias term) and record the resulting shape:
# m = number of samples, n = number of columns including the bias column.
train_data = add_col(train_data)
m, n = train_data.shape

In [None]:
# Start from an all-zero parameter column, one entry per column of
# train_data (including the column of ones added by add_col).
parameters = np.zeros((n, 1))

In [None]:
# Train with full-batch gradient descent: 1000 iterations, learning rate 1e-5.
parameters = logistic_regression_train(train_data, train_label, parameters, iteration = 1000, learning_rate = 0.00001, train_method = "batch")

# 训练集上的正确率

In [None]:
# Accuracy on the training set: predicted probabilities -> integer labels ->
# accuracy against train_label.
h = hypothese(train_data, parameters)
p = predict(h)
acc = accuracy(p, train_label)
# Bare expression so the notebook displays the value as the cell output.
acc

# 验证集上的正确率

In [None]:
# Normalize the dev set with the mean / std computed from the training set.
dev_data = normalization(dev_data, False, train_mean, train_std)

In [None]:
# Prepend the column of ones so dev_data has the same columns as train_data.
dev_data = add_col(dev_data)

In [None]:
# Accuracy on the dev set, computed the same way as for the training set.
h = hypothese(dev_data, parameters)
p = predict(h)
acc = accuracy(p, dev_label)
# Bare expression so the notebook displays the value as the cell output.
acc