# 加载数据集

In [None]:
import numpy as np
from linear_model import load_binary_dataset

In [None]:
# Fetch the train/dev arrays from the project helper load_binary_dataset().
train_data, train_label, dev_data, dev_label = load_binary_dataset()

# Print the shape of each split as a quick sanity check.
# Note: the dev split is reported under the "test" label.
print(f"train data:  {train_data.shape!s}")
print(f"train label: {train_label.shape!s}")
print(f"test data:   {dev_data.shape!s}")
print(f"test label:  {dev_label.shape!s}")

# Logistic Regression

In [None]:
# Keep only the first 1000 samples for a quick test run of the code.
train_data = train_data[:1000]
train_label = train_label[:1000]

print(f"train data:  {train_data.shape!s}")
print(f"train label: {train_label.shape!s}")

In [None]:
def sigmoid(z):
    """
    Compute the element-wise logistic sigmoid ``1 / (1 + exp(-z))``.

    The straightforward formula evaluates ``exp(-z)``, which overflows for
    large negative ``z``. This version only ever evaluates ``exp(-|z|)``,
    which stays in (0, 1], and picks the algebraically equivalent form for
    each sign of ``z``:

    * ``z >= 0``: ``1 / (1 + exp(-z))``
    * ``z <  0``: ``exp(z) / (1 + exp(z))``

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``z``: a NumPy scalar for scalar input, otherwise an
        array with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where returns a 0-d array for scalar input; indexing with an empty
    # tuple unwraps that to a scalar and leaves n-d arrays unchanged.
    return result[()]

In [None]:
def normalization(X, train=True, X_mean=None, X_std=None):
    """
    Standardize the columns of X to zero mean and unit variance.

    The ``train`` flag selects the mode:

    * ``train=True``: the per-column mean and standard deviation are
      computed from ``X`` itself (any ``X_mean`` / ``X_std`` passed in are
      ignored) and returned together with the normalized data.
    * ``train=False``: ``X`` is normalized with the supplied ``X_mean`` and
      ``X_std`` (typically the statistics returned for the training set).

    Args:
        X: 2-D array of samples (rows) by features (columns).
        train: Whether to compute the statistics from ``X``.
        X_mean: Per-column mean, shape ``(1, n_features)``; required when
            ``train`` is False.
        X_std: Per-column standard deviation, shape ``(1, n_features)``;
            required when ``train`` is False.

    Returns:
        ``(X_normalized, X_mean, X_std)`` when ``train`` is True, otherwise
        just ``X_normalized``.

    Raises:
        ValueError: If ``train`` is False and ``X_mean`` or ``X_std`` is
            missing.
    """
    if train:
        X_mean = np.mean(X, 0).reshape(1, -1)
        X_std = np.std(X, 0).reshape(1, -1)
    elif X_mean is None or X_std is None:
        raise ValueError(
            "X_mean and X_std must be provided when train is False"
        )

    # The small epsilon avoids division by zero for constant columns.
    X = (X - X_mean) / (X_std + 1e-8)

    if train:
        return X, X_mean, X_std
    return X

In [None]:
def hypothese(X, parameters):
    """
    Evaluate the logistic-regression hypothesis.

    Returns the sigmoid of the dot product of ``X`` and ``parameters``.
    """
    linear_scores = np.dot(X, parameters)
    return sigmoid(linear_scores)

In [None]:
def cal_gred(X, sub):
    """
    Compute the gradient as the product of X transposed and ``sub``.

    ``sub`` is the residual passed in by the caller; the product is a sum
    over samples and is not divided by the number of samples.
    """
    return np.dot(X.T, sub)

In [None]:
def cal_loss(y, h):
    """
    Compute the summed binary cross-entropy between labels y and predictions h.

    A small epsilon (1e-8) is added inside both logarithms so that
    predictions of exactly 0 or 1 do not produce log(0). The result is a
    sum over samples (not a mean), returned in whatever shape ``np.dot``
    produces for the inputs.
    """
    positive_term = np.dot(y.T, np.log(h + 1e-8))
    negative_term = np.dot((1 - y).T, np.log(1 - h + 1e-8))
    return -positive_term - negative_term

In [None]:
def logistic_regression_train(X, y, parameters, iteration = 10000, learning_rate = 0.0003):
    """
    Fit logistic-regression parameters with plain gradient descent.

    Each iteration evaluates the hypothesis on X, records the loss, and
    moves the parameters against the gradient scaled by ``learning_rate``.
    The loss is printed every 200 iterations.

    Args:
        X: 2-D array of training samples.
        y: Training labels, compatible in shape with the hypothesis output.
        parameters: Initial parameter array; it is rebound, not modified
            in place.
        iteration: Number of gradient-descent steps to run.
        learning_rate: Step size applied to the gradient.

    Returns:
        The parameters after the final update. The per-iteration loss
        history is kept locally and is not returned.
    """
    # Unpacking requires X.shape to have exactly two entries (X must be 2-D).
    _num_rows, _num_cols = X.shape
    loss_history = np.zeros((iteration, 1))

    for step in range(iteration):
        predictions = hypothese(X, parameters)
        loss_history[step] = cal_loss(y, predictions)
        if step % 200 == 0:
            print(f"iteration {step} loss: {loss_history[step]!s}")

        # Gradient of the loss with respect to the parameters.
        gradient = cal_gred(X, predictions - y)
        parameters = parameters - learning_rate * gradient

    return parameters

In [None]:
def predict(h):
    """
    Turn hypothesis outputs into hard 0/1 integer predictions.

    Each value of ``h`` is rounded with ``np.round`` and cast to an integer.
    ``np.round`` rounds halves to the nearest even value, so an output of
    exactly 0.5 becomes 0.

    Args:
        h: Array of hypothesis outputs (probabilities).

    Returns:
        An integer array with the same shape as ``h``.
    """
    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int`` is
    # the type it used to refer to, so the resulting dtype is unchanged.
    return np.round(h).astype(int)
    
def accuracy(Y_pred, Y_label):
    """
    Compute prediction accuracy as one minus the mean absolute difference
    between predictions and labels.
    """
    mismatch = np.abs(Y_pred - Y_label)
    return 1 - np.mean(mismatch)

In [None]:
# Standardize the training data; the statistics used are kept in
# train_mean / train_std.
train_data, train_mean, train_std = normalization(train_data)
# Normalizing the dev set with the training statistics is currently disabled:
#dev_data = normalization(dev_data, False, train_mean, train_std)
m, n = train_data.shape

# Start from all-zero parameters of shape (n, 1), then run gradient descent.
parameters = np.zeros((n, 1))
parameters = logistic_regression_train(train_data, train_label, parameters, iteration = 10000, learning_rate = 0.0003)

In [None]:
# Evaluate the trained parameters on the training data itself:
# hypothesis outputs -> rounded integer predictions -> accuracy.
h = hypothese(train_data, parameters)
p = predict(h)
acc = accuracy(p, train_label)

In [None]:
# Bare expression: presumably displays the training accuracy as the notebook cell output.
acc