In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

class LogisticRegression:
    """Binary logistic regression trained with per-sample gradient steps.

    Parameters
    ----------
    mode : {"BGD", "SGD"}
        Training schedule. "BGD" makes ``max_iter`` ordered passes over the
        data and updates the weights after every single sample; it is a
        sequential per-sample sweep, not a summed full-batch gradient.
        "SGD" performs ``max_iter`` updates, each on one sample drawn
        uniformly at random.
    max_iter : int
        Number of passes ("BGD") or number of single-sample updates ("SGD").
    learning_rate : float
        Base step size. The step actually applied is
        ``learning_rate / (n_features + 1)``, see ``gradient``.
    normalization : bool
        When truthy, an L2 penalty on every weight except the bias is added
        to each update.
    reg_lambda : float
        Strength of the L2 penalty; only used when ``normalization`` is
        truthy. The default of 1.0 reproduces the previously fixed strength.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features + 1, 1)
        Bias (row 0) followed by one coefficient per feature. Created by
        ``fit`` and initialised to ones.
    """

    _MODES = ("BGD", "SGD")

    def __init__(self, mode="BGD", max_iter=200, learning_rate=0.01, normalization=False, reg_lambda=1.0):
        self.mode = mode
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.normalization = normalization
        self.reg_lambda = reg_lambda

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        # exp(-log(1 + exp(-x))) is the same value, but logaddexp evaluates the
        # log term without overflowing when x is a large negative number.
        return np.exp(-np.logaddexp(0.0, -np.asarray(x, dtype=float)))

    def dataMatrix(self, X):
        """Return ``X`` as a float matrix with a leading column of ones (bias).

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional (samples x features).
        """
        features = np.asarray(X, dtype=float)
        if features.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features); got %d dimension(s)" % features.ndim
            )
        bias = np.ones((features.shape[0], 1))
        return np.hstack((bias, features))

    def gradient(self, x, y):
        """Return the weight update for one sample.

        ``x`` is one row of the design matrix (bias term included) and ``y``
        its 0/1 label. The returned column vector is already scaled by the
        step size, so callers subtract it from ``self.weights`` directly.
        """
        x = np.asarray(x, dtype=float)
        # len(x) is the number of weights, not the number of samples, so the
        # effective step is learning_rate / (n_features + 1).
        step_scale = self.learning_rate / len(x)
        error = self.sigmoid(np.dot(x, self.weights)) - y
        step = step_scale * error * x.reshape(-1, 1)
        if self.normalization:
            penalty = step_scale * self.reg_lambda * self.weights
            penalty[0] = 0  # the bias term is not regularised
            step += penalty
        return step

    # "Batch" mode: ordered passes over the data, one update per sample.
    def BGD(self, X, y):
        """Run ``max_iter`` ordered passes over ``(X, y)``, updating per sample."""
        for _ in range(self.max_iter):
            for x, v in zip(X, y):
                self.weights -= self.gradient(x, v)

    # Stochastic mode: every iteration picks one sample index at random.
    def SGD(self, X, y):
        """Run ``max_iter`` updates, each on one uniformly drawn sample.

        Indices come from NumPy's global RNG; call ``np.random.seed`` before
        fitting to make a run repeatable.
        """
        n_samples = len(X)
        for _ in range(self.max_iter):
            # randint draws an integer in [0, n_samples) directly.
            idx = np.random.randint(n_samples)
            self.weights -= self.gradient(X[idx], y[idx])

    def fit(self, X, y):
        """Train on features ``X`` (n_samples x n_features) and 0/1 labels ``y``.

        Resets ``self.weights`` to ones before training.

        Raises
        ------
        ValueError
            If ``mode`` is not "BGD" or "SGD", if ``X`` is not 2-D, if ``X``
            and ``y`` differ in length, or if there are no samples.
        """
        if self.mode not in self._MODES:
            raise ValueError(
                "unknown mode %r; expected one of %s" % (self.mode, ", ".join(self._MODES))
            )
        X = self.dataMatrix(X)
        # Positional array so y[idx] is index-based even for a pandas Series.
        y = np.asarray(y, dtype=float).ravel()
        if len(X) != len(y):
            raise ValueError(
                "X and y have different lengths: %d != %d" % (len(X), len(y))
            )
        if len(X) == 0:
            raise ValueError("cannot fit on an empty data set")
        self.weights = np.ones((X.shape[1], 1))
        if self.mode == "BGD":
            self.BGD(X, y)
        else:
            self.SGD(X, y)

    def predict(self, X):
        """Return predicted probabilities of the positive class.

        The result has shape ``(n_samples, 1)`` and holds probabilities in
        [0, 1], not thresholded labels. ``fit`` must have been called first.
        """
        return self.sigmoid(self.dataMatrix(X).dot(self.weights))


In [8]:
# Load the labelled training data. The two vaccine columns are the targets;
# every remaining column is used as a feature.
train_data = pd.read_csv("./TrainData.csv")

target_columns = ["h1n1_vaccine", "seasonal_vaccine"]
X = train_data.drop(columns=target_columns).to_numpy()
y = train_data[target_columns].to_numpy()

# Hold out 10% of the rows for evaluation. random_state pins the shuffle so a
# fresh "Restart & Run All" reproduces the same split and comparable scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Column 0 is h1n1_vaccine and column 1 is seasonal_vaccine (order of target_columns).
y1_train, y2_train = y_train[:, 0], y_train[:, 1]
y1_test, y2_test = y_test[:, 0], y_test[:, 1]

In [3]:
# Seed NumPy's global RNG: the SGD mode of LogisticRegression draws its sample
# indices from np.random, so without a seed its scores differ on every run.
np.random.seed(0)

# One unregularized model per (optimizer, target) pair. The *_1 models are
# trained on h1n1_vaccine, the *_2 models on seasonal_vaccine.
model_bgd_1 = LogisticRegression(mode="BGD", max_iter=30)
model_bgd_2 = LogisticRegression(mode="BGD", max_iter=30)
model_sgd_1 = LogisticRegression(mode="SGD", max_iter=500000)
model_sgd_2 = LogisticRegression(mode="SGD", max_iter=500000)

# Fit in a fixed order so the seeded random stream is consumed identically each run.
for model, target in (
    (model_bgd_1, y1_train),
    (model_bgd_2, y2_train),
    (model_sgd_1, y1_train),
    (model_sgd_2, y2_train),
):
    model.fit(X_train, target)

# predict() returns positive-class probabilities, which is what ROC AUC needs.
y1_bgd_pre = model_bgd_1.predict(X_test)
y2_bgd_pre = model_bgd_2.predict(X_test)
y1_sgd_pre = model_sgd_1.predict(X_test)
y2_sgd_pre = model_sgd_2.predict(X_test)

for title, h1n1_scores, seasonal_scores in (
    ("BGD:", y1_bgd_pre, y2_bgd_pre),
    ("SGD:", y1_sgd_pre, y2_sgd_pre),
):
    print(title)
    print("h1n1_vaccine:", roc_auc_score(y1_test, h1n1_scores))
    print("seasonal_vaccine:", roc_auc_score(y2_test, seasonal_scores))

# Observation from the recorded run: SGD reaches a slightly lower ROC AUC than
# BGD on both targets.

BGD:
h1n1_vaccine: 0.8218116045424405
seasonal_vaccine: 0.8336868753295134
SGD:
h1n1_vaccine: 0.8130037014344107
seasonal_vaccine: 0.8249697325333953


In [4]:
# Seed NumPy's global RNG: the SGD mode of LogisticRegression draws its sample
# indices from np.random, so without a seed its scores differ on every run.
np.random.seed(0)

# Same four (optimizer, target) models as before, now with the L2 penalty
# switched on via normalization=True.
model_bgd_n_1 = LogisticRegression(mode="BGD", max_iter=30, normalization=True)
model_bgd_n_2 = LogisticRegression(mode="BGD", max_iter=30, normalization=True)
model_sgd_n_1 = LogisticRegression(mode="SGD", max_iter=500000, normalization=True)
model_sgd_n_2 = LogisticRegression(mode="SGD", max_iter=500000, normalization=True)

# Fit in a fixed order so the seeded random stream is consumed identically each run.
for model, target in (
    (model_bgd_n_1, y1_train),
    (model_bgd_n_2, y2_train),
    (model_sgd_n_1, y1_train),
    (model_sgd_n_2, y2_train),
):
    model.fit(X_train, target)

# NOTE: these names are reused from the unregularized experiment, so after this
# cell they hold the regularized models' predictions.
y1_bgd_pre = model_bgd_n_1.predict(X_test)
y2_bgd_pre = model_bgd_n_2.predict(X_test)
y1_sgd_pre = model_sgd_n_1.predict(X_test)
y2_sgd_pre = model_sgd_n_2.predict(X_test)

for title, h1n1_scores, seasonal_scores in (
    ("BGD:", y1_bgd_pre, y2_bgd_pre),
    ("SGD:", y1_sgd_pre, y2_sgd_pre),
):
    print(title)
    print("h1n1_vaccine:", roc_auc_score(y1_test, h1n1_scores))
    print("seasonal_vaccine:", roc_auc_score(y2_test, seasonal_scores))

# Observation from the recorded run: with regularization enabled, the held-out
# ROC AUC dropped for every model. The original note reads this as reduced
# overfitting; no training-set score is computed here to confirm that.
# TODO: compare train vs. test AUC before drawing that conclusion.

BGD:
h1n1_vaccine: 0.7090074512862492
seasonal_vaccine: 0.7323300003822364
SGD:
h1n1_vaccine: 0.729467871162362
seasonal_vaccine: 0.7664131165528577


In [5]:
# Build the submission file from the unregularized BGD models.
test_features = pd.read_csv("./TestFeatures.csv")

# Named respondent_ids rather than `id` so the builtin id() is not shadowed.
respondent_ids = test_features["respondent_id"].to_numpy()
# NOTE(review): assumes the remaining columns match the training features in
# number and order; confirm against TrainData.csv.
X_features = test_features.drop(columns=["respondent_id"]).to_numpy()

# predict() returns an (n, 1) column of probabilities; flatten it to 1-D so it
# can be used as a DataFrame column. Despite the "_label" names these are
# probabilities, not thresholded 0/1 labels.
y1_label = model_bgd_1.predict(X_features).ravel()
y2_label = model_bgd_2.predict(X_features).ravel()

output = pd.DataFrame(
    {
        "respondent_id": respondent_ids,
        "h1n1_vaccine": y1_label,
        "seasonal_vaccine": y2_label,
    }
)
output.to_csv("./submission.csv", index=False)