In [118]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import os
import argparse

train_path = "./data/train.csv"
test_path = "./data/test.csv"

In [119]:
attr = ["age", "workclass", "fnlwgt", "education", "education-num",\
       "marital-status", "occupation", "relationship", "race", "sex",\
       "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
con_attr = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
bool_attr = ["sex"]
cat_attr = list(set(attr) - set(con_attr) - set(bool_attr))

In [120]:
train_raw_data = pd.read_csv(train_path, sep=",", names=attr)
test_raw_data = pd.read_csv(test_path, sep=",", names=attr)

In [121]:
# 数据预处理
def dataProcess_X(rawData):
    # income不在该函数处理
    # sex由于只有两个属性，可以视作bool型0-1变量
    if "income" in rawData.columns:
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)
    # 读取非连续型数据
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]
    # 读取连续型数据
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn]
    
    ObjectData = Data[listObjectColumn]
    NonObjectData = Data[listNonObjedtColumn]
    
    # 对连续型数据normalize 标准化
    NonObjectData = (NonObjectData - NonObjectData.mean()) / NonObjectData.std()
    
    # 讲sex数据插入
    NonObjectData.insert(0 ,"sex", (rawData["sex"] == " Female").astype(np.int))
    # one-hot编码 将非连续数据中的“？”作为一个独立的属性
    ObjectData = pd.get_dummies(ObjectData)
    # 合并
    Data = pd.concat([NonObjectData, ObjectData], axis=1)

    return Data

def dataProcess_Y(rawData):
    df_y = rawData['income']
    Data_y = pd.DataFrame((df_y ==' >50K').astype("int64"), columns=["income"])
    return Data_y

In [122]:
def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1-(1e-8)))

# X and Y are np.array
def _shuffle(X, Y):
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[: valid_size], Y[: valid_size]
    X_train, Y_train = X[valid_size :], Y[valid_size :]

    return X_train, Y_train, X_valid, Y_valid

def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    sigma_inv = inv(shared_sigma)
    w = np.dot((mu1-mu2), sigma_inv)
    X_t = X.T
    b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1) + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2) + np.log(float(N1)/N2)
    a = np.dot(w,X_t) + b
    y = sigmoid(a)
    y_ = np.around(y)
    result = (np.squeeze(Y) == y_)
    print('generation model 准确率 = %f' % (float(result.sum()) / result.shape[0]))
    return

def train(X_train, Y_train):
    # vaild_set_percetange = 0.1
    # X_train, Y_train, X_valid, Y_valid = split_valid_set(X, Y, vaild_set_percetange)

    #Gussian distribution parameters
    train_data_size = X_train.shape[0]
    attr_nums = X_train.shape[1]
    
    cnt1 = 0
    cnt2 = 0

    mu1 = np.zeros((attr_nums,))
    mu2 = np.zeros((attr_nums,))
    for i in range(train_data_size):
        if Y_train[i] == 1:     # >50k
            mu1 += X_train[i]
            cnt1 += 1
        else:
            mu2 += X_train[i]
            cnt2 += 1
    mu1 /= cnt1
    mu2 /= cnt2

    sigma1 = np.zeros((attr_nums, attr_nums))
    sigma2 = np.zeros((attr_nums, attr_nums))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            sigma1 += np.dot(np.transpose([X_train[i] - mu1]), [X_train[i] - mu1])
        else:
            sigma2 += np.dot(np.transpose([X_train[i] - mu2]), [X_train[i] - mu2])

    sigma1 /= cnt1
    sigma2 /= cnt2
    shared_sigma = (float(cnt1) / train_data_size) * sigma1 + (float(cnt2) / train_data_size) * sigma2

    N1 = cnt1
    N2 = cnt2

    return mu1, mu2, shared_sigma, N1, N2

In [123]:
x_train = dataProcess_X(train_raw_data).drop(['native-country_ Holand-Netherlands'], axis=1).values
x_test = dataProcess_X(test_raw_data).values
y_train = dataProcess_Y(train_raw_data).values
y_test = dataProcess_Y(test_raw_data).values

In [124]:
# 讲训练集分割初训练和验证集合 90%train 10%valid
vaild_set_percetange = 0.1
X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)

In [125]:
mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
print("在训练集的验证集上验证结果：")
valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)
mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
sigma_inv = inv(shared_sigma)
w = np.dot((mu1 - mu2), sigma_inv)
X_t = x_test.T
b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1) + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2) + np.log(
    float(N1) / N2)
a = np.dot(w, X_t) + b
y = sigmoid(a)
y_ = np.around(y).astype(np.int)
df = pd.DataFrame({"id" : np.arange(1,16282), "label": y_})
result = (np.squeeze(y_test) == y_)
print("在测试集上验证结果：")
print('测试集准确率 = %f' % (float(result.sum()) / result.shape[0]))

在训练集的验证集上验证结果：
generation model 准确率 = 0.240786
在测试集上验证结果：
测试集准确率 = 0.763774


  


In [126]:
np.sum(y_test)

3846