In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from numpy.linalg import inv
from scipy.stats import multivariate_normal


In [50]:
# 读取数据集
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# 定义离散特征和连续特征
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
continuous_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# 对标签进行二分类
data['income'] = data['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

# 处理离散特征（独热编码）转换为DataFrame并添加列名
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_train = encoder.fit_transform(data[categorical_features])
encoded_categorical_df_train = pd.DataFrame(encoded_categorical_train, columns=encoder.get_feature_names_out(categorical_features))

encoded_categorical_test = encoder.transform(data_test[categorical_features])
encoded_categorical_df_test = pd.DataFrame(encoded_categorical_test, columns=encoder.get_feature_names_out(categorical_features))

# 保留连续特征
continuous_df_train = data[continuous_features]
continuous_df_test = data_test[continuous_features]

# 合并独热编码后的离散特征和连续特征
X_train = pd.concat([encoded_categorical_df_train, continuous_df_train], axis=1)
X_test = pd.concat([encoded_categorical_df_test, continuous_df_test], axis=1)

# 目标标签
Y_train = data['income']

print(f"X_train shape: {X_train.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (32561, 108)
Y_train shape: (32561,)
X_test shape: (16281, 108)


In [52]:
# 标准化特征
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.fit_transform(X_test)

# 打印标准化后的结果
print("标准化后的训练集前两行：")
print(X_train_standardized[:1])
print(X_test_standardized[:1])


标准化后的训练集前两行：
[[-0.2444502  -0.17429511 -0.26209736 -0.01466381 -1.5167923  -0.18838933
  -0.29093568  4.90769968 -0.02073999 -0.17175325 -0.19348662 -0.11609195
  -0.07201601 -0.10164955 -0.1422718  -0.12664495 -0.18406376 -0.21053433
   2.25399324 -0.11334387 -0.68994199 -0.23637391 -0.03960742 -0.13419553
  -0.53714425 -0.39750806 -0.02658695 -0.92284068 -0.11403678  1.43105786
  -0.1802846  -0.17735813 -0.24494366  2.76348874 -0.01662771 -0.37949517
  -0.37774555 -0.17745022 -0.20957797 -0.25595432 -0.33554133 -0.06780164
  -0.38166338 -0.14260848 -0.35531609 -0.17127887 -0.22710355 -0.82533335
   1.70899099 -0.17624972 -0.42934582 -0.34403232 -0.22492681 -0.09820087
  -0.18155194 -0.32576824 -0.09161163  0.4130197  -0.70307135  0.70307135
  -0.13502327 -0.02416321 -0.06107342 -0.0480488  -0.04260602 -0.05409379
  -0.04641598 -0.02933708 -0.05714946 -0.05264698 -0.02985682 -0.06500204
  -0.02985682 -0.04437806 -0.03678503 -0.00554189 -0.01998525 -0.02479131
  -0.01998525 -0.05550333

In [53]:
# 计算两类别的均值
class_0 = X_train_standardized[Y_train == 0]
class_1 = X_train_standardized[Y_train == 1]

mean_0 = np.mean(class_0, axis=0)
mean_1 = np.mean(class_1, axis=0)

# 计算共享的协方差
cov_0 = np.cov(class_0, rowvar=False)
cov_1 = np.cov(class_1, rowvar=False)
shared_cov = (cov_0 * len(class_0) + cov_1 * len(class_1)) / (len(class_0) + len(class_1))

print("类别0的均值：", mean_0)
print("类别1的均值：", mean_1)
print("共享的协方差矩阵：", shared_cov)


类别0的均值： [ 4.40412122e-02 -3.34380783e-02 -1.86365604e-02  4.65125155e-03
  4.42270069e-02 -7.85473567e-02 -1.69066863e-02 -8.35971909e-03
  6.57857035e-03  3.94452073e-02  4.83584333e-02  2.51694377e-02
  1.94551226e-02  2.58092287e-02  3.35238813e-02  3.14081380e-02
 -1.83055805e-03 -5.65889473e-03 -1.01649124e-01 -7.46643555e-02
  7.38851919e-02 -9.83174002e-02  1.25631802e-02 -8.72432984e-02
  3.57829609e-02  7.15233920e-02 -6.79296154e-03 -2.50452210e-01
  2.39537093e-02  1.79345115e-01  4.18942119e-02  3.62594384e-02
  4.42575982e-02  5.06797317e-02  2.84064170e-03  7.08258940e-03
 -1.21009571e-01  2.92439847e-02  4.91476014e-02  3.90862653e-02
  8.80548130e-02  2.09067952e-02 -1.04679367e-01 -1.58362779e-02
 -1.33411410e-02 -1.44721052e-02  1.20953468e-02 -2.25862461e-01
  1.06161438e-01  4.71490057e-02  1.28708860e-01  8.04569961e-02
 -6.94218414e-02  1.61756577e-02 -5.93793300e-03  5.01745683e-02
  1.79265943e-02 -4.79983096e-02  1.21639698e-01 -1.21639698e-01
 -1.71080108e-03 

In [56]:
# 计算模型参数 w 和 b
inv_shared_cov = inv(shared_cov)
w = np.dot(inv_shared_cov, (mean_0 - mean_1))
b = (
    -0.5 * np.dot(mean_0.T, np.dot(inv_shared_cov, mean_0))
    + 0.5 * np.dot(mean_1.T, np.dot(inv_shared_cov, mean_1))
    + np.log(float(len(class_0)) / len(class_1))
)

print("模型参数 w:", w)
print("模型参数 b:", b)

模型参数 w: [ 1.92116291e+10  1.30955301e+12  1.89864444e+12  1.22104905e+09
  3.55762575e+12  1.40846141e+12  2.07656118e+12  1.51457131e+12
  1.60493170e+11 -5.63728147e+11 -6.30202252e+11 -3.87060476e+11
 -2.42087998e+11 -3.39963136e+11 -4.71201556e+11 -4.21180526e+11
 -6.01574133e+11 -6.81205499e+11 -1.25257897e+12 -3.78133399e+11
 -1.57946903e+12 -7.56447468e+11 -1.33624658e+11 -4.45427936e+11
 -1.40860493e+12 -4.02931757e+12 -3.11860574e+11 -5.85019349e+12
 -1.32139443e+12 -5.51126826e+12 -2.04958703e+12 -2.01836305e+12
  3.95272705e+12  3.02273371e+12  1.57040234e+11  3.13380888e+12
  3.12297752e+12  1.62521450e+12  1.89660012e+12  2.26935504e+12
  2.84911667e+12  6.37597917e+11  3.14717352e+12  1.32038384e+12
  2.98042933e+12  1.57197277e+12  2.04024268e+12  9.49835462e+12
  8.43361208e+12  3.30730903e+12  7.01396476e+12  5.95182625e+12
  4.14227615e+12 -1.44987331e+12 -2.61999014e+12 -4.39023014e+12
 -1.35426504e+12 -5.25957252e+12 -7.21218936e+13 -7.21218936e+13
 -2.31250000e+00 

In [68]:
def stable_sigmoid(z):
    # 将 z 限制在 -500 到 500 之间，避免 exp 溢出
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

class GDA:
    def __init__(self, w, b):
        self.w = w
        self.b = b

    # 计算概率的函数
    def func(self, x):
        arr = np.empty((x.shape[0], 1), dtype=float)
        for i in range(x.shape[0]):
            # 计算线性组合 z = w^T * x + b
            z = np.dot(x[i], self.w) + self.b
            # 计算概率 using stable sigmoid function
            arr[i][0] = stable_sigmoid(z)
        return np.clip(arr, 1e-8, 1 - 1e-8)

    # 预测函数
    def predict(self, x):
        # 初始化预测结果数组，默认所有预测为1
        ans = np.ones((x.shape[0], 1), dtype=int)
        probabilities = self.func(x)
        # 对每个样本的概率进行判断
        for i in range(x.shape[0]):
            # 如果概率小于0.5，预测为类别0
            if probabilities[i] < 0.5:
                ans[i] = 0
        return ans

# 创建 GDA 对象并传入训练好的 w 和 b
gda_model = GDA(w=w, b=b)


In [72]:
# 使用标准化后的测试数据进行预测
test_predictions = gda_model.predict(X_test_standardized)

# 创建预测结果的 DataFrame
result = pd.DataFrame({
    "id": [f"id_{i+1}" for i in range(len(test_predictions))],  # ID 从 1 开始编号
    "label": test_predictions.flatten()  # 将预测结果展平为一维数组
})

# 将结果保存为 CSV 文件
result.to_csv('predict.csv', index=False)

print("预测结果已保存到文件 'predict.csv'")


预测结果已保存到文件 'predict.csv'
