In [None]:
import pandas as pd

# Load the training dataset; the first row of the CSV is used as the header.
df = pd.read_csv('data_train.csv', header=0)

# print(X.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count the samples of each class in the last column (all rows, column -1).
# value_counts() orders its result by frequency, so sort by class value to
# keep the fixed 'Negative'/'Positive' labels attached to the right wedge
# even when the positive class is the larger one.
# NOTE(review): assumes the smaller label value is the negative class.
count = df.iloc[:, -1].value_counts().sort_index()

count.plot(kind='pie', labels=['Negative', 'Positive'], autopct='%1.1f%%', shadow=True)
plt.title('Class Distribution')
plt.show()

In [None]:
# Draw one figure per feature, overlaying the distribution of the rows with
# income == 0 and the rows with income == 1. The label column is skipped.
for feature in (name for name in df.columns if name != 'income'):
    for class_value, class_name in ((0, 'Negative'), (1, 'Positive')):
        values = df.loc[df['income'] == class_value, feature]
        plt.hist(values, bins=20, alpha=0.5, label=class_name)
    plt.legend(loc='upper right')
    plt.title(feature)
    plt.xticks(rotation=90, fontsize=8)
    plt.show()


In [None]:
import seaborn as sns

# Compute pairwise correlations over the numeric columns only. At this point
# the categorical columns have not been label-encoded yet, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()

# Render the correlation matrix as an annotated heatmap.
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single LabelEncoder instance is shared by all columns; it is re-fit on
# every column below, so afterwards it only holds the classes of the last
# column it encoded.
encoder = LabelEncoder()

# Replace every object-dtype column of df with its integer label encoding.
for col in df:
    if df[col].dtype != 'object':
        continue
    df[col] = encoder.fit_transform(df[col].astype(str))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler, used for the fnlwgt column.
scaler = MinMaxScaler()

# Fit on fnlwgt and rescale it to the range [0, 100].
df['fnlwgt'] = 100 * scaler.fit_transform(df.loc[:, ['fnlwgt']])


df.head()

In [None]:
# Sequential split: the first 80% of the rows form the training set and the
# remaining 20% form the validation set (no shuffling).
split_idx = int(0.8 * len(df))

df_train, df_valid = df.iloc[:split_idx], df.iloc[split_idx:]

print(df_train.head())
print(df_valid.head())

In [None]:
from sklearn.decomposition import PCA

# Features are every column but the last; the last column is the target.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# Project the features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Combine the projected coordinates and the target into one frame.
df_pca = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
# Assign by position: df_pca has a fresh 0..n-1 index, and assigning the
# Series itself would align on index labels and leave NaN targets wherever
# the index of df_train differs from that range.
df_pca['target'] = y.to_numpy()

print(df_pca)

# Scatter plot of the projected samples, one colour per class.
fig, ax = plt.subplots()
for class_value, colour, class_name in ((0, 'blue', 'Negative'), (1, 'red', 'Positive')):
    members = df_pca[df_pca['target'] == class_value]
    ax.scatter(members['PCA1'], members['PCA2'], c=colour, label=class_name)
ax.legend()
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
plt.show()


In [None]:

from sklearn.linear_model import LogisticRegression



X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# L1-regularised logistic regression; the penalty can drive coefficients to
# exactly zero, which is what the selection below relies on.
clf = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=0)
clf.fit(X_train, y_train)

# Keep the names of the features whose coefficient is non-zero.
best_features = [
    name
    for name, weight in zip(X_train.columns, clf.coef_[0])
    if weight != 0
]

# Print the selected feature subset.
print(best_features)


In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Features are all columns but the last; the last column is the label.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_valid, y_valid = df_valid.iloc[:, :-1], df_valid.iloc[:, -1]

df_test = pd.read_csv('data_test.csv', header=0)

# Re-read the raw training file so each categorical test column is encoded
# with the same string -> integer mapping the training frame received.
# Re-fitting an encoder on the test column alone assigns different codes
# whenever the two files do not contain exactly the same set of categories.
train_raw = pd.read_csv('data_train.csv', header=0)

for col in df_test:
    if df_test[col].dtype != 'object':
        continue
    if col in train_raw.columns and train_raw[col].dtype == 'object':
        reference = train_raw[col]
    else:
        # No categorical counterpart in the training file: fall back to
        # fitting on the test column itself, as this cell did before.
        reference = df_test[col]
    col_encoder = LabelEncoder().fit(reference.astype(str))
    # transform() raises ValueError for a category that is absent from
    # `reference`, instead of silently giving it a mismatched code.
    df_test[col] = col_encoder.transform(df_test[col].astype(str))

# Scale fnlwgt with the min/max already held by `scaler` rather than
# re-fitting on the test file, so train and test share one scale.
# NOTE(review): assumes `scaler` still holds the fit made on the training data.
df_test['fnlwgt'] = scaler.transform(df_test[['fnlwgt']]) * 100


X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]


# Build the network: two ReLU hidden layers and a sigmoid output unit.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile with binary cross-entropy, the Adam optimiser and accuracy tracking.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train, reporting metrics on the validation split after every epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_valid, y_valid))

# Evaluate on the validation split. The variable names are kept as they were,
# but the printed label now names the split that is actually evaluated
# (it previously said "Test accuracy").
test_loss, test_acc = model.evaluate(X_valid, y_valid)

print('Validation accuracy:', test_acc)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Round the predicted probabilities to hard 0/1 class labels.
y_pred = np.rint(model.predict(X_valid)).astype(int)

print(y_pred)

# Precision, recall and F1-score on the validation split.
precision, recall, f1 = (
    metric(y_valid, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)


print(precision, recall, f1)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score


# Features are all columns but the last; the last column is the label.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_valid, y_valid = df_valid.iloc[:, :-1], df_valid.iloc[:, -1]

df_test = pd.read_csv('data_test.csv', header=0)

# Re-read the raw training file so each categorical test column is encoded
# with the same string -> integer mapping the training frame received.
# Re-fitting an encoder on the test column alone assigns different codes
# whenever the two files do not contain exactly the same set of categories.
train_raw = pd.read_csv('data_train.csv', header=0)

for col in df_test:
    if df_test[col].dtype != 'object':
        continue
    if col in train_raw.columns and train_raw[col].dtype == 'object':
        reference = train_raw[col]
    else:
        # No categorical counterpart in the training file: fall back to
        # fitting on the test column itself, as this cell did before.
        reference = df_test[col]
    col_encoder = LabelEncoder().fit(reference.astype(str))
    # transform() raises ValueError for a category that is absent from
    # `reference`, instead of silently giving it a mismatched code.
    df_test[col] = col_encoder.transform(df_test[col].astype(str))

# Scale fnlwgt with the min/max already held by `scaler` rather than
# re-fitting it here, so this cell no longer overwrites the scaler's state.
# NOTE(review): assumes `scaler` still holds the fit made on the training data.
df_test['fnlwgt'] = scaler.transform(df_test[['fnlwgt']]) * 100


X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]


# Convert each pandas object to a NumPy array and then to a float32 PyTorch
# tensor, once. The conversion used to be repeated a second time on the
# resulting tensors; torch.tensor(<tensor>) only makes another copy and emits
# a UserWarning recommending clone().detach().
X_train = torch.tensor(np.asarray(X_train), dtype=torch.float32)
y_train = torch.tensor(np.asarray(y_train), dtype=torch.float32)
X_valid = torch.tensor(np.asarray(X_valid), dtype=torch.float32)
y_valid = torch.tensor(np.asarray(y_valid), dtype=torch.float32)
X_test = torch.tensor(np.asarray(X_test), dtype=torch.float32)
y_test = torch.tensor(np.asarray(y_test), dtype=torch.float32)


# Neural network model definition.
class Net(nn.Module):
    """Feed-forward binary classifier: in_features -> 64 -> 32 -> 1.

    The two hidden layers use ReLU; the output passes through a sigmoid, so
    the forward pass returns one value in [0, 1] per input row.

    Args:
        in_features: Width of the input vectors. When omitted, it is read from
            the module-level ``X_train`` (``X_train.shape[1]``), which is how
            the network was sized before this parameter existed.
    """

    def __init__(self, in_features=None):
        super(Net, self).__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the
            # training matrix defined earlier in the notebook.
            in_features = X_train.shape[1]
        self.fc1 = nn.Linear(in_features=in_features, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network, shape (rows, 1)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))

# Create the model, the optimiser (plain SGD) and the loss (binary
# cross-entropy on the sigmoid outputs).
net = Net()
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

# Training configuration.
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    # The validation step below switches the network to eval mode, so switch
    # back to training mode at the start of every epoch.
    net.train()
    running_loss = 0.0
    num_batches = 0
    for start in range(0, X_train.shape[0], batch_size):
        # X_train / y_train are already float32 tensors; slice them directly.
        inputs = X_train[start:start + batch_size]
        labels = y_train[start:start + batch_size].unsqueeze(1)

        # Forward pass, loss, backward pass and parameter update.
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss.
        running_loss += loss.item()
        num_batches += 1

    # Evaluate on the validation split.
    net.eval()
    with torch.no_grad():
        inputs = X_valid
        labels = y_valid.unsqueeze(1)
        outputs = net(inputs)
        val_loss = criterion(outputs, labels)
        # squeeze(1) keeps a 1-D array even when there is a single row.
        val_preds = outputs.round().squeeze(1).numpy()
        val_labels = labels.squeeze(1).numpy()
        val_accuracy = np.mean(val_preds == val_labels)
        val_precision = precision_score(val_labels, val_preds)
        val_recall = recall_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds)

    # Average over the number of batches actually processed; dividing by
    # X_train.shape[0] / batch_size counts the final partial batch only
    # fractionally and overstates the mean.
    train_loss = running_loss / max(num_batches, 1)

    print(f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1-Score: {val_f1:.4f}')




In [None]:
# Preview the processed test frame (first rows only, as done for df earlier).
df_test.head()

In [None]:
# Evaluate the trained network on the test tensors with gradients disabled.
net.eval()
with torch.no_grad():
    inputs = torch.Tensor(X_test)
    print(y_test.shape)
    labels = torch.Tensor(y_test).unsqueeze(dim=1)
    outputs = net(inputs)
    print(labels.shape, outputs.shape)
    # Labels are rounded to whole numbers before scoring.
    labels = labels.round()
    print(labels)
    test_loss = criterion(outputs, labels)
    test_preds = torch.round(outputs).squeeze().detach().numpy()
    test_labels = labels.squeeze().detach().numpy()
    test_accuracy = (test_preds == test_labels).mean()
    test_precision, test_recall, test_f1 = (
        metric(test_labels, test_preds)
        for metric in (precision_score, recall_score, f1_score)
    )

print(f'Precision: {test_precision:.4f}')
print(f'Recall: {test_recall:.4f}')
print(f'F1-score: {test_f1:.4f}')
