In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [2]:
# Load the training features and labels (paths are relative to the notebook).
# The 'fnlwgt' column is removed as soon as the feature file is read.
train_data_df = pd.read_csv("../data/traindata.csv").drop(columns=['fnlwgt'])
train_label_df = pd.read_csv("../data/trainlabel.txt")
# Column labels of the raw features that remain after the drop.
features = train_data_df.columns

# Fill in missing values: a missing entry is represented by the literal
# string '?', which is replaced by the column's most frequent value.
for column in ['workclass', 'occupation', 'native.country']:
    # Take the mode over the observed values only. Counting the placeholder
    # itself would turn the replacement into a silent no-op whenever '?'
    # happens to be the most frequent entry of a column.
    observed_values = train_data_df.loc[train_data_df[column] != '?', column]
    if observed_values.empty:
        # Nothing but placeholders: there is no value to impute with.
        continue
    most_common_value = observed_values.mode().values[0]
    train_data_df[column] = train_data_df[column].replace('?', most_common_value)

# Ordinal-encode the education level (an encoding step, not missing-value
# imputation): every category is replaced by an integer rank from 1 to 16.
# NOTE(review): 'Prof-school' (14) is ranked below 'Masters' (15) here;
# confirm that this is the intended ordering.
education_rank = {
    'Preschool': 1,
    '1st-4th': 2,
    '5th-6th': 3,
    '7th-8th': 4,
    '9th': 5,
    '10th': 6,
    '11th': 7,
    '12th': 8,
    'HS-grad': 9,
    'Some-college': 10,
    'Assoc-voc': 11,
    'Assoc-acdm': 12,
    'Bachelors': 13,
    'Prof-school': 14,
    'Masters': 15,
    'Doctorate': 16,
}

# Fail early, with a readable message, if the column holds a category that has
# no rank; otherwise the problem would only surface later as an opaque
# conversion error (or as NaNs) inside the scaler / classifier.
unknown_levels = set(train_data_df['education'].unique()) - set(education_rank)
if unknown_levels:
    raise ValueError(f"Unexpected education values: {sorted(map(str, unknown_levels))}")

# Map only the 'education' column. A frame-wide replace would scan (and could
# rewrite) every other column, and leaves the resulting dtype to pandas'
# replace-time inference; a column-level map yields a plain integer column.
train_data_df['education'] = train_data_df['education'].map(education_rank)

# One-hot encode the non-numeric columns and append the numeric ones.
encoder = OneHotEncoder()
categorical_frame = train_data_df[['workclass', 'marital.status','occupation', 'relationship', 'race', 'sex', 'native.country']]
# The encoder output is converted to a dense array; wrapped in a DataFrame it
# gets default integer column labels (0, 1, 2, ...).
onehot_frame = pd.DataFrame(encoder.fit_transform(categorical_frame).toarray())
numeric_frame = train_data_df[['age', 'education', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']]
# Column-wise join: the one-hot block first, then the numeric columns.
encoded_features = pd.concat([onehot_frame, numeric_frame], axis=1)

# Feature standardisation (z-score)
from sklearn.preprocessing import StandardScaler

# The one-hot block carries integer column labels while the numeric columns
# carry string labels; cast them all to str (presumably so that scikit-learn
# sees uniformly typed feature names - TODO confirm).
encoded_features.columns = encoded_features.columns.astype(str)

# Fit the scaler, then transform the same frame with it.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# and the cross-validation done in later cells, so held-out rows contribute to
# the scaling statistics. Confirm this is acceptable.
zscore_scaler = StandardScaler().fit(encoded_features)
df_zscore_scaled = zscore_scaler.transform(encoded_features)

# Wrap the scaled array in a new frame; its columns are again labelled with
# default integers, so the string labels assigned above are not carried over.
encoded_features = pd.DataFrame(data=df_zscore_scaled)

# Correlation analysis: drop features that are strongly correlated with a
# later feature.

# A pair of features counts as redundant when the absolute value of their
# correlation coefficient exceeds this threshold.
CORRELATION_THRESHOLD = 0.4
# NOTE(review): the feature at this position is never dropped; the reason is
# not recorded anywhere in the notebook - confirm it is still wanted.
EXEMPT_FEATURE_INDEX = 44

correlation_matrix = encoded_features.corr()

# Rebuild readable feature names in column order: one-hot names first, then
# the numeric columns that were appended after them.
encoded_feature_names = encoder.get_feature_names_out(['workclass', 'marital.status','occupation', 'relationship', 'race', 'sex', 'native.country'])
arr = np.array(['age', 'education', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week'])
encoded_feature_names = np.append(encoded_feature_names, arr)
for i, feature_name in enumerate(encoded_feature_names):
    print(i, feature_name)

# Read coefficients from a plain array instead of chained `frame[i][j]`
# look-ups, and collect the columns to remove so the frame is copied once at
# the end rather than once per correlated pair.
corr_values = correlation_matrix.to_numpy()
n_features = corr_values.shape[0]
columns_to_drop = set()
for i in range(n_features):
    if i == EXEMPT_FEATURE_INDEX:
        continue
    for j in range(i + 1, n_features):
        e = corr_values[j, i]
        # A NaN coefficient compares False and is therefore never reported.
        if abs(e) > CORRELATION_THRESHOLD:
            print(i, encoded_feature_names[i], j, encoded_feature_names[j], e)
            columns_to_drop.add(i)

# NOTE(review): the lower-indexed feature of every correlated pair is dropped
# even when its partner is itself dropped because of a third feature, so a
# chain of correlated features keeps only its last member.
encoded_features = encoded_features.drop(
    columns=[col for col in sorted(columns_to_drop) if col in encoded_features.columns]
)

encoded_features

0 workclass_Federal-gov
1 workclass_Local-gov
2 workclass_Never-worked
3 workclass_Private
4 workclass_Self-emp-inc
5 workclass_Self-emp-not-inc
6 workclass_State-gov
7 workclass_Without-pay
8 marital.status_Divorced
9 marital.status_Married-AF-spouse
10 marital.status_Married-civ-spouse
11 marital.status_Married-spouse-absent
12 marital.status_Never-married
13 marital.status_Separated
14 marital.status_Widowed
15 occupation_Adm-clerical
16 occupation_Armed-Forces
17 occupation_Craft-repair
18 occupation_Exec-managerial
19 occupation_Farming-fishing
20 occupation_Handlers-cleaners
21 occupation_Machine-op-inspct
22 occupation_Other-service
23 occupation_Priv-house-serv
24 occupation_Prof-specialty
25 occupation_Protective-serv
26 occupation_Sales
27 occupation_Tech-support
28 occupation_Transport-moving
29 relationship_Husband
30 relationship_Not-in-family
31 relationship_Other-relative
32 relationship_Own-child
33 relationship_Unmarried
34 relationship_Wife
35 race_Amer-Indian-Eskimo


Unnamed: 0,0,2,4,5,6,7,8,9,11,13,...,77,78,79,80,81,82,84,85,86,87
0,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,2.821506,1.131681,0.38330,-0.214689,-2.148673
1,-0.175098,-0.014813,5.270463,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,5.474248,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,0.108878,1.131681,-0.14506,-0.214689,0.775212
2,-0.175098,-0.014813,-0.189737,3.484849,-0.206136,-0.018738,-0.396990,-0.027321,8.793351,-0.182673,...,-0.025662,-0.025662,-3.268366,-0.045458,-0.022952,-0.697579,-0.416186,-0.14506,3.690793,0.775212
3,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,5.474248,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,0.182192,-0.416186,-0.14506,-0.214689,-0.036978
4,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,-1.210779,-0.029219,-0.14506,-0.214689,-1.255264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22787,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,0.695392,-1.190120,-0.14506,-0.214689,0.775212
22788,-0.175098,-0.014813,-0.189737,3.484849,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,-0.917522,-0.416186,-0.14506,-0.214689,-0.036978
22789,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,-0.396990,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,-0.184379,-0.416186,-0.14506,-0.214689,-0.036978
22790,-0.175098,-0.014813,-0.189737,-0.286956,-0.206136,-0.018738,2.518958,-0.027321,-0.113722,-0.182673,...,-0.025662,-0.025662,0.305963,-0.045458,-0.022952,-0.404322,-0.029219,-0.14506,-0.214689,-0.036978


In [3]:
from sklearn.model_selection import cross_val_score

# Split the data into a training set and a hold-out test set.

# Reduce the label frame to a flat 1-D array before splitting. The name
# `train_label_df` ends up bound to that array, so the 'label' lookup is only
# done while it is still a DataFrame; this keeps the cell safe to re-run
# without first re-running the loading cell.
if isinstance(train_label_df, pd.DataFrame):
    train_label_df = train_label_df['label'].to_numpy().ravel()

# 80/20 split with a fixed seed; y_train / y_test are 1-D label arrays.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_features, train_label_df, test_size=0.2, random_state=6
)

def evaluate(model, X=None, y=None, cv=5):
    """Cross-validate ``model`` and print the per-fold and mean scores.

    Parameters
    ----------
    model : estimator
        Unfitted estimator, passed to ``cross_val_score`` unchanged.
    X : optional
        Feature matrix. Defaults to the notebook-level ``encoded_features``.
    y : optional
        Target labels. Defaults to the notebook-level ``train_label_df``.
    cv : optional
        Forwarded to ``cross_val_score`` as ``cv``; the default of 5 splits
        the data into five folds.

    Returns
    -------
    None
        The scores are only printed.
    """
    # Resolve the defaults at call time so the current notebook state is used.
    if X is None:
        X = encoded_features
    if y is None:
        y = train_label_df

    # Run the cross-validation.
    scores = cross_val_score(model, X, y, cv=cv)

    # Score of each individual fold.
    print("Cross-validation scores:", scores)

    # Mean score over all folds.
    print("Average score:", scores.mean())

In [4]:
# Baseline: an RBF-kernel SVM classifier with every other setting left at its
# default, scored with the cross-validation helper defined above.
clf = svm.SVC(kernel="rbf")
evaluate(model=clf)

Cross-validation scores: [0.84163194 0.83790305 0.85344449 0.849276   0.85103115]
Average score: 0.8466573262402044


In [5]:
# Candidate values for the SVM parameter C.
param_grid = {'C': [0.001, 0.1, 1, 10, 1000]}

# 5-fold grid search over C, selecting by accuracy. n_jobs=-1 runs the
# independent (candidate, fold) fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=svm.SVC(kernel='rbf'),
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Mean cross-validated accuracy of every candidate.
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['params', 'mean_test_score']])

# Best candidate and its cross-validated accuracy.
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Score the refitted best model on the hold-out split, which takes no part in
# the search itself; without this the split is created but never used.
print("Held-out test accuracy:", grid_search.score(X_test, y_test))

         params  mean_test_score
0  {'C': 0.001}         0.759228
1    {'C': 0.1}         0.832940
2      {'C': 1}         0.845116
3     {'C': 10}         0.846542
4   {'C': 1000}         0.834421
Best parameters: {'C': 10}
Best score: 0.8465417890191759
