## 导入库

In [29]:
import pandas as pd
import numpy as np
import sklearn.utils
import time
from sklearn.preprocessing import OneHotEncoder #独热编码
from sklearn.model_selection import train_test_split #用于分割数据集
from sklearn import svm #支持向量机
from sklearn.svm import SVC #平衡样本
from sklearn.metrics import classification_report # 报告训练结果
from sklearn.model_selection import GridSearchCV #用于搜索最优参数

## 导入数据

In [2]:
salary_df = pd.read_csv(r'.\salary.csv')
salary_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## 数据预处理

### 探索性处理

In [3]:
salary_df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
#查看数据缺失情况
salary_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [8]:
# 空值处理
salary_df=salary_df.replace(regex=[r'\?|\.|\$'],value=np.nan) #正则匹配，标记缺失
salary_df.isnull().sum() #计算每个变量的缺失个数

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [10]:
salary_df.dropna(how='any',inplace=True) #采用删除
salary_df.shape #查看数据维度

(30162, 15)

### 数据编码

In [11]:
#对类别属性转one_hot编码
for feature in salary_df.columns:
    if salary_df[feature].dtype == 'object':
        salary_df[feature] = pd.Categorical(salary_df[feature]).codes
salary_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [14]:
salary_df["salary"].replace([0,1],[-1,1],inplace = True) #对lable编码

### 对部分变量进行Onehot编码

In [17]:
salary_df.education.value_counts()

11    9840
15    6678
9     5044
12    1627
8     1307
1     1048
7     1008
0      820
5      557
14     542
6      455
2      377
10     375
4      288
3      151
13      45
Name: education, dtype: int64

In [18]:
salary_df = pd.concat([salary_df.drop(columns=['education']),pd.get_dummies(salary_df[['education']])],axis=1)

## 划分数据集

In [19]:
#分割训练集、验证集和测试集的划分,使用随机种子分割比例为5:3:2
x_train,x_validate_test,y_trian,y_validate_test = train_test_split(salary_df.drop(columns=['salary']),salary_df[['salary']],test_size = 0.5,train_size = 0.5,random_state=1) #先把训练集分出来
x_validate,x_test,y_validate,y_test = train_test_split(x_validate_test,y_validate_test,test_size = 0.4,train_size = 0.6,random_state=1) #再分验证集和测试集
print('训练集样本量：%i'%len(x_train))
print('验证集样本量：%i'%len(x_validate))
print('测试集样本量：%i'%len(x_test))

训练集样本量：15081
验证集样本量：9048
测试集样本量：6033


## 训练模型

In [20]:
model = svm.SVC(class_weight='balanced')  #自动调整不平衡样本
model.fit(x_train, y_trian)

  y = column_or_1d(y, warn=True)


SVC(class_weight='balanced')

In [22]:
#训练结果
classification_report(y_validate, model.predict(x_validate))

              precision    recall  f1-score   support

          -1       0.79      0.99      0.88      6798
           1       0.88      0.19      0.32      2250

    accuracy                           0.79      9048
   macro avg       0.84      0.59      0.60      9048
weighted avg       0.81      0.79      0.74      9048



## 调参

In [None]:
parameters={'kernel':['linear','rbf','sigmoid','poly'],'C':np.linspace(0.1,20,10),'gamma':np.linspace(0.1,20,10)} #设置内核，C和GAMMA参数调整进行搜索
svc = svm.SVC()
model = GridSearchCV(svc,parameters,cv=5,scoring='accuracy')
model.fit(x_train,y_trian)
model.best_params_ #输出最佳参数
model.score(x_test,y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## 输出测试集合准确度

In [26]:
print(classification_report(y_test,model.predict(x_test)))

              precision    recall  f1-score   support

          -1       0.78      0.99      0.87      4498
           1       0.89      0.17      0.28      1535

    accuracy                           0.78      6033
   macro avg       0.83      0.58      0.58      6033
weighted avg       0.81      0.78      0.72      6033

