In [21]:
import pandas as pd
import numpy as np

In [22]:
# 读取census_income.csv
census_income = pd.read_csv('census_income.csv')
# 将数据集中的非数值类进行标签化
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for col in census_income.columns:
	if census_income[col].dtype == 'object':
		census_income[col] = labelencoder.fit_transform(census_income[col])

# 使用平均数填充缺失值
census_income = census_income.fillna(census_income.mean())


In [23]:
# 将数据集划分为X和y
X = census_income.iloc[:, :-1]
y = census_income.iloc[:, -1]

# 1. 将数据集划分为训练集和测试集，其中测试集占总数据集的30%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
													test_size=.3,
													random_state=42)

In [24]:
# 使用SMOTE处理类别不平衡情况
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [27]:
# 使用catboost进行分类
from catboost import CatBoostClassifier
catboost = CatBoostClassifier(verbose=300, random_state=42, n_estimators=3000)
# 设置 one_hot_max_size 参数
catboost.set_params(one_hot_max_size=50)

catboost.fit(X_train_sm, y_train_sm)
y_pred = catboost.predict(X_test)

# 输出结果的精准度，召回率，F1值
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Learning rate set to 0.012716
0:	learn: 0.6816165	total: 5.87ms	remaining: 17.6s
300:	learn: 0.2801017	total: 1.82s	remaining: 16.3s
600:	learn: 0.2522251	total: 3.55s	remaining: 14.2s
900:	learn: 0.2336880	total: 5.27s	remaining: 12.3s
1200:	learn: 0.2194157	total: 6.99s	remaining: 10.5s
1500:	learn: 0.2084288	total: 8.7s	remaining: 8.69s
1800:	learn: 0.1994771	total: 10.3s	remaining: 6.89s
2100:	learn: 0.1916290	total: 12.1s	remaining: 5.16s
2400:	learn: 0.1844780	total: 13.7s	remaining: 3.42s
2700:	learn: 0.1781210	total: 15.4s	remaining: 1.7s
2999:	learn: 0.1722489	total: 17s	remaining: 0us
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      3696
           1       0.71      0.69      0.70      1189

    accuracy                           0.85      4885
   macro avg       0.80      0.80      0.80      4885
weighted avg       0.85      0.85      0.85      4885



In [26]:
# 输出每个属性的权重
feature_importance = pd.DataFrame({'feature': X_train.columns,
								   'importance': catboost.feature_importances_})
feature_importance.sort_values(by='importance', ascending=False, inplace=True)
print(feature_importance)

           feature  importance
4    education-num   19.367470
3        education   19.125663
5   marital-status   14.018690
12  hours-per-week    9.784763
10    capital-gain    8.540939
0              age    5.988355
7     relationship    4.945408
6       occupation    4.823682
1        workclass    4.817747
2           fnlwgt    3.391524
9              sex    2.235695
11    capital-loss    1.830420
8             race    0.588597
13  native-country    0.541049
