# sklearn 机器学习 Pipeline 模板

1. 导入工具包

In [9]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

2. 读取数据

In [2]:
data = pd.read_csv("../../data/Employee_Satisfaction/train.csv")
test = pd.read_csv("../../data/Employee_Satisfaction/test.csv")
data.columns

Index(['id', 'last_evaluation', 'number_project', 'average_monthly_hours',
       'time_spend_company', 'Work_accident', 'package',
       'promotion_last_5years', 'division', 'salary', 'satisfaction_level'],
      dtype='object')

In [3]:
data.head()

Unnamed: 0,id,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,package,promotion_last_5years,division,salary,satisfaction_level
0,13697,0.99,3,161.39,2,0,a,0,accounting,medium,0.681
1,1142,1.0,5,226.22,6,0,b,0,marketing,low,0.876
2,7954,0.91,5,199.45,2,0,e,0,sales,medium,0.955
3,2225,0.51,3,235.14,3,0,c,0,sales,low,0.817
4,9753,0.89,3,219.91,2,0,a,0,technical,low,0.92


In [4]:
y = data['satisfaction_level']
X = data.drop(['satisfaction_level'], axis=1)

In [6]:
X.dtypes == 'object'

id                       False
last_evaluation          False
number_project           False
average_monthly_hours    False
time_spend_company       False
Work_accident            False
package                   True
promotion_last_5years    False
division                  True
salary                    True
dtype: bool

3. 数字特征、文字特征分离

In [8]:
def num_cat_splitor(X):
    s = (X.dtypes == 'object')
    object_cols = list(s[s].index)
    # object_cols # ['package', 'division', 'salary']
    num_cols = list(set(X.columns) - set(object_cols))
    # num_cols
    # ['Work_accident', 'time_spend_company', 'promotion_last_5years', 'id',
    #  'average_monthly_hours',  'last_evaluation',  'number_project']
    return num_cols, object_cols
num_cols, object_cols = num_cat_splitor(X)
# print(num_cols)
# print(object_cols)
# X[object_cols].values

In [11]:
# 特征数值筛选器
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

4. 数据处理Pipeline

In [12]:
# 数字特征
# 缺失值填充：SimpleImputer: https://blog.csdn.net/qq_43965708/article/details/115625768
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_cols)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# 文字特征
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(object_cols)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

# 组合数字和文字特征
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
X_prepared = full_pipeline.fit_transform(X)

5. 尝试不同的模型

In [16]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg,X_prepared,y,
                               scoring='neg_mean_squared_error',cv=3)
forest_rmse_scores = np.sqrt(-forest_scores)
print(forest_rmse_scores)
print(forest_rmse_scores.mean())
print(forest_rmse_scores.std())

[0.17770875 0.18181102 0.18152506]
0.18034827697985442
0.0018700737014170534


6. 参数搜索

In [17]:
param_grid = [
    {'n_estimators' : [3,10,30,50,80],'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators' : [3,10],'max_features':[2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')
grid_search.fit(X_prepared,y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 50, 80]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')

In [19]:
# 最佳参数
grid_search.best_params_

{'max_features': 8, 'n_estimators': 80}

In [20]:
# 最优模型
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=80)

In [None]:
# 搜索结果
cv_result = grid_search.cv_results_
for mean_score, params in zip(cv_result['mean_test_score'], cv_result['params']):
    print(np.sqrt(-mean_score), params)

7. 特征重要性筛选

In [23]:

# 选择前 k 个最重要的特征

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]   

8. 最终完整Pipeline

In [None]:
k = 3
feature_importances = grid_search.best_estimator_.feature_importances_

prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('forst_reg', RandomForestRegressor())
])

# 参数搜索
param_grid = [{
    'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(5, len(feature_importances) + 1)),
    'forst_reg__n_estimators' : [200,250,300,310,330],
    'forst_reg__max_features':[2,4,6,8]
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=10,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# 训练
grid_search_prep.fit(X,y)
grid_search_prep.best_params_
final_model = grid_search_prep.best_estimator_

In [None]:
# 预测
y_pred_test = final_model.predict(test)
result = pd.DataFrame()
result['id'] = test['id']
result['satisfaction_level'] = y_pred_test
result.to_csv('../../data/Employee_Satisfaction/rf_ML_pipeline.csv',index=False)

In [25]:
feature_importances

array([0.012652  , 0.00321519, 0.14278332, 0.19327538, 0.09005369,
       0.13698183, 0.2366679 , 0.01190325, 0.02384898, 0.0120757 ,
       0.01868533, 0.01173142, 0.00736184, 0.0064114 , 0.00574573,
       0.00527186, 0.00462312, 0.0061626 , 0.0068941 , 0.01211615,
       0.01002893, 0.01073024, 0.00755472, 0.01161603, 0.01160931])