In [1]:
# 要放在最上面，因為下面會用到sklearn 會有runtime error
!pip uninstall -y scikit-learn
!pip install 'scikit-learn>=0.24.0'
!pip uninstall -y distributed
!pip install 'distributed==2.30.1'
!pip install auto-sklearn
import autosklearn

Uninstalling scikit-learn-0.24.2:
  Successfully uninstalled scikit-learn-0.24.2
Collecting scikit-learn>=0.24.0
  Using cached https://files.pythonhosted.org/packages/a8/eb/a48f25c967526b66d5f1fa7a984594f0bf0a5afafa94a8c4dbc317744620/scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.24.2
Uninstalling distributed-2.30.1:
  Successfully uninstalled distributed-2.30.1
Collecting distributed==2.30.1
  Using cached https://files.pythonhosted.org/packages/88/38/d9f0e31c15de18cb124d1ed33cf9c99c84f05f251ff6767e7573c217725b/distributed-2.30.1-py3-none-any.whl
Installing collected packages: distributed
Successfully installed distributed-2.30.1


In [2]:
import sklearn
import sklearn.datasets
import sklearn.model_selection

# 取得資料
x, y = sklearn.datasets.fetch_openml(data_id=31, return_X_y=True, as_frame=True)

# 切分資料
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.4, random_state=42)

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 24 to 102
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         600 non-null    category
 1   duration                600 non-null    float64 
 2   credit_history          600 non-null    category
 3   purpose                 600 non-null    category
 4   credit_amount           600 non-null    float64 
 5   savings_status          600 non-null    category
 6   employment              600 non-null    category
 7   installment_commitment  600 non-null    float64 
 8   personal_status         600 non-null    category
 9   other_parties           600 non-null    category
 10  residence_since         600 non-null    float64 
 11  property_magnitude      600 non-null    category
 12  age                     600 non-null    float64 
 13  other_payment_plans     600 non-null    category
 14  housing                 6

In [3]:
# 手動建立pipeline

from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [4]:
estimator_rf = RandomForestClassifier()

categorical_columns = [col for col in x_train.columns if x[col].dtype.name=='category']
encoder = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')
pipeline_rf = Pipeline([
    ('encoder', encoder),
    ('scaler', StandardScaler()),
    ('rf', estimator_rf),
])
pipeline_rf.fit(x_train, y_train)

Pipeline(steps=[('encoder',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['checking_status',
                                                   'credit_history', 'purpose',
                                                   'savings_status',
                                                   'employment',
                                                   'personal_status',
                                                   'other_parties',
                                                   'property_magnitude',
                                                   'other_payment_plans',
                                                   'housing', 'job',
                                                   'own_telephone',
                                                   'foreign

In [5]:
# metrics

prediction = pipeline_rf.predict(x_test)
acc = accuracy_score(y_test, prediction)
print(acc)

0.7575


In [10]:
prediction

array(['good', 'good', 'bad', 'good', 'good', 'good', 'good', 'bad',
       'good', 'good', 'good', 'bad', 'good', 'bad', 'bad', 'good',
       'good', 'good', 'good', 'good', 'good', 'bad', 'good', 'good',
       'good', 'bad', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'bad', 'good', 'good', 'bad',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'bad', 'bad', 'good', 'good', 'bad', 'good', 'good',
       'good', 'good', 'good', 'bad', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'bad', 'good', 'good', 'bad', 'bad',
       'good', 'good', 'good', 'good', 'bad', 'good', 'good', 'good',
       'good', 'good', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'bad', 'good', 'good', 'bad', 'good',
       'good', 'good', 'good

### 測試GBDT

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
pipeline_gbdt = Pipeline([
    ('encoder', encoder),
    ('scaler', StandardScaler()),
    ('gbdt', GradientBoostingClassifier()),
])
pipeline_gbdt.fit(x_train, y_train)
prediction_gbdt = pipeline_gbdt.predict(x_test)

print(accuracy_score(y_test, prediction_gbdt))

0.7375


In [8]:
from autosklearn.classification import AutoSklearnClassifier
# 更多可以查閱官網: https://automl.github.io/auto-sklearn/master/api.html

# modelling
estimator_auto = AutoSklearnClassifier(
    time_left_for_this_task=300,    # 秒
    seed=42,
    resampling_strategy='cv',
    n_jobs=1                        # 1 CPU
)
estimator_auto.fit(x_train, y_train)


# score
prediction_auto = estimator_auto.predict(x_test)
acc_auto = accuracy_score(y_test, prediction_auto)
print(acc_auto)

0.775


In [9]:
prediction_auto

array(['good', 'good', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'bad', 'good', 'bad', 'bad', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'bad', 'good', 'good', 'bad',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'bad', 'bad', 'good', 'good', 'bad', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'bad', 'bad',
       'good', 'good', 'good', 'good', 'bad', 'good', 'good', 'good',
       'good', 'good', 'bad', 'good', 'good', 'good', 'good', 'good',
       'good', 'good', 'good', 'good', 'good', 'good', 'bad', 'good',
       'good', 'good'

### 透過show_models可以看到結果
- 我認為可以快速發現比較適用的模型。
    - 隨機森林(0.56): 可見是一個表現較為良好的模型。
    - ExtraTrees(0.12)
    - 隨機森林(0.08): 不同的參數設定
    - GBDT(0.04)
    - more...

In [17]:
from pprint import pprint

pprint(estimator_auto.show_models())

("[(0.560000, SimpleClassificationPipeline({'balancing:strategy': 'none', "
 "'classifier:__choice__': 'random_forest', "
 "'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': "
 "'one_hot_encoding', "
 "'data_preprocessing:categorical_transformer:category_coalescence:__choice__': "
 "'minority_coalescer', "
 "'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', "
 "'data_preprocessing:numerical_transformer:rescaling:__choice__': "
 "'standardize', 'feature_preprocessor:__choice__': 'no_preprocessing', "
 "'classifier:random_forest:bootstrap': 'True', "
 "'classifier:random_forest:criterion': 'gini', "
 "'classifier:random_forest:max_depth': 'None', "
 "'classifier:random_forest:max_features': 0.5, "
 "'classifier:random_forest:max_leaf_nodes': 'None', "
 "'classifier:random_forest:min_impurity_decrease': 0.0, "
 "'classifier:random_forest:min_samples_leaf': 1, "
 "'classifier:random_forest:min_samples_split': 2, "
 "'classifier:random_fores

## 總結
- 比起繁瑣的人工特徵處理、模型選擇，AutoML提供一個快速得到還不錯模型的方法，但效果不會是最好的。
- 有時候暴力破解不失為一個好方法
- 但需要題目定義清楚，也就是資料科學家最重要的工作
- 簡單的任務可以達到，但更複雜、非監督式任務就不適用。