# 4.7 Boosting dla klasyfikacji i regresji

## Boosting w przypadku klasyfikacji


In [10]:
import pandas as pd
import numpy as np
from joblib.testing import param

In [11]:
purchases_df = pd.read_parquet("../data/purchases_df.parquet").astype({
    "OperatingSystems": "category",
    "Browser": "category",
    "Region": "category",
    "TrafficType": "category",
    "Weekend": "int8",
})
purchases_df.sample(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
8217,1,95.333333,0,0.0,16,819.333333,0.0,0.015625,25.255642,0.0,Dec,2,2,7,2,New_Visitor,0,True
8986,0,0.0,0,0.0,22,876.25,0.0,0.004762,0.0,0.0,Dec,2,2,2,10,Returning_Visitor,0,False
9080,2,34.25,0,0.0,55,3227.483333,0.003636,0.016364,0.0,0.0,Nov,2,2,3,13,Returning_Visitor,0,False
2550,0,0.0,0,0.0,41,2213.0,0.004545,0.031061,0.0,0.2,May,2,2,5,4,Returning_Visitor,0,False
3745,5,54.0,0,0.0,15,304.5,0.0,0.010526,0.0,0.0,May,2,2,6,2,Returning_Visitor,0,False


In [12]:
X = purchases_df.drop(columns="Revenue")
y = purchases_df["Revenue"]

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [14]:
categorical_features = purchases_df.select_dtypes(["category", "object"]).columns
numerical_features = purchases_df.select_dtypes([int, float]).columns

categorical_transformer = Pipeline(steps=[
    ("one_hot_encoding", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer(transformers=[
    ("categorical", categorical_transformer, categorical_features),
    ("numerical", "passthrough", numerical_features),
])


### AdaBoost w akcji

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

In [16]:
ab_param_grid = {
    "n_estimators": [2, 5, 10, 25, 100, 1000]
}

In [17]:
ab_classifier = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", GridSearchCV(AdaBoostClassifier(random_state=253),
                                param_grid=ab_param_grid, cv=5, scoring="f1", n_jobs=-1,
                                return_train_score=True))
])
ab_classifier.fit(X, y)



In [18]:
ab_classifier.named_steps["classifier"].best_score_

np.float64(0.5951048788297871)

### Test XGBoost

In [19]:
from xgboost import XGBClassifier

In [22]:
xgb_param_grid = {
    "n_estimators": [2, 5, 10, 25, 100],
    "max_depth": range(3, 9, 2),
    "learning_rate": [0.01, 0.1, 1.0]
}

In [24]:
xgb_classifier = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", GridSearchCV(XGBClassifier(random_state=253),
                                param_grid=xgb_param_grid, scoring="f1",
                                n_jobs=-1, verbose=1, cv=5, return_train_score=True))
])
xgb_classifier.fit(X, y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits




In [25]:
xgb_classifier.named_steps["classifier"].best_score_

np.float64(0.6480930321937495)

### Boosting dla regresji

### Przewidywanie cen mieszkań
W 4_4 osiągnięty został wynik metodami Random Forest oraz Extra Trees na poziomie MSE = 22.07

In [26]:
boston_df = pd.read_parquet("../data/boston_df.parquet")
boston_df.sample(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
299,0.05561,70.0,2.24,0.0,0.4,7.041,10.0,7.8278,5.0,358.0,14.8,371.58,4.74,29.0
101,0.11432,0.0,8.56,0.0,0.52,6.781,71.3,2.8561,5.0,384.0,20.9,395.58,7.67,26.5
476,4.87141,0.0,18.1,0.0,0.614,6.484,93.6,2.3053,24.0,666.0,20.2,396.21,18.68,16.7
459,6.80117,0.0,18.1,0.0,0.713,6.081,84.4,2.7175,24.0,666.0,20.2,396.9,14.7,20.0
106,0.1712,0.0,8.56,0.0,0.52,5.836,91.9,2.211,5.0,384.0,20.9,395.67,18.66,19.5


In [27]:
W = boston_df.drop(columns="MEDV")
z = boston_df["MEDV"]

In [28]:
from sklearn.ensemble import AdaBoostRegressor

In [29]:
ab_regressor = GridSearchCV(AdaBoostRegressor(random_state=253),
                            param_grid=ab_param_grid, scoring="neg_mean_squared_error",
                            n_jobs=8, verbose=1, cv=5, return_train_score=True)
ab_regressor.fit(W, z)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [30]:
ab_regressor.best_score_

np.float64(-23.24266022750503)

In [31]:
from xgboost import XGBRegressor

In [35]:
xg_regressor = GridSearchCV(XGBRegressor(random_state=253),
                            param_grid=xgb_param_grid,
                            verbose=True, cv=5, scoring="neg_mean_squared_error", n_jobs=-1, return_train_score=True)
xg_regressor.fit(W, z)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [36]:
xg_regressor.best_score_

np.float64(-18.149871111461316)

Udało się! XGBoost pozwolił na zbicie wartości $ MSE $ o niemal 4!