### Esercizi Extra

In [None]:
# install
#%conda install catboost
#%conda install xgboost
#%conda install lightgbm

In [2]:
# import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import sklearn
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
import os
from urllib.request import urlretrieve

In [3]:
# utility function
def download(file, url):
    if not os.path.isfile(file):
        urlretrieve(url, file)
    return None

def print_grid_search_eval(X, y, grid_search, full_output=False):
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X)
    print(f"Best Params: \n{grid_search.best_params_}")
    print(f"      MSE: {mean_squared_error(y, y_pred):12.4f}")
    print(f"R-squared: {r2_score(y, y_pred):12.4f}")
    print(f"    RMSPE: {rmspe(y, y_pred):12.4f}")
    if full_output :
        return pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")
    else :
        return None

def extract_date_fields(X):
    result = {}
    for col in X.columns:
        result[f"{col}_day"]       = X[col].dt.day        # giorno del mese (1-31)
        result[f"{col}_month"]     = X[col].dt.month      # mese (1-12)
        result[f"{col}_dayofweek"] = X[col].dt.dayofweek  # giorno della settimana (0-6)
    return pd.DataFrame(result)

In [4]:
# main
data_train = pd.read_csv(
    "rossmann_train.csv.gz",
    parse_dates=["Date"],
    compression="gzip",
)
data_val = pd.read_csv(
    "rossmann_valid.csv.gz",
    parse_dates=["Date"],
    compression="gzip",
)

y_train = data_train["Sales"]
y_val = data_val["Sales"]

data_train_sample = data_train.sample(60000, random_state=42)
y_train_sample = y_train.reindex_like(data_train_sample)

numeric_vars = [
    "CompetitionOpen", "CompetitionDistance",
    "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear",
    "Promo2SinceWeek", "Promo2SinceYear", "PromoOpen"
]
binary_vars = ["Promo", "SchoolHoliday", "Promo2", "IsPromoMonth"]
categorical_vars = ["StateHoliday", "StoreType", "Assortment"]

warnings.filterwarnings('ignore')        
kf = KFold(5, shuffle=True, random_state=42)

date_transformer = FunctionTransformer(extract_date_fields)
date_ft = date_transformer.fit_transform(data_train[["Date"]])
date_val_ft = date_transformer.fit_transform(data_train[["Date"]])

Ricreare il modello, simile a `date_transformer`, e addestrarlo trattando `Date_dayofweek` come variabile categorica

In [17]:
categorical_vars_con_dow = categorical_vars + ["Date_dayofweek"]
numeric_vars_con_dow = numeric_vars + ["Date_day","Date_month"]
model = Pipeline([
    ("col_t", ColumnTransformer([
        ("numeric" , StandardScaler(), numeric_vars_con_dow + binary_vars),
        ("categoric" , OneHotEncoder(),  categorical_vars_con_dow)
    ], remainder="drop")),
    ("Ridge", Ridge())
])

data_train_sample_dow = pd.DataFrame(np.c_[data_train,date_ft]).sample(60000, random_state=42)
y_train_sample_dow = pd.DataFrame(np.c_[y_train,date_ft]).reindex_like(data_train_sample_dow)

print(data_val.shape)
print(y_val.shape)
model.fit(data_train_sample_dow, y_train_sample_dow)
score = model.score(data_val, y_val)
nf = model.n_features_in_
print(nf, score)

(40282, 18)
(40282,)


ValueError: A given column is not a column of the dataframe

In [None]:
dio = date_transformer.fit_transform(data_train[["Date"]])
pd.DataFrame(dio)

In [None]:
data_2_data = np.c_[
    data_train,
    dio
]
data_2_index = np.array(data_train.columns.values.tolist()+dio.columns.values.tolist())   
pd.DataFrame(data=data_2_data,columns=data_2_index)
#print(data_train.columns.values.tolist())
#print(dio.columns.values.tolist())
#print(data_2_index.shape)

In [9]:
date_transformer = FunctionTransformer(extract_date_fields)
newclm = date_transformer.fit_transform(data_train[["Date"]]).sample(5, random_state=42).columns.values.tolist()
model = Pipeline([("col_t",
    ColumnTransformer([
        ("num" , StandardScaler(), numeric_vars + binary_vars),
        ("date", Pipeline([("t",date_transformer),("cat_2",OneHotEncoder())]), ["Date"]),
        ("cat" , OneHotEncoder(),  categorical_vars)
    ],remainder="passthrough")),
    ("Ridge", Ridge())
])

model.fit(data_train_sample, y_train_sample)
score = model.score(data_val, y_val)
nf = model.n_features_in_
print(nf, score)
date_transformer = FunctionTransformer(extract_date_fields)
newclm = date_transformer.fit_transform(data_train[["Date"]]).sample(5, random_state=42).columns.values.tolist()
model = Pipeline([("col_t",
    ColumnTransformer([
        ("num" , StandardScaler(), numeric_vars + binary_vars),
        #("date", Pipeline([("t",date_transformer),("cat_2",OneHotEncoder())]), ["Date"]),
        ("date", date_transformer, ["Date"]),
        ("cat" , OneHotEncoder(),  categorical_vars)
    ],remainder="drop")),
    ("Ridge", Ridge())
])

model.fit(data_train_sample, y_train_sample)
score = model.score(data_val, y_val)
nf = model.n_features_in_
print(nf, score)

18 0.999933410547048
18 0.2573737052973013


In [None]:
#grid = {
#    
#}
#gs = GridSearchCV(r_model, grid, cv=kf)
#gs.fit(data_train_sample, y_train_sample)
#print(gs.best_params_)
#print(gs.score(data_val, y_val))
#pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")

Ricercare gli iperparametri migliori (con e senza tutte le features) con il metodo GridSearch.

In [None]:
lgbm_t = LGBMRegressor()
lgbm_grid_t = {
    "boosting_type" : ["gbdt","dart","goss"],
    "num_leaves" : [10, 20, 31]
}
gs_lgbm = GridSearchCV(lgbm_t, lgbm_grid_t, cv=kf)
gs_lgbm.fit(X_train, y_train)
print_grid_search_eval(X_val, y_val, gs_lgbm)

pl_xgbr_t = Pipeline([
    ("scaler", None),
    ("xgbr", XGBRegressor(objective='reg:squarederror', reg_lambda=1.0))
])
xgbr_grid_t = {
    "scaler" : [None, StandardScaler()],
    "xgbr__n_estimators" : [50],
    "xgbr__reg_alpha" : [0.1,0.2]
}
gs_xgbr = GridSearchCV(pl_xgbr_t, xgbr_grid_t, cv=kf)
gs_xgbr.fit(X_train, y_train)
print_grid_search_eval(X_val, y_val, gs_xgbr)

catbm_t = CatBoostRegressor();
catbm_grid_t = {
    "n_estimators" : [100, 500, 1000]
}
gs_catbm = GridSearchCV(catbm_t, catbm_grid_t, cv=kf)
gs_catbm.fit(X_train, y_train)
print_grid_search_eval(X_val, y_val, gs_catbm)