In [1]:
!pip install -e ../..

Obtaining file:///home/egor/Documents/FCApy
Installing collected packages: fcapy
  Attempting uninstall: fcapy
    Found existing installation: fcapy 0.1.2
    Uninstalling fcapy-0.1.2:
      Successfully uninstalled fcapy-0.1.2
  Running setup.py develop for fcapy
Successfully installed fcapy


In [2]:
import pandas as pd

# Load the datasets

In [3]:
# Registry of benchmark datasets keyed by a short name. Each entry added by the
# loading cells is a dict with the keys 'ds' (DataFrame holding features and
# target), 'train_feats', 'cat_feats' and 'y_feat'.
data_dict = {}

## California housing

In [4]:
from sklearn.datasets import fetch_california_housing

In [5]:
# Load California housing as pandas objects so features and target share one frame.
data = fetch_california_housing(as_frame=True)

# The target column keeps the name reported by the loader.
y_feat = data['target_names'][0]
ds = data['data']
ds[y_feat] = data['target']  # adds the target as an extra column of the feature frame

data_dict['calhouse'] = dict(
    ds=ds,
    train_feats=data['feature_names'],
    cat_feats=[],  # no categorical features are declared for this dataset
    y_feat=y_feat,
)

## Boston

In [6]:
from sklearn.datasets import load_boston

In [7]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) only runs on older scikit-learn versions.
data = load_boston()

# The loader result is used as a mapping with 'data', 'target' and 'feature_names';
# the DataFrame is assembled by hand and the target is stored under the name 'price'.
y_feat = 'price'
fs = data['feature_names']
ds = pd.DataFrame(data['data'], columns=fs)
ds[y_feat] =data['target']

# Register the dataset; no categorical features are declared.
data_dict['boston'] = {
    'ds': ds,
    'train_feats': fs,
    'cat_feats': [],
    'y_feat': y_feat
}

## Diabetes

In [8]:
from sklearn.datasets import load_diabetes

In [9]:
# Load the diabetes dataset as pandas objects.
data = load_diabetes(as_frame=True)

# The target is stored next to the features under the column name 'disease'.
y_feat = 'disease'
ds = data['data']
ds[y_feat] = data['target']  # adds the target as an extra column of the feature frame

data_dict['diabetes'] = dict(
    ds=ds,
    train_feats=data['feature_names'],
    cat_feats=[],  # no categorical features are declared for this dataset
    y_feat=y_feat,
)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical features of every registered dataset.
# Each categorical column `c` is cast to str and gets an integer-coded
# companion column named `c + '_le'`.
for data_name, entry in data_dict.items():
    ds, cat_feats = entry['ds'], entry['cat_feats']
    ds[cat_feats] = ds[cat_feats].astype(str)
    for cat_feat in cat_feats:
        ds[cat_feat+'_le'] = LabelEncoder().fit_transform(ds[cat_feat])
    entry['ds'] = ds

# Test models

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from datetime import datetime

from tqdm import tqdm

In [12]:
import numpy as np
from fcapy.mvcontext.mvcontext import MVContext
from fcapy.mvcontext import pattern_structure as PS
from fcapy.ml.decision_lattice import DecisionLatticeRegressor

from collections.abc import Iterable

In [13]:
import inspect

In [14]:
def form_output(model, params, dt, preds_train, preds_test):
    """Bundle the results of one model fit into a dict.

    The returned dict has the keys 'model', 'params', 'dt', 'preds_train'
    and 'preds_test', each holding the argument of the same name.
    """
    return dict(
        model=model,
        params=params,
        dt=dt,
        preds_train=preds_train,
        preds_test=preds_test,
    )

In [15]:
def _select_init_params(model_class, all_params):
    """Keep only the entries of `all_params` that `model_class.__init__` names as parameters."""
    accepted = inspect.signature(model_class.__init__).parameters
    return {k: v for k, v in all_params.items() if k in accepted}


def _fit_and_predict(model_class, params, X_train, y_train, X_test):
    """Instantiate `model_class(**params)`, time its fit and predict on both splits.

    Only the call to `fit` is timed. The result is packed with `form_output`,
    i.e. a dict with the keys 'model', 'params', 'dt' (fit time in seconds),
    'preds_train' and 'preds_test'.
    """
    model = model_class(**params)
    t1 = datetime.now()
    model.fit(X_train, y_train)
    t2 = datetime.now()
    dt = (t2 - t1).total_seconds()

    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)

    return form_output(model, params, dt, preds_train, preds_test)


def fit_decision_tree(X_train, y_train, X_test, all_params):
    """Fit a DecisionTreeRegressor using the entries of `all_params` its constructor accepts."""
    params = _select_init_params(DecisionTreeRegressor, all_params)
    return _fit_and_predict(DecisionTreeRegressor, params, X_train, y_train, X_test)


def fit_random_forest(X_train, y_train, X_test, all_params):
    """Fit a RandomForestRegressor using the entries of `all_params` its constructor accepts."""
    params = _select_init_params(RandomForestRegressor, all_params)
    return _fit_and_predict(RandomForestRegressor, params, X_train, y_train, X_test)


def fit_gradient_boosting(X_train, y_train, X_test, all_params):
    """Fit a GradientBoostingRegressor using the entries of `all_params` its constructor accepts."""
    params = _select_init_params(GradientBoostingRegressor, all_params)
    return _fit_and_predict(GradientBoostingRegressor, params, X_train, y_train, X_test)


def fit_xgboost(X_train, y_train, X_test, all_params):
    """Fit an XGBRegressor using a fixed whitelist of hyper-parameters.

    The whitelist replaces signature introspection (presumably because
    XGBRegressor's constructor does not enumerate its hyper-parameters --
    TODO confirm). 'random_state' is part of the whitelist so that the seed
    configured for every model is passed on and recorded for XGBoost too.
    """
    accepted = {'n_estimators', 'max_depth', 'random_state'}
    params = {k: v for k, v in all_params.items() if k in accepted}
    return _fit_and_predict(XGBRegressor, params, X_train, y_train, X_test)


def fit_model(X_train, y_train, X_test, all_params, model_class):
    """Fit `model_class` on the training data and predict on both splits.

    Parameters
    ----------
    X_train, y_train : training features and targets
    X_test : test features
    all_params : dict of candidate constructor arguments; entries the model
        does not accept are dropped
    model_class : one of DecisionTreeRegressor, RandomForestRegressor,
        GradientBoostingRegressor, XGBRegressor

    Returns
    -------
    dict built by `form_output` (keys 'model', 'params', 'dt', 'preds_train',
    'preds_test').

    Raises
    ------
    ValueError
        If `model_class` is none of the supported classes.
    """
    if model_class == DecisionTreeRegressor:
        return fit_decision_tree(X_train, y_train, X_test, all_params)
    if model_class == RandomForestRegressor:
        return fit_random_forest(X_train, y_train, X_test, all_params)
    if model_class == GradientBoostingRegressor:
        return fit_gradient_boosting(X_train, y_train, X_test, all_params)
    if model_class == XGBRegressor:
        return fit_xgboost(X_train, y_train, X_test, all_params)
    raise ValueError(f"Unsupported model class: {model_class!r}")

In [16]:
def _convert_to_decision_lattice(K_train, K_test, model, converter):
    """Build a decision lattice from a fitted `model` and predict on both contexts.

    `converter` is called as `converter(model, K_train)` and only this call is
    timed. The result is packed with `form_output` using an empty 'params' dict.
    """
    t1 = datetime.now()
    dl_model = converter(model, K_train)
    t2 = datetime.now()
    dt = (t2 - t1).total_seconds()

    preds_train = dl_model.predict(K_train)
    preds_test = dl_model.predict(K_test)

    return form_output(dl_model, {}, dt, preds_train, preds_test)


def fit_dl_dt(K_train, K_test, model):
    """Convert a fitted decision tree via DecisionLatticeRegressor.from_decision_tree."""
    return _convert_to_decision_lattice(
        K_train, K_test, model, DecisionLatticeRegressor.from_decision_tree)


def fit_dl_rf(K_train, K_test, model):
    """Convert a fitted random forest via DecisionLatticeRegressor.from_random_forest."""
    return _convert_to_decision_lattice(
        K_train, K_test, model, DecisionLatticeRegressor.from_random_forest)


def fit_dl_gb(K_train, K_test, model):
    """Convert a fitted gradient boosting model via DecisionLatticeRegressor.from_gradient_boosting."""
    return _convert_to_decision_lattice(
        K_train, K_test, model, DecisionLatticeRegressor.from_gradient_boosting)


def fit_dl_xgb(K_train, K_test, model):
    """Convert a fitted XGBoost model via DecisionLatticeRegressor.from_xgboost."""
    return _convert_to_decision_lattice(
        K_train, K_test, model, DecisionLatticeRegressor.from_xgboost)


def fit_dl_from_model(K_train, K_test, model):
    """Convert a fitted tree-based `model` into a decision lattice and predict.

    The converter is chosen from the type of `model` (DecisionTreeRegressor,
    RandomForestRegressor, GradientBoostingRegressor or XGBRegressor, checked
    in that order).

    Returns
    -------
    dict built by `form_output`: the lattice model under 'model', an empty
    'params' dict, the conversion time in seconds under 'dt', and the
    predictions for `K_train` / `K_test`.

    Raises
    ------
    ValueError
        If `model` is not an instance of a supported class.
    """
    if isinstance(model, DecisionTreeRegressor):
        return fit_dl_dt(K_train, K_test, model)
    if isinstance(model, RandomForestRegressor):
        return fit_dl_rf(K_train, K_test, model)
    if isinstance(model, GradientBoostingRegressor):
        return fit_dl_gb(K_train, K_test, model)
    if isinstance(model, XGBRegressor):
        return fit_dl_xgb(K_train, K_test, model)
    raise ValueError(f"Unsupported model type: {type(model).__name__}")

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
def weighted_average_percentage_error(y_true, y_pred):
    """Return the mean of |(y_true - y_pred) / y_true|, expressed in percent.

    NOTE(review): despite the name, this is a per-sample mean absolute
    percentage error, not sum|error| / sum|y_true|. Targets equal to zero
    lead to a division by zero.
    """
    return np.abs((y_true-y_pred)/y_true).mean()*100


def root_mean_squared_percentage_error(y_true, y_pred):
    """Return the RMSE divided by the mean of `y_true` (a ratio, not multiplied by 100).

    The root is taken explicitly instead of passing `squared=False` to
    `mean_squared_error`, because that argument was deprecated and later
    removed in recent scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))/y_true.mean()

In [19]:
def calc_metrics(y_train, y_test, preds_train, preds_test):
    """Evaluate every metric on the train and the test split.

    Returns a dict keyed "<metric>_<split>" where <metric> is one of
    'mse', 'mae', 'r2', 'wape', 'rmse_perc' and <split> is 'train' or 'test'.
    Each metric function is called as `metric(targets, predictions)`.
    """
    scorers = {
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
        'r2': r2_score,
        'wape': weighted_average_percentage_error,
        'rmse_perc': root_mean_squared_percentage_error,
    }
    splits = (
        ('train', y_train, preds_train),
        ('test', y_test, preds_test),
    )
    return {
        f"{metric}_{split}": scorer(targets, preds)
        for metric, scorer in scorers.items()
        for split, targets, preds in splits
    }

In [20]:
def create_stat_dict(cls, data_name, kf_idx, train_idxs, test_idxs, params, metrics_dict, dt):
    """Flatten the description of one model run into a single dict.

    The result holds the model class name, dataset name, fold id, split sizes
    and time, followed by every entry of `params` under "param_<name>" and
    every entry of `metrics_dict` under "metric_<name>".
    """
    base = dict(
        model=cls.__name__,
        ds=data_name,
        fold_id=kf_idx,
        train_size=len(train_idxs),
        test_size=len(test_idxs),
        time=dt,
    )
    param_cols = {f"param_{name}": value for name, value in params.items()}
    metric_cols = {f"metric_{name}": value for name, value in metrics_dict.items()}
    return {**base, **param_cols, **metric_cols}

In [21]:
def form_stat_dict(ens_model_output, dl_model_output, ens_model_metrics, dl_model_metrics, dl_ens_metrics, data_name, kf_idx):
    """Flatten one ensemble-vs-decision-lattice comparison into a single dict.

    General fields come first (ensemble class name, dataset, fold id, split
    sizes taken from the ensemble predictions, both timings). They are followed
    by the ensemble parameters ("param_*") and the three metric groups
    ("ensemble_metric_*", "dl_metric_*", "dl_ens_metric_*").
    """
    stat = {
        'ensemble_model': ens_model_output['model'].__class__.__name__,
        'ds': data_name,
        'fold_id': kf_idx,
        'train_size': len(ens_model_output['preds_train']),
        'test_size': len(ens_model_output['preds_test']),
        'ensemble_time': ens_model_output['dt'],
        'dl_time': dl_model_output['dt'],
    }

    # Each group is appended with its own key prefix, in this order.
    prefixed_groups = (
        ('param_', ens_model_output['params']),
        ('ensemble_metric_', ens_model_metrics),
        ('dl_metric_', dl_model_metrics),
        ('dl_ens_metric_', dl_ens_metrics),
    )
    for prefix, values in prefixed_groups:
        stat.update({f"{prefix}{k}": v for k, v in values.items()})

    return stat

In [22]:
# Seed handed to every model configuration below.
random_state = 42

# (estimator class, constructor arguments) pairs to evaluate; the shared seed
# is placed first and the model-specific arguments are merged on top of it.
model_params_vars = [
    (cls, {'random_state': random_state, **params})
    for cls, params in (
        (DecisionTreeRegressor, {'max_depth': 10}),
        (RandomForestRegressor, {'n_estimators': 10, 'max_depth': 6}),
        (GradientBoostingRegressor, {'n_estimators': 10, 'max_depth': 6}),
        (XGBRegressor, {'n_estimators': 10, 'max_depth': 6}),
    )
]

In [23]:
%%time

kf = KFold(n_splits=5)
stat_ds = []

for data_name, data in tqdm(data_dict.items(), desc='iterate datasets'):
    ds, train_feats, cat_feats, y_feat = data['ds'], data['train_feats'], data['cat_feats'], data['y_feat']
    train_feats_le = [f+('_le' if f in cat_feats else '') for f in train_feats]
    
    pattern_types = {f: PS.IntervalPS for f in train_feats_le}

    for kf_idx, idxs in tqdm(enumerate(kf.split(ds[train_feats])), desc='KFold', total=kf.n_splits, leave=False):
        train_idxs, test_idxs = idxs
        ds_train, ds_test = ds.loc[train_idxs], ds.loc[test_idxs]
        
        #if kf_idx==1:
        #    break
            
        X_train, X_test = ds_train[train_feats_le], ds_test[train_feats_le]
        y_train, y_test = ds_train[y_feat], ds_test[y_feat]
        
        K_train = MVContext(X_train.values, pattern_types, attribute_names=train_feats_le)
        K_test = MVContext(X_test.values, pattern_types, attribute_names=train_feats_le)
        
        for cls, params in tqdm(model_params_vars, leave=False, desc='iterate models'):
            ens_model_output = fit_model(X_train, y_train, X_test, params, cls)    
            
            dl_model_output = fit_dl_from_model(K_train, K_test, ens_model_output['model'])
            
            ens_metrics = calc_metrics(y_train, y_test, ens_model_output['preds_train'], ens_model_output['preds_test'])
            dl_metrics = calc_metrics(y_train, y_test, dl_model_output['preds_train'], dl_model_output['preds_test'])
            
            dl_ens_metrics = calc_metrics(ens_model_output['preds_train'], ens_model_output['preds_test'], 
                                          dl_model_output['preds_train'], dl_model_output['preds_test'])
            
            
            stat = form_stat_dict(ens_model_output, dl_model_output, ens_metrics, dl_metrics, dl_ens_metrics, data_name, kf_idx)
            stat_ds.append(pd.Series(stat))
            
                                    
                    
        pd.concat(stat_ds,1, sort=False).T.to_csv('tmp_evaluation_regr.csv')
pd.concat(stat_ds,1, sort=False).T.to_csv('evaluation_regr_full_10est6depth.csv')
!rm tmp_evaluation_regr.csv

iterate datasets:   0%|          | 0/3 [00:00<?, ?it/s]
KFold:   0%|          | 0/5 [00:00<?, ?it/s][A

iterate models:   0%|          | 0/4 [00:00<?, ?it/s][A[A

iterate models:  25%|██▌       | 1/4 [00:25<01:16, 25.35s/it][A[A

iterate models:  50%|█████     | 2/4 [03:39<02:31, 75.94s/it][A[A

iterate models:  75%|███████▌  | 3/4 [07:04<01:54, 114.64s/it][A[A

iterate models: 100%|██████████| 4/4 [10:43<00:00, 146.12s/it][A[A

                                                              [A[A
KFold:  20%|██        | 1/5 [10:44<42:56, 644.20s/it][A

iterate models:   0%|          | 0/4 [00:00<?, ?it/s][A[A

iterate models:  25%|██▌       | 1/4 [00:30<01:32, 30.99s/it][A[A

iterate models:  50%|█████     | 2/4 [04:16<02:58, 89.28s/it][A[A

iterate models:  75%|███████▌  | 3/4 [08:19<02:15, 135.39s/it][A[A

iterate models: 100%|██████████| 4/4 [12:04<00:00, 162.49s/it][A[A

                                                              [A[A
KFold:  40%|████     

CPU times: user 57min 24s, sys: 1min 2s, total: 58min 26s
Wall time: 57min 8s



