In [1]:
import pandas as pd
import numpy as np
import os 

import wandb
os.environ["WANDB_SILENT"] = "true"

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB
from catboost import CatBoostRegressor as CB

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_log_error, mean_squared_error

import functools

# Root-mean-squared metric helpers, called as rmse(y_true, y_pred) / rmsle(y_true, y_pred).
# scikit-learn 1.4 added dedicated root_mean_squared_* functions and deprecated the
# `squared=` argument (removed in 1.6), so prefer the dedicated functions when present
# and fall back to the `squared=False` partials on older releases.
try:
    from sklearn.metrics import root_mean_squared_error as rmse
    from sklearn.metrics import root_mean_squared_log_error as rmsle
except ImportError:
    rmse = functools.partial(mean_squared_error, squared=False)
    rmsle = functools.partial(mean_squared_log_error, squared=False)


# Single seed shared by numpy, the CV splitter and the model parameter grids.
SEED=42

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
# Silence every warning category for the whole notebook session.
warnings.filterwarnings('ignore')


# ---- REPRODUCIBILITY ------------------------------------------------
np.random.seed(SEED)

# ---- PANDAS ---------------------------------------------------------
# Show all columns and render floats with 4 decimals and thousands separators.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format


# Project helper from src.styles; presumably applies the plotting styles -- TODO confirm.
set_styles()

In [2]:
class CFG:
    """Notebook-wide settings: input files, target column, W&B project and CV folds."""

    # Weights & Biases project that receives the sweeps
    project = 'PGs04e04'

    # input CSV files
    path_original = 'data/abalone.csv'
    path_test = 'data/test.csv'
    path_train = 'data/train.csv'

    # modelling
    target = 'Rings'
    num_folds = 5

In [3]:
# Integer encoding applied to the categorical `Sex` column of every frame.
mapper_sex = dict(I=0, F=1, M=2)

# Training data: drop the `id` column and encode `Sex`.
df_train = (
    pd.read_csv(CFG.path_train)
      .drop(columns='id')
      .assign(Sex=lambda frame: frame['Sex'].map(mapper_sex))
)

# df_test = pd.read_csv(CFG.path_test).drop('id', axis=1)
# df_test['Sex'] = df_test['Sex'].map(mapper_sex)

# Original dataset: encode `Sex` and rename two weight columns
# (presumably so they line up with the training column names -- verify).
df_original = (
    pd.read_csv(CFG.path_original)
      .assign(Sex=lambda frame: frame['Sex'].map(mapper_sex))
      .rename(columns={'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'})
)

In [4]:
def get_lgb_params_grid():
    """Return the W&B sweep parameter space for the LightGBM regressor.

    Each entry follows the W&B sweep convention: ``min``/``max`` describes a
    searched range, ``values`` a discrete set of choices and ``value`` a
    constant that is passed through unchanged.

    Returns:
        dict: parameter name -> sweep specification.
    """
    return {
        'max_depth':         {'min': 3,    'max': 10},
        'n_estimators':      {'min': 100,  'max': 1000},
        'learning_rate':     {'min': 0.01, 'max': 0.2},
        'reg_alpha':         {'min':0.0,   'max': 100.},
        'reg_lambda':        {'min':0.0,   'max': 100.},
        'subsample':         {'values': np.arange(0.5, 1.01, 0.1).tolist()},
        # LightGBM only performs row subsampling when the bagging frequency is
        # non-zero; without this constant the swept `subsample` has no effect.
        'subsample_freq':    {'value': 1},
        'boosting':          {'value': 'gbdt'},
        'random_state':      {'value': SEED},
        'objective':         {'value': 'regression'},
        'device':            {'value': 'gpu'},
     }


# def get_lgb_params_grid():
#     return {
#         'num_leaves':        {'min': 5,    'max': 500},
#         'n_estimators':      {'min': 100,  'max': 1000},
#         'min_child_samples': {'min': 10,   'max': 300},
#         'learning_rate':     {'min': 0.01, 'max': 0.2},
#         'reg_alpha':         {'min':0.0,   'max': 100.},
#         'reg_lambda':        {'min':0.0,   'max': 100.},
#         'subsample':         {'values': np.arange(0.5, 1.01, 0.1).tolist()},
#         'boosting':          {'value': 'gbdt'},
#         'random_state':      {'value': SEED},
#         'objective':         {'value': 'regression'},
#         'device':            {'value': 'gpu'},
#      }


def get_catboost_params_grid():
    """Build the W&B sweep parameter space for the CatBoost regressor.

    Searched ranges use ``min``/``max``, discrete choices use ``values`` and
    constants use ``value``.

    Returns:
        dict: parameter name -> sweep specification.
    """
    subsample_choices = np.arange(0.5, 1.01, 0.1).tolist()

    # hyperparameters explored by the sweep
    searched = {
        'depth':             dict(min=3, max=10),
        'iterations':        dict(min=100, max=1000),
        'min_data_in_leaf':  dict(min=10, max=300),
        'learning_rate':     dict(min=0.01, max=0.2),
        'l2_leaf_reg':       dict(min=0.0, max=100.),
        'subsample':         dict(values=subsample_choices),
    }
    # settings held constant for every run
    fixed = {
        'random_state':      dict(value=SEED),
        'loss_function':     dict(value='RMSE'),
        'verbose':           dict(value=0),
    }
    return {**searched, **fixed}


def get_xgb_params_grid(use_log=False):
    """Build the W&B sweep parameter space for the XGBoost regressor.

    Args:
        use_log: when true the objective is ``reg:squaredlogerror``,
            otherwise ``reg:squarederror``.

    Returns:
        dict: parameter name -> sweep specification.
    """
    if use_log:
        objective_name = 'reg:squaredlogerror'
    else:
        objective_name = 'reg:squarederror'

    subsample_choices = np.arange(0.5, 1.01, 0.1).tolist()

    # hyperparameters explored by the sweep
    searched = {
        'max_depth':         dict(min=3, max=10),
        'n_estimators':      dict(min=100, max=1000),
        'learning_rate':     dict(min=0.01, max=0.2),
        'alpha':             dict(min=0.0, max=100.),
        'lambda':            dict(min=0.0, max=100.),
        'subsample':         dict(values=subsample_choices),
    }
    # settings held constant for every run
    fixed = {
        'booster':           dict(value='gbtree'),
        'random_state':      dict(value=SEED),
        'objective':         dict(value=objective_name),
        'tree_method':       dict(value='gpu_hist'),
    }
    return {**searched, **fixed}


def get_config(name, param_grid):
    """Wrap a parameter grid into a Bayesian W&B sweep configuration.

    Args:
        name: experiment label; the sweep is named ``sweep_<name>``.
        param_grid: parameter specification placed under ``parameters``.

    Returns:
        dict: sweep configuration that minimizes the ``score_mean`` metric.
    """
    sweep_config = dict(name=f'sweep_{name}', method='bayes')
    sweep_config['metric'] = dict(goal='minimize', name='score_mean')
    sweep_config['parameters'] = param_grid
    return sweep_config


def score_cv(X, cv, model, df_original=None):
    """Cross-validate `model` and return the RMSLE of every fold.

    Args:
        X: DataFrame holding the features together with the `CFG.target` column.
        cv: splitter exposing ``split(X, y)``; folds are built against the target.
        model: estimator with ``fit``/``predict``; it is re-fitted on every fold.
        df_original: optional extra DataFrame appended to the training part of
            every fold (never to the validation part).

    Returns:
        np.ndarray: one RMSLE score per fold, in split order.
    """
    scores = []
    for idx_train, idx_val in cv.split(X, X[CFG.target]):
        # cv.split yields positional indices, so select with iloc; label-based
        # .loc is only equivalent when X carries a default RangeIndex.
        train = X.iloc[idx_train]
        if df_original is not None:
            train = pd.concat([train, df_original], axis=0)
        X_train = train.drop(columns=CFG.target)
        Y_train = train[CFG.target]

        val = X.iloc[idx_val]
        X_val = val.drop(columns=CFG.target)
        Y_val = val[CFG.target]

        model.fit(X_train, Y_train)
        # Predictions are limited to [1, 29] before scoring
        # (presumably the admissible range of the target -- TODO confirm).
        preds = model.predict(X_val).clip(1., 29.)
        scores.append(rmsle(Y_val, preds))

    return np.array(scores)


def objective_catboost():
    """Sweep objective: cross-validate CatBoost on a log1p-transformed target.

    Reads the hyperparameters from `wandb.config`, trains with `df_original`
    appended to every training fold, and logs the per-fold scores and their
    mean to the current W&B run. Uses the notebook globals `X`, `cv` and
    `df_original`.
    """
    wandb.init(project=CFG.project)

    regressor = CB(**wandb.config)
    model = TransformedTargetRegressor(regressor, func=np.log1p, inverse_func=np.expm1)
    fold_scores = score_cv(X, cv, model, df_original)

    # per-fold scores and the mean are logged in two separate calls
    per_fold = {}
    for i, sc in enumerate(fold_scores):
        per_fold[f'score_{i}'] = sc
    wandb.log(per_fold)
    wandb.log({'score_mean': fold_scores.mean()})



def objective_lgb():
    """Sweep objective: cross-validate LightGBM on a log1p-transformed target.

    Reads the hyperparameters from `wandb.config`, derives `num_leaves` from
    the sampled `max_depth`, trains with `df_original` appended to every
    training fold, and logs the per-fold scores and their mean to the current
    W&B run. Uses the notebook globals `X`, `cv` and `df_original`.
    """
    wandb.init(project=CFG.project)

    # one leaf fewer than a full binary tree of the sampled depth
    num_leaves = 2**wandb.config['max_depth'] - 1
    regressor = LGB(**wandb.config, num_leaves=num_leaves)
    model = TransformedTargetRegressor(regressor, func=np.log1p, inverse_func=np.expm1)
    fold_scores = score_cv(X, cv, model, df_original)

    # per-fold scores and the mean are logged in two separate calls
    per_fold = {}
    for i, sc in enumerate(fold_scores):
        per_fold[f'score_{i}'] = sc
    wandb.log(per_fold)
    wandb.log({'score_mean': fold_scores.mean()})


def objective_xgb():
    """Sweep objective: cross-validate XGBoost on a log1p-transformed target.

    Reads the hyperparameters from `wandb.config`, trains on the folds of the
    notebook global `X` only (no extra data is appended), and logs the
    per-fold scores and their mean to the current W&B run.
    """
    wandb.init(project=CFG.project)

    regressor = XGB(**wandb.config)
    model = TransformedTargetRegressor(regressor, func=np.log1p, inverse_func=np.expm1)
    fold_scores = score_cv(X, cv, model)

    # per-fold scores and the mean are logged in two separate calls
    per_fold = {}
    for i, sc in enumerate(fold_scores):
        per_fold[f'score_{i}'] = sc
    wandb.log(per_fold)
    wandb.log({'score_mean': fold_scores.mean()})



def objective_xgb_log():
    """Sweep objective: cross-validate XGBoost directly on the raw target.

    Unlike the other objectives no target transform is applied; the model is
    built straight from `wandb.config`. Per-fold scores and their mean are
    logged to the current W&B run. Uses the notebook globals `X` and `cv`.
    """
    wandb.init(project=CFG.project)

    fold_scores = score_cv(X, cv, XGB(**wandb.config))

    # per-fold scores and the mean are logged in two separate calls
    per_fold = {}
    for i, sc in enumerate(fold_scores):
        per_fold[f'score_{i}'] = sc
    wandb.log(per_fold)
    wandb.log({'score_mean': fold_scores.mean()})

In [5]:
# Take the W&B API key from the environment rather than embedding it in the notebook,
# so the secret never ends up in a saved or shared copy of this file.
key = os.environ.get('WANDB_API_KEY', '')
# An empty key is passed as None so that wandb uses its own credential lookup.
wandb.login(key=key or None)

True

In [6]:
# Number of runs each sweep agent executes.
N = 200
# One entry per sweep to launch: (label, sweep config, objective function, run count).
# Commented-out entries are alternative experiments that are currently disabled.
experiments = (
               # ('catboost_with_orig',     get_config('catboost_with_orig', get_catboost_params_grid()),           objective_catboost,   N),
               ('lgb_with_orig',          get_config('lgb_with_orig',      get_lgb_params_grid()),                objective_lgb,        N),
            #    ('xgb',     get_config('xgb',      get_xgb_params_grid(use_log=False)),   objective_xgb,      N),
            #    ('xgb_log', get_config('xgb_log',  get_xgb_params_grid(use_log=True)),    objective_xgb_log,  N),
               )

In [7]:
# Globals read by the objective_* functions while the sweep agent runs them.
X = df_train
cv = StratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=SEED)

# Launch one sweep per configured experiment.
for label, config, objective, count in experiments:

    print(f'{TXT_ACC} {label} {TXT_RESET}')

    try:
        sweep_id = wandb.sweep(sweep=config, project=CFG.project)
        wandb.agent(sweep_id, function=objective, count=count)
    except Exception as exc:
        # Stay best-effort (move on to the next experiment) but report what failed;
        # catching Exception instead of everything lets Ctrl-C interrupt the sweep.
        print(f'Something went wrong in {label}: {exc!r}')
    finally:
        wandb.finish()

# Load sweep results

In [8]:
def get_sweep_results(wandb_api, sweep_id, entity='nasonova-alexandra'):
    """Collect the configuration and per-fold scores of every run in a sweep.

    Args:
        wandb_api: a `wandb.Api` instance.
        sweep_id: id of the sweep inside the `CFG.project` project.
        entity: W&B user or team that owns the project.

    Returns:
        pd.DataFrame: one row per run that logged all fold scores; the run's
        config values come first, followed by the ``score_<fold>`` columns.
    """
    print(sweep_id)
    sw = wandb_api.sweep(f"{entity}/{CFG.project}/{sweep_id}")

    columns_metrics = [f'score_{i}' for i in range(CFG.num_folds)]

    configs, scores = [], []
    for run in sw.runs:
        history = run.history()
        # Runs that crashed or are still running have no fold scores logged;
        # skip them instead of failing the whole download with a KeyError.
        if not set(columns_metrics).issubset(history.columns):
            print(f'Skipping run {run.id}: fold scores were not logged')
            continue
        logged = history[columns_metrics].dropna()
        if logged.empty:
            print(f'Skipping run {run.id}: fold scores were not logged')
            continue
        configs.append(run.config)
        # the first history row that carries a complete set of fold scores
        scores.append(logged.values[0])

    df_configs = pd.DataFrame(configs)
    df_scores = pd.DataFrame(scores, columns=columns_metrics)

    return pd.concat([df_configs, df_scores], axis=1)

In [9]:
wandb_api = wandb.Api()

# (output file stem, W&B sweep id) for every finished sweep to download
sweeps = (
    ('sweep_catboost_with_original', 'hrkn47d1'),
    ('sweep_lgb_with_original', '24wrt6fu'),
)

# Fetch each sweep's runs and store them as <file stem>.csv
for path_save, sweep_id in sweeps:
    df = get_sweep_results(wandb_api, sweep_id)
    df.to_csv(f'{path_save}.csv', index=False)

# Get best parameters for each fold

In [17]:
# Fold score columns follow the configured number of folds instead of a literal 5.
columns_metrics = [f'score_{i}' for i in range(CFG.num_folds)]
# Every score column, i.e. everything in a row that is not a hyperparameter.
columns_scores = columns_metrics + ['score_mean']

for label, _ in sweeps:
    df = pd.read_csv(label+'.csv')
    df['score_mean'] = df[columns_metrics].mean(axis=1)
    best_vals = []
    for fold in range(CFG.num_folds):
        print(f'{TXT_ACC} {label}   fold {fold} {TXT_RESET}')
        # run with the lowest score on this fold
        best_row = df.sort_values(f'score_{fold}').iloc[0]
        print(best_row)
        # keep only the hyperparameters: drop the score columns by name rather
        # than by a fixed positional offset
        best_vals.append(best_row.drop(columns_scores).to_dict())

    print(f'{TXT_ACC} {label} {TXT_RESET}')
    print(best_vals)

[1m[38;5;254m[48;5;240m sweep_catboost_with_original   fold 0 [0m
depth                    7
verbose                  0
subsample           0.8000
iterations             974
l2_leaf_reg        20.6278
random_state            42
learning_rate       0.1444
loss_function         RMSE
min_data_in_leaf       103
score_0             0.1482
score_1             0.1492
score_2             0.1495
score_3             0.1492
score_4             0.1474
score_mean          0.1487
Name: 51, dtype: object
[1m[38;5;254m[48;5;240m sweep_catboost_with_original   fold 1 [0m
depth                    7
verbose                  0
subsample           0.5000
iterations             994
l2_leaf_reg        78.0573
random_state            42
learning_rate       0.1892
loss_function         RMSE
min_data_in_leaf        24
score_0             0.1483
score_1             0.1489
score_2             0.1494
score_3             0.1494
score_4             0.1475
score_mean          0.1487
Name: 151, dtype: object
