### ***Main info*** ###

***Project description:*** https://www.kaggle.com/competitions/bike-sharing-demand/overview/description

***Project goal:*** combine historical usage patterns with weather data in order to forecast bike rental demand in the Capital Bikeshare program in Washington, D.C.

***Suggested evaluation metric:*** Root Mean Squared Logarithmic Error (RMSLE)

***Other used evalutaion metrics*** Mean Absolute Error(MAE), Mean Squared Error(MSE), Root Mean Squared Error(RMSE), R Squared (R2)

### ***0. Project preparation*** ###

---

In [None]:
# main upgrades
!pip install --upgrade neptune-client
!pip install --upgrade neptune

In [2]:
# main imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn import set_config
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import catboost as ctb
import lightgbm as lgbm
import xgboost as xgb

import eli5
import time
import warnings
import json
from tqdm import tqdm
import neptune
from itertools import product
from collections import Counter
import re
import pickle

# minor settings
plt.style.use('ggplot')
%matplotlib inline
set_config(display='diagram')
warnings.filterwarnings('ignore', category=DeprecationWarning)

# global variables
PAD = 20
RANDOM_STATE = 0

In [3]:
# version check
def show_version(module_object: object, n: int = PAD) -> str:
    '''
    Check version of different libraries
    '''
    module_name = getattr(module_object, '__name__')
    module_ver = getattr(module_object, '__version__')
    dots = '.' * (n - len(module_name))
    
    print (f'{module_name}{dots}{module_ver}')


!python --version
module_list = [np, pd, sklearn, eli5, neptune]
for module in module_list:
    show_version(module)

Python 3.7.5
numpy...............1.20.0
pandas..............1.2.4
sklearn.............0.24.2
eli5................0.11.0
neptune.............0.15.2


In [4]:
# run neptune server
def init_neptune():
    '''
    Initialize neptune project
    '''
    with open('neptune_credentials') as f:
        neptune_credentials = json.load(f)

    run = neptune.init(
        api_token = neptune_credentials['API_TOKEN'],
        project_qualified_name = neptune_credentials['PROJECT']
    )
    return run

### ***1. Load data*** ###

---

In [5]:
df_train = pd.read_csv('inputs/train.csv')

In [6]:
weather_dict = {1: ['Clear', 'Few clouds', 'Partly cloudy'],
2: ['Mist + Cloudy', 'Mist + Broken clouds', 'Mist + Few clouds', 'Mist'],
3: ['Light Snow', 'Light Rain + Thunderstorm + Scattered clouds', 'Light Rain + Scattered clouds'],
4: ['Heavy Rain + Ice Pallets + Thunderstorm + Mist', 'Snow + Fog']}


### ***2. Custom data classes*** ###

---

In [7]:
class DataTransformer():
    '''
    Change the initial dataset into a new one based
    on passed function
    '''
    # copy parameter introduced to prevent SettingwithCopyWarning
    # https://www.dataquest.io/blog/settingwithcopywarning/
    def __init__(self, func, copy = True, **kwargs):
        self.func = func
        self.copy = copy

    def transform(self, input_df, **transform_params):
        input_df_ = input_df if not self.copy else input_df.copy()
        return self.func(input_df_,)

    def fit(self, X, y=None, **fit_params):
        return self

In [8]:
class ColumnSelector():
    '''
    Return a dataframe with predefined columns only
    '''

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
class ColumnDroper():
    '''
    Return a dataframe without selected columns
    '''
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 

### ***3. Custom feature functions*** ###

---

#### ***3.1. Add or change features*** ####

In [10]:
def make_cols_from_datetime(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Convert a column to datetime format and create new columns: 
    'year', 'month', 'day', 'hour'
    
    Arguments:
        dataset: pandas DataFrame 
    
    Returns:
        dataset: transformed pandas DataFrame
    '''

    # convert string to datetime type
    dataset['datetime'] = pd.to_datetime(dataset['datetime'])

    # make new columns from datetime column
    dataset['year'] = dataset['datetime'].dt.year
    dataset['month'] = dataset['datetime'].dt.month
    dataset['day'] = dataset['datetime'].dt.day
    dataset['hour'] = dataset['datetime'].dt.hour
    dataset['dayofweek'] = dataset['datetime'].dt.dayofweek
    dataset['weekend'] = dataset['dayofweek'].map(lambda x: int(x in [6,7]))
    return dataset

In [11]:
def seasons_change(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Set proper season duration and change their representation according
    to dataset legend

    Argument:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''

    changes = [
    ('2011-01-01', '2011-03-19', 4),
    ('2011-03-20', '2011-06-20', 1),
    ('2011-06-21', '2011-09-22', 2),
    ('2011-09-23', '2011-12-20', 3),
    ('2011-12-21', '2012-03-19', 4),
    ('2012-03-20', '2012-06-19', 1),
    ('2012-06-20', '2012-09-21', 2),
    ('2012-09-22', '2012-12-20', 3),
    ('2012-12-21', '2012-12-31', 4),
     ]

    for (start_date, end_date, new_season) in changes:
        dataset.loc[between_dates(dataset, start_date, '00', end_date, '23').index,'season'] = new_season
    return dataset

In [12]:
def generate_agg_features(dataset: pd.DataFrame, agg_name = np.median) -> pd.DataFrame:
    '''
    Calculate a monthly agg_function (like mean, median...) and add it to the dataframe

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: DataFrame with an additional column which contains the agg_function
        of bike shares in a month
    '''
    agg_dataset = dataset[['month', 'year', 'count']].groupby(['month', 'year']).agg(agg_name)
    agg_dataset = agg_dataset.reset_index()
    agg_dataset = agg_dataset.rename(columns = {'count':str(agg_name.__name__)})
    return pd.merge(dataset, agg_dataset, on=['month', 'year'], how = 'left')

In [13]:
def min_max_normalize_feats(dataset: pd.DataFrame, feats: list = []) -> pd.DataFrame:
    '''
    Normalize selected features. Equivalent of MinMaxScaler

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''
    for feature_name in feats:
        if feature_name in dataset.columns.tolist():
            max_value = dataset[feature_name].max()
            min_value = dataset[feature_name].min()
            dataset[feature_name] = (dataset[feature_name] - min_value) / (max_value - min_value)
    return dataset

In [14]:
def tokenize_weather(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Change integers in 'weather' column (1, 2, 3, ...) into a list of weather
    phenomena

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        initial dataframe with one extra column (weather_phenomena)
    '''
    
    # split weather phenomena into individual elements
    def weather_to_indicidual(x):
        x = ','.join([elem for elem in x])
        x = re.split(r'[,\+]', x)

        return list(set([elem.strip() for elem in x]))

    # change integers in 'weather' column  into list of individual weather phenomena
    dataset['weather_phenomena'] = dataset['weather'].map(weather_dict).apply(lambda x: weather_to_indicidual(x))

    return dataset


def deconstruct_weather_tokens(dataset: pd.DataFrame, token_col_name: str = 'weather_phenomena') -> pd.DataFrame:
    '''
    One-hot-encode individual weather phenomena and add them to initial dataframe

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        initial dataframe with an extra columns containing 0 or 1; each column
        represents an individual weather phenomena
    '''

    # make a list of all individual meteorological phenomena 
    all_phenomena = list(dataset[token_col_name].explode().unique())

    def return_cols_with_individual_phenomena(phenomena: pd.Series) -> pd.DataFrame:
        '''
        Build a dataframe (X) with all_phenomena as columns and put 0 or 1 in 
        rows representing the presence or absence of a particular phenomena
        '''

        def phenomenon_in_phenomena(phenomena: pd.Series) -> list:
            return [int(phenomenon in phenomena) for phenomenon in all_phenomena]
    
        X = phenomena.map(phenomenon_in_phenomena).apply(pd.Series)
        X.columns = all_phenomena
        return X 
    
    X = return_cols_with_individual_phenomena(dataset['weather_phenomena'])

    # concatenate columns to original dataset
    return pd.concat([dataset, X], axis=1)



In [15]:
def make_weather_embeddings(dataset: pd.DataFrame) -> np.array:
    '''
    Return unique vector representations of the weather column
    
    Arguments: 
        dataset: pandas DataFrame
    
    Returns:
        array of 4 lists (4 types of weather), each list containing 384 elements
    '''
    from sentence_transformers import SentenceTransformer
    
    # randomly chosen pretrained model
    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


    # column of unique meteo phenomena in a 'str' form
    uni_meteo_weather = dataset['weather_phenomena'].map(lambda x: ' '.join(x)).unique() 

    # vector representatnions of 'uni_meteo_weather'
    embeddings = model.encode(uni_meteo_weather, convert_to_tensor = False)
    return embeddings

In [16]:
def vectorize_weather_nlp(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Return vector representations of the weather column

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset with 384 more columns containing vector representation of sentences
        (meteorological phenomena contatenated into a string are treated as sentences)
    '''

    # unique integers (1, 2, 3, 4) repesenting the weather
    uni_num_weather = dataset['weather'].unique()
    
    # unique embeddings of weather phenomena
    embeddings = make_weather_embeddings(dataset)

    # dictionary for pandas mapping
    embeddings_dict = dict(zip(uni_num_weather, embeddings))

    # one additional column containing a 384-element list in each cell
    dataset['weather_embeddings'] = dataset['weather'].map(embeddings_dict)

    # split 'weather_embeddings' into 384 individual columns and concat them to the original dataset
    dataset = pd.concat([dataset, pd.DataFrame(dataset['weather_embeddings'].tolist())], axis = 1)
    return dataset

In [17]:
def add_cos_sim_weather(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Calculate cosinus similarity between 4 weather types and concat
    4 additional columns to the original dataset containing calculated
    similarity

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        original dataset with 4 extra columns representing cosine similarity 
        between weather types
    '''

    # construct 4x4 array of cosine similarity
    cos_sims_array = peek_weather_cos_sim(dataset, return_dataframe= False)

    # dictionary with cos_sims for pandas mapping 
    # {1: [1.000000 0.897642 0.728108 0.566511], 2: [...], 3: [...], 4: [...]}
    cos_sims_dict = dict(zip(list(range(1,5)), cos_sims_array))

    # map dictionary to pandas DataFrame as a new column
    dataset['cos_sims'] = dataset['weather'].map(cos_sims_dict)

    # deconstruct mapped column to 4 individual columns
    cos_sims_new_columns = pd.DataFrame(dataset['cos_sims'].tolist(), columns = [f'cos_sim_weather_{n}' for n in list(range(1, 5))])

    # delete not needed column
    del dataset['cos_sims']

    return pd.concat([dataset, cos_sims_new_columns], axis = 1)

In [18]:
def transform_columns(dataset: pd.DataFrame, feats: list, transformer: sklearn.preprocessing._data) -> pd.DataFrame:
    '''
    Use a transformer to transfor selected columns
    Examples of transformers: OrdinalEncoder, MinMaxSxaler, Normalizer, StandardScaler

    Arguments:
        dataset: pandas DataFrame
        
    Returns:
        dataframe with transformed columns
    '''
    dataset[feats] = transformer.fit_transform(dataset[feats])
    return dataset


In [19]:
def transform_dataset(dataset: pd.DataFrame, pipeline: Pipeline) -> pd.DataFrame:
    '''
    Return a dataset after pipeline transformations

    Arguments:
        dataset: pandas DataFrame

    Returns: 
        dataset after pipeline transformation
    '''
    return pipeline.fit_transform(dataset)

#### ***3.2. Select features*** ####

In [20]:
def between_dates(dataset: pd.DataFrame, start_date: str, start_time: str, end_date: str, end_time: str) -> pd.DataFrame:
    '''
    Show dataframe between two dates and timestamps
    
    Arguments:
        dataset: pandas DataFrame 
        start_date: date which the dataset must be trimmed from
        start_time: hour of day from start_date
        end_date: date which the dataset must be trimmed to
        end_time: hour of day from end_date

    Returns:
        A DataFrame between (start_date, start_time) and (end_date, end_time)
    '''

    start_dt = f'{start_date} {start_time}:00:00'
    end_dt = f'{end_date} {end_time}:00:00'
    mask = (dataset['datetime'] >= start_dt) & (dataset['datetime'] <= end_dt)
    return dataset[mask]

In [21]:
def drop_obvious(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Return a dataset withouth some obviously unneeded columns

    Arguments:
        dataset: pandas DataFrame

    Returns:
        dataset: pandas DataFrame
    '''
    black_list = ['Unnamed: 0', 'datetime', 'casual', 'registered']
    
    feats = [feat for feat in dataset.columns.tolist() if feat not in black_list]

    return dataset[feats]

In [22]:
def select_dtypes(dataset: pd.DataFrame, dtypes = np.number) -> pd.DataFrame:
    '''
    Return a dataset with a specific datatype

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''
    return dataset.select_dtypes(include=dtypes)


In [23]:
def make_X_y(dataset: pd.DataFrame, test_size: int = 0.3):
    '''
    Split dataframe info train and valid set
    
    Arguments:
        dataset: pandas DataFrame
        test_size: test size split
    
    Returns:
        two DataFrames and two DataSeries containing independent and target variables
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(dataset.drop('count', axis = 1), dataset['count'], test_size = test_size, random_state = RANDOM_STATE)
    return X_train, X_valid, y_train, y_valid

#### ***3.3. Additional info features*** ####

In [24]:
def peek_weather_cos_sim(dataset: pd.DataFrame, return_dataframe = True):
    '''
    Show cosine similarity between different types of weather for a transformed dataset
    
    Arguments: 
        dataset: pandas DataFrame (transformed dataset containing 'weather_phenomena' column)
    
    Returns:
       4 x 4 DataFrame with cosine similarity of weather types 
       or
       dataframe with cosine similarity of weather types
        
    '''
    from sklearn.metrics.pairwise import cosine_similarity


    # unique embeddings of weather phenomena    
    embeddings = make_weather_embeddings(dataset)

    # calculate cosine similarity
    cos_sims = cosine_similarity(embeddings)

    # how many different weather types there is
    no_weather_types = len(weather_dict)

    if return_dataframe:
        return  pd.DataFrame(cos_sims, columns = list(range(1,no_weather_types + 1)), index = list(range(1,no_weather_types + 1)))
    else:
        return cos_sims

### ***4. Evaluation metrics*** ###

---

In [25]:
def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    '''
    The Root Mean Squared Log Error (RMSLE) metric 

    Arguments: 
        y_true: the ground truth labels given in the dataset
        y_pred: our predictions
        
    Returns: 
        The RMSLE score
    '''

    return np.sqrt(mean_squared_log_error(y_true, y_pred))

### ***5. Pipelines and models*** ###

---

In [33]:
main_preprocess = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['humidity', 'casual', 'windspeed'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('nlp',
                        DataTransformer(vectorize_weather_nlp)),
    ('add_cos_sim',
                        DataTransformer(add_cos_sim_weather)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'weather_embeddings', 'humidity', 'windspeed', 'temp'])),
], verbose=True)

no_nlp_pipeline = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['humidity', 'casual', 'windspeed'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('add_cos_sim',
                        DataTransformer(add_cos_sim_weather)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'humidity', 'windspeed', 'temp'])),
], verbose=True)

no_nlp_pipeline_no_cos_sim = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['humidity', 'casual', 'windspeed'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'humidity', 'windspeed', 'temp'])),
], verbose=True)


In [34]:
pipelines = [
    ('main_preprocess', main_preprocess, 'Default preprocessing pipeline'),
    ('no_nlp', no_nlp_pipeline, 'No nlp'),
    ('no_nlp_no_cos_sim', no_nlp_pipeline_no_cos_sim, 'No nlp and no cos sim'),
]

In [35]:
xparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE, 'verbosity':0, 'use_label_encoder': False }
cparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE , 'silent': True}
lparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE, 'verbosity':-1, 'silent': True}
rfparams = {'random_state': RANDOM_STATE}
dparams = {'strategy': 'median'}

models = [
    ('XGBoostRegressor', xgb.XGBRegressor(**xparams)),
    ('CatBoostRregressor',  ctb.CatBoostRegressor(**cparams)),
    ('LGBMRegressor', lgbm.LGBMRegressor(**lparams)),
    ('RandomForest', RandomForestRegressor()),
    ('DummyRegressor', DummyRegressor(**dparams)),
]

### ***8. Main run*** ###

---

In [29]:
def run_experiments(pipelines, models = models, experiments_common_name = 'test-experiments',  use_neptune = True, pickle_models = False, dump_pickled_models_to_neptune = False):
    
    # empty dataframe for locally keeping track of results
    models_df = pd.DataFrame()

    # total number of all experiments
    no_experiments = len(list(product(pipelines, models)))

    print(f'Running {no_experiments} experiments')
    print(f'{"="*60}')
    
    # initialize neptune if needed
    if use_neptune:
        run = init_neptune()
    

    # median of target values 
    df_median = np.median(df_train['count']) 

    n = 1
    for p, pipeline_obj in enumerate(pipelines, 1):
        pipeline_name = pipeline_obj[0] # name of pipeline
        pipeline = pipeline_obj[1] # pipeline instance
        pipeline_comment = pipeline_obj[2] # pipeline description

        print(f'Transforming dataset using |{pipeline_name}|... (transformation {p}/{len(pipelines)})')
        print(f'{"="*60}')
        df = transform_dataset(df_train, pipeline) # transform dataset using a pipline
        X_train, X_valid, y_train, y_valid = make_X_y(df) # split dataset into train and valid
        
    
        for m, model_object in enumerate(models, 1):

            model_name = model_object[0] # custom name of a  model (like 'XGBoostRegressor')
            model = model_object[1] # model instance

            print('\n')
            print(f'Fitting... (model {m}/{len(models)})')
            print(model_name)
        
            if use_neptune:
                neptune.create_experiment(f'{experiments_common_name}-{n}') # name of experiment
            
            n += 1
            start_time = time.time() # time fitting
            model.fit(X_train, y_train) # fit the model
            end_time = time.time()

            # file pickling
            if pickle_models: 
                
                # file names
                data_file_name = f'{pipeline_name}.csv'
                model_file_name = f'{model_name}-{pipeline_name}.model'
                
                df.to_csv(data_file_name) # save dataframe locally
                
                # save model locally
                with open(model_file_name, 'wb') as f:
                    pickle.dump(model, f) # pickle a model
                
                if use_neptune and dump_pickled_models_to_neptune:
                    # if specified dump pickled files to neptune
                    neptune.log_artifact(model_file_name)
                    neptune.log_artifact(data_file_name)

            y_pred = model.predict(X_valid) # predicted values
            y_pred = [df_median if y<0 else y for y in y_pred] # no negative values

            # metrics (scores)
            score_mae = mean_absolute_error(y_valid, y_pred)
            score_mse = mean_squared_error(y_valid, y_pred)
            score_rmse = np.sqrt(score_mse) 
            score_rmsle = rmsle(y_valid, y_pred)
            score_r2 = r2_score(y_valid, y_pred)

            model_params = str(model.get_params()) # model parameters
            
            # dictionary of all variables that are supposed to be logged
            param_dict = {
                'pipeline_name': pipeline_name,
                'pipeline_steps': str(list(pipeline.named_steps.keys())),
                'pipeline_comment': pipeline_comment,
                'feats': str(X_train.columns.tolist()),
                'model': model.__class__.__name__,
                'model_params': model_params,
                'score_mae': score_mae,
                'score_mse': score_mse,
                'score_rmse': score_rmse,
                'score_rmsle': score_rmsle,
                'score_r2': score_r2,
                'time_elapsed': end_time - start_time
            }

            # log into neptune if needed
            if use_neptune:

                # make a list ['score_mae', 'score_mse', 'score_rmse', 'score_rmsle', 'score_r2', 'time_elapsed']
                score_metrics = [elem for elem in list(param_dict.keys()) if elem.startswith('score_')] + ['time_elapsed']
                
                # log values depending on their type (str or float)
                for key, value in param_dict.items():
                    if key not in score_metrics:
                        neptune.log_text(key, value)
                    else:
                        neptune.log_metric(key, value)
                
            # add row into summary dataframe for local results
            models_df = models_df.append(pd.DataFrame(param_dict, index = [0]))
    
    models_df.reset_index(drop = True, inplace = True)

    # end neptune instance
    if use_neptune:
        neptune.stop()
    
    return models_df

In [36]:
run_experiments(pipelines, models, use_neptune= False)

Running 15 experiments
Transforming dataset using |main_preprocess|... (transformation 1/3)
[Pipeline] .. (step 1 of 12) Processing make_dt_columns, total=   0.0s
[Pipeline] ... (step 2 of 12) Processing change_seasons, total=   0.0s
[Pipeline] ........ (step 3 of 12) Processing add_means, total=   0.0s
[Pipeline] ...... (step 4 of 12) Processing add_medians, total=   0.0s
[Pipeline] .... (step 5 of 12) Processing min_max_scale, total=   0.0s
[Pipeline] ..... (step 6 of 12) Processing drop_obvious, total=   0.0s
[Pipeline] .......... (step 7 of 12) Processing dummies, total=   0.0s
[Pipeline] . (step 8 of 12) Processing tokenize_weather, total=   0.1s
[Pipeline]  (step 9 of 12) Processing deconstruct_tokens, total=   2.2s
[Pipeline] ............. (step 10 of 12) Processing nlp, total=  13.7s
[Pipeline] ..... (step 11 of 12) Processing add_cos_sim, total=  11.1s
[Pipeline] .... (step 12 of 12) Processing drop_columns, total=   0.1s


Fitting... (model 1/5)
XGBoostRegressor


Fitting... 

Unnamed: 0,pipeline_name,pipeline_steps,pipeline_comment,feats,model,model_params,score_mae,score_mse,score_rmse,score_rmsle,score_r2,time_elapsed
0,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocessing pipeline,"['holiday', 'workingday', 'weather', 'atemp', ...",XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",24.865155,1857.849823,43.102782,0.396177,0.942791,4.290999
1,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocessing pipeline,"['holiday', 'workingday', 'weather', 'atemp', ...",CatBoostRegressor,"{'loss_function': 'RMSE', 'silent': True, 'max...",27.201225,2016.683588,44.9075,0.670137,0.9379,3.722847
2,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocessing pipeline,"['holiday', 'workingday', 'weather', 'atemp', ...",LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",27.922237,2102.292492,45.850763,0.586519,0.935264,1.440002
3,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocessing pipeline,"['holiday', 'workingday', 'weather', 'atemp', ...",RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",25.945756,1912.771354,43.735242,0.333867,0.9411,44.225128
4,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocessing pipeline,"['holiday', 'workingday', 'weather', 'atemp', ...",DummyRegressor,"{'constant': None, 'quantile': None, 'strategy...",138.770361,35329.691978,187.961943,1.462466,-0.087909,0.0
5,no_nlp,"['make_dt_columns', 'change_seasons', 'add_mea...",No nlp,"['holiday', 'workingday', 'weather', 'atemp', ...",XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",25.212549,1873.620403,43.285337,0.408045,0.942306,0.992067
6,no_nlp,"['make_dt_columns', 'change_seasons', 'add_mea...",No nlp,"['holiday', 'workingday', 'weather', 'atemp', ...",CatBoostRegressor,"{'loss_function': 'RMSE', 'silent': True, 'max...",26.378016,1887.264891,43.442662,0.615291,0.941885,1.576317
7,no_nlp,"['make_dt_columns', 'change_seasons', 'add_mea...",No nlp,"['holiday', 'workingday', 'weather', 'atemp', ...",LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",27.922237,2102.292492,45.850763,0.586519,0.935264,0.163003
8,no_nlp,"['make_dt_columns', 'change_seasons', 'add_mea...",No nlp,"['holiday', 'workingday', 'weather', 'atemp', ...",RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",25.744608,1880.432081,43.363949,0.329776,0.942096,3.950077
9,no_nlp,"['make_dt_columns', 'change_seasons', 'add_mea...",No nlp,"['holiday', 'workingday', 'weather', 'atemp', ...",DummyRegressor,"{'constant': None, 'quantile': None, 'strategy...",138.770361,35329.691978,187.961943,1.462466,-0.087909,0.00092
