### ***Main info*** ###

***Project description:*** https://www.kaggle.com/competitions/bike-sharing-demand/overview/description

***Project goal:*** combine historical usage patterns with weather data in order to forecast bike rental demand in the Capital Bikeshare program in Washington, D.C.

***Notebook goal:*** make a final set of pipelines to transform the dataset and run them with predifined models in neptune

***Suggested evaluation metric:*** Root Mean Squared Logarithmic Error (RMSLE)

***Other used evalutaion metrics*** Mean Absolute Error(MAE), Mean Squared Error(MSE), Root Mean Squared Error(RMSE), R Squared (R2)

***Comment*** This notebook uses previous notebooks: <u>01a_regression_pipelines_first_test</u>, <u>01b_regression_pipelines_improved</u> and <u>01c_bonus_meteo_equations_for_atemp</u> in order to build a final  solution for the presented regression problem.
<p>

### ***0. Project preparation*** ###

---

In [None]:
# main upgrades
!pip install --upgrade neptune-client
!pip install --upgrade neptune

In [2]:
# main imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn import set_config
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

import catboost as ctb
import lightgbm as lgbm
import xgboost as xgb

import eli5
import time
import warnings
import json
from tqdm import tqdm
import neptune
from itertools import product
from collections import Counter
import re
import pickle

# minor settings
plt.style.use('ggplot')
%matplotlib inline
set_config(display='diagram')
warnings.filterwarnings('ignore', category=DeprecationWarning)

# global variables
RANDOM_STATE = 0
OUTPUT_DIR = 'outputs/'
MODELS_DIR = 'models/'

In [3]:
# version check
def show_version(module_object: object, n: int = 20) -> str:
    '''
    Check version of different libraries
    '''
    module_name = getattr(module_object, '__name__')
    module_ver = getattr(module_object, '__version__')
    dots = '.' * (n - len(module_name))
    
    print (f'{module_name}{dots}{module_ver}')


!python --version
module_list = [np, pd, sklearn, eli5, neptune, ctb, lgbm, xgb]
for module in module_list:
    show_version(module)

Python 3.7.5
numpy...............1.20.0
pandas..............1.2.4
sklearn.............0.24.2
eli5................0.11.0
neptune.............0.15.2
catboost............0.25.1
lightgbm............3.0.0
xgboost.............1.3.2


In [45]:
# run neptune server
def init_neptune(credentials_file = 'neptune_credentials.json'):
    '''
    Initialize neptune project
    '''
    with open(credentials_file) as f:
        neptune_credentials = json.load(f)

    run = neptune.init(
        api_token = neptune_credentials['API_TOKEN'],
        project_qualified_name = neptune_credentials['PROJECT']
    )
    return run

### ***1. Load data*** ###

---

In [46]:
df_train = pd.read_csv('inputs/train.csv')

In [47]:
weather_dict = {1: ['Clear', 'Few clouds', 'Partly cloudy'],
2: ['Mist + Cloudy', 'Mist + Broken clouds', 'Mist + Few clouds', 'Mist'],
3: ['Light Snow', 'Light Rain + Thunderstorm + Scattered clouds', 'Light Rain + Scattered clouds'],
4: ['Heavy Rain + Ice Pallets + Thunderstorm + Mist', 'Snow + Fog']}


### ***2. Custom data classes*** ###

---

In [48]:
class DataTransformer():
    '''
    Change the initial dataset into a new one based
    on passed function
    '''
    # copy parameter introduced to prevent SettingwithCopyWarning
    # https://www.dataquest.io/blog/settingwithcopywarning/
    def __init__(self, func, copy = True, **kwargs):
        self.func = func
        self.copy = copy

    def transform(self, input_df, **transform_params):
        input_df_ = input_df if not self.copy else input_df.copy()
        return self.func(input_df_,)

    def fit(self, X, y=None, **fit_params):
        return self

In [49]:
class ColumnSelector():
    '''
    Return a dataframe with predefined columns only
    '''

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [50]:
class ColumnDroper():
    '''
    Return a dataframe without selected columns
    '''
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 

### ***3. Custom feature functions*** ###

---

#### ***3.1. Add or change functions*** ####

#### ***3.1.1. Numerical features functions*** ####

##### 3.1.1.1 Main transformation functions #####

In [51]:
def transform_columns(dataset: pd.DataFrame, feats: list, transformer: sklearn.preprocessing._data) -> pd.DataFrame:
    '''
    Use a transformer to transform selected columns
    Examples of transformers: OrdinalEncoder, MinMaxScaler, Normalizer, StandardScaler
        Function used in pipeline step.

    Arguments:
        dataset: pandas DataFrame
        
    Returns:
        dataframe with transformed columns
    '''
    dataset[feats] = transformer.fit_transform(dataset[feats])
    return dataset


In [52]:
def transform_dataset(dataset: pd.DataFrame, pipeline: Pipeline) -> pd.DataFrame:
    '''
    Return a dataset after pipeline transformations.
        Function used in main run step.

    Arguments:
        dataset: pandas DataFrame

    Returns: 
        dataset after pipeline transformation
    '''
    return pipeline.fit_transform(dataset)

##### 3.1.1.2 Other transformation functions ####

In [53]:
def make_cols_from_datetime(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Convert a column to datetime format and create new columns: 
    'year', 'month', 'day', 'hour'
    
    Arguments:
        dataset: pandas DataFrame 
    
    Returns:
        dataset: transformed pandas DataFrame
    '''

    # convert string to datetime type
    dataset['datetime'] = pd.to_datetime(dataset['datetime'])

    # make new columns from datetime column
    dataset['year'] = dataset['datetime'].dt.year
    dataset['month'] = dataset['datetime'].dt.month
    dataset['day'] = dataset['datetime'].dt.day
    dataset['hour'] = dataset['datetime'].dt.hour
    dataset['dayofweek'] = dataset['datetime'].dt.dayofweek
    dataset['weekend'] = dataset['dayofweek'].map(lambda x: int(x in [6,7]))
    return dataset

In [54]:
def seasons_change(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Set proper season duration and change their representation according
    to dataset legend

    Argument:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''

    changes = [
    ('2011-01-01', '2011-03-19', 4),
    ('2011-03-20', '2011-06-20', 1),
    ('2011-06-21', '2011-09-22', 2),
    ('2011-09-23', '2011-12-20', 3),
    ('2011-12-21', '2012-03-19', 4),
    ('2012-03-20', '2012-06-19', 1),
    ('2012-06-20', '2012-09-21', 2),
    ('2012-09-22', '2012-12-20', 3),
    ('2012-12-21', '2012-12-31', 4),
     ]

    for (start_date, end_date, new_season) in changes:
        dataset.loc[between_dates(dataset, start_date, '00', end_date, '23').index,'season'] = new_season
    return dataset

In [55]:
def generate_agg_features(dataset: pd.DataFrame, agg_name = np.median) -> pd.DataFrame:
    '''
    Calculate a monthly agg_function (like mean, median...) and add it to the dataframe

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: DataFrame with an additional column which contains the agg_function
        of bike shares in a month
    '''
    agg_dataset = dataset[['month', 'year', 'count']].groupby(['month', 'year']).agg(agg_name)
    agg_dataset = agg_dataset.reset_index()
    agg_dataset = agg_dataset.rename(columns = {'count':str(agg_name.__name__)})
    return pd.merge(dataset, agg_dataset, on=['month', 'year'], how = 'left')

In [56]:
def min_max_normalize_feats(dataset: pd.DataFrame, feats: list = []) -> pd.DataFrame:
    '''
    Normalize selected features. Equivalent of MinMaxScaler

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''
    for feature_name in feats:
        if feature_name in dataset.columns.tolist():
            max_value = dataset[feature_name].max()
            min_value = dataset[feature_name].min()
            dataset[feature_name] = (dataset[feature_name] - min_value) / (max_value - min_value)
    return dataset

In [57]:
def correct_y_pred(y_pred: np.array) -> np.array:
    '''
    Correct y_pred values co they don't contain any negative values
        Function used in main run step.

    Arguments:
        y_pred: np.array
    
    Returns:
        y_pred: array wiht no negative values; every negative value
        is replaced with median
    '''
    df_median = np.median(df_train['count']) 
    y_pred = [df_median if y<0 else y for y in y_pred] 
    return y_pred

#### ***3.1.2. Text (nlp) features functions*** ####

In [58]:
def tokenize_weather(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Change integers in 'weather' column (1, 2, 3, ...) into a list of weather
    phenomena

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        initial dataframe with one extra column (weather_phenomena)
    '''
    
    # split weather phenomena into individual elements
    def weather_to_indicidual(x):
        x = ','.join([elem for elem in x])
        x = re.split(r'[,\+]', x)

        return list(set([elem.strip() for elem in x]))

    # change integers in 'weather' column  into list of individual weather phenomena
    dataset['weather_phenomena'] = dataset['weather'].map(weather_dict).apply(lambda x: weather_to_indicidual(x))

    return dataset


In [59]:
def deconstruct_weather_tokens(dataset: pd.DataFrame, token_col_name: str = 'weather_phenomena') -> pd.DataFrame:
    '''
    One-hot-encode individual weather phenomena and add them to initial dataframe

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        initial dataframe with an extra columns containing 0 or 1; each column
        represents an individual weather phenomena
    '''

    # make a list of all individual meteorological phenomena 
    all_phenomena = list(dataset[token_col_name].explode().unique())

    def return_cols_with_individual_phenomena(phenomena: pd.Series) -> pd.DataFrame:
        '''
        Build a dataframe (X) with all_phenomena as columns and put 0 or 1 in 
        rows representing the presence or absence of a particular phenomena
        '''

        def phenomenon_in_phenomena(phenomena: pd.Series) -> list:
            return [int(phenomenon in phenomena) for phenomenon in all_phenomena]
    
        X = phenomena.map(phenomenon_in_phenomena).apply(pd.Series)
        X.columns = all_phenomena
        return X 
    
    X = return_cols_with_individual_phenomena(dataset['weather_phenomena'])

    # concatenate columns to original dataset
    return pd.concat([dataset, X], axis=1)

In [60]:
def make_weather_embeddings(dataset: pd.DataFrame) -> np.array:
    '''
    Return unique vector representations of the weather column
        Function needed for vectorize_weather_nlp function
    
    Arguments: 
        dataset: pandas DataFrame
    
    Returns:
        array of 4 lists (4 types of weather), each list containing 384 elements
    '''
    from sentence_transformers import SentenceTransformer
    
    # randomly chosen pretrained model
    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


    # column of unique meteo phenomena in a 'str' form
    uni_meteo_weather = dataset['weather_phenomena'].map(lambda x: ' '.join(x)).unique() 

    # vector representatnions of 'uni_meteo_weather'
    embeddings = model.encode(uni_meteo_weather, convert_to_tensor = False)
    return embeddings

In [61]:
def vectorize_weather_nlp(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Return vector representations of the weather column

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset with 384 more columns containing vector representation of sentences
        (meteorological phenomena contatenated into a string are treated as sentences)
    '''

    # unique integers (1, 2, 3, 4) repesenting the weather
    uni_num_weather = dataset['weather'].unique()
    
    # unique embeddings of weather phenomena
    embeddings = make_weather_embeddings(dataset)

    # dictionary for pandas mapping
    embeddings_dict = dict(zip(uni_num_weather, embeddings))

    # one additional column containing a 384-element list in each cell
    dataset['weather_embeddings'] = dataset['weather'].map(embeddings_dict)

    # split 'weather_embeddings' into 384 individual columns and concat them to the original dataset
    dataset = pd.concat([dataset, pd.DataFrame(dataset['weather_embeddings'].tolist())], axis = 1)
    return dataset

In [62]:
def add_cos_sim_weather(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Calculate cosinus similarity between 4 weather types and concat
    4 additional columns to the original dataset containing calculated
    similarity

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        original dataset with 4 extra columns representing cosine similarity 
        between weather types
    '''

    # construct 4x4 array of cosine similarity
    cos_sims_array = peek_weather_cos_sim(dataset, return_dataframe= False)

    # dictionary with cos_sims for pandas mapping 
    # {1: [1.000000 0.897642 0.728108 0.566511], 2: [...], 3: [...], 4: [...]}
    cos_sims_dict = dict(zip(list(range(1,5)), cos_sims_array))

    # map dictionary to pandas DataFrame as a new column
    dataset['cos_sims'] = dataset['weather'].map(cos_sims_dict)

    # deconstruct mapped column to 4 individual columns
    cos_sims_new_columns = pd.DataFrame(dataset['cos_sims'].tolist(), columns = [f'cos_sim_weather_{n}' for n in list(range(1, 5))])

    # delete not needed column
    del dataset['cos_sims']

    return pd.concat([dataset, cos_sims_new_columns], axis = 1)

#### ***3.2. Select functions*** ####

In [63]:
def between_dates(dataset: pd.DataFrame, start_date: str, start_time: str, end_date: str, end_time: str) -> pd.DataFrame:
    '''
    Show dataframe between two dates and timestamps
    
    Arguments:
        dataset: pandas DataFrame 
        start_date: date which the dataset must be trimmed from
        start_time: hour of day from start_date
        end_date: date which the dataset must be trimmed to
        end_time: hour of day from end_date

    Returns:
        A DataFrame between (start_date, start_time) and (end_date, end_time)
    '''

    start_dt = f'{start_date} {start_time}:00:00'
    end_dt = f'{end_date} {end_time}:00:00'
    mask = (dataset['datetime'] >= start_dt) & (dataset['datetime'] <= end_dt)
    return dataset[mask]

In [64]:
def drop_obvious(dataset: pd.DataFrame) -> pd.DataFrame:
    '''
    Return a dataset withouth some obviously unneeded columns

    Arguments:
        dataset: pandas DataFrame

    Returns:
        dataset: pandas DataFrame
    '''
    black_list = ['Unnamed: 0', 'datetime', 'casual', 'registered']
    
    feats = [feat for feat in dataset.columns.tolist() if feat not in black_list]

    return dataset[feats]

In [65]:
def select_dtypes(dataset: pd.DataFrame, dtypes = np.number) -> pd.DataFrame:
    '''
    Return a dataset with a specific datatype

    Arguments:
        dataset: pandas DataFrame
    
    Returns:
        dataset: pandas DataFrame
    '''
    return dataset.select_dtypes(include=dtypes)


In [66]:
def make_X_y(dataset: pd.DataFrame, test_size: int = 0.3):
    '''
    Split dataframe info train and valid set
    
    Arguments:
        dataset: pandas DataFrame
        test_size: test size split
    
    Returns:
        two DataFrames and two DataSeries containing independent and target variables
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(dataset.drop('count', axis = 1), dataset['count'], test_size = test_size, random_state = RANDOM_STATE)
    return X_train, X_valid, y_train, y_valid

In [67]:
def top_results_files(models_df: pd.DataFrame, topn: int = 5, metric: str = 'score_rmsle', lower_is_better: bool = True) -> list :
    '''
    Return a tuple containing file names with best n models and datasets

    Arguments:
        models_df: dataframe of models after main run loop
        topn: how many top models are being returned
        metric: evaluation metric which is being taken into account when choosing top models
        lower_is_better: specify is a lower value of metric means a better model performence
    
    Returns:
        list of tuples; each tuple consist of a csv file name and a file model name
        example: [(dataset1.csv, xgboost.model), (dataset1.csv, catboost.model), (dataset2.csv, xgboost.model)]
    '''
    if lower_is_better:
        best_models = models_df.sort_values(by='score_rmsle', ascending= True).head(topn)
    else:
        best_models =models_df.sort_values(by='score_rmsle', ascending= False).head(topn)
    
    return [(f'{pipeline_name}.csv', f'{model_name}-{pipeline_name}.model') for pipeline_name, model_name in zip(best_models['pipeline_name'], best_models['model_custom_name'])]


In [68]:
def top_results_df(models_df: pd.DataFrame, topn: int = 5, metric: str = 'score_rmsle', lower_is_better: bool = True) -> pd.DataFrame:
    '''
    Return a dataframe with best n experiments

    Arguments:
        models_df: dataframe of models after main run loop
        topn: how many top models are being returned
        metric: evaluation metric which is being taken into account when choosing top models
        lower_is_better: specify is a lower value of metric means a better model performence
    
    Returns:
        Dataframe trimmed to only  topn experiments base on a dataframe generated
        with the run_experiments function
    '''
    if lower_is_better:
        best_models = models_df.sort_values(by='score_rmsle', ascending= True).head(topn)
    else:
        best_models =models_df.sort_values(by='score_rmsle', ascending= False).head(topn)
    return best_models

#### ***3.3. Additional functions*** ####

In [69]:
def peek_weather_cos_sim(dataset: pd.DataFrame, return_dataframe = True):
    '''
    Show cosine similarity between different types of weather for a transformed dataset
        Function needed for add_cos_sim_weather function
    
    Arguments: 
        dataset: pandas DataFrame (transformed dataset containing 'weather_phenomena' column)
    
    Returns:
       4 x 4 DataFrame with cosine similarity of weather types 
       or
       dataframe with cosine similarity of weather types
        
    '''
    from sklearn.metrics.pairwise import cosine_similarity


    # unique embeddings of weather phenomena    
    embeddings = make_weather_embeddings(dataset)

    # calculate cosine similarity
    cos_sims = cosine_similarity(embeddings)

    # how many different weather types there is
    no_weather_types = len(weather_dict)

    if return_dataframe:
        return  pd.DataFrame(cos_sims, columns = list(range(1,no_weather_types + 1)), index = list(range(1,no_weather_types + 1)))
    else:
        return cos_sims

In [70]:
def run_single(dataset: pd.DataFrame, pipeline: Pipeline, model, feats: list, metric, show_feature_importance: bool = True) -> tuple:
    '''
    Make a single prediction for a single case. This is needed when we want to quickly:
    1) transform 2) split into train and valid 3) fit 4) predict 5) score
    a dataset and a model

    If a pipeline is specified the data will be transformed according to this pipeline, otherwise
    an already prepared dataset must be provided as input.

    If feats are provided, the dataset is restricted to only those feats. We can skip specifying
    the feets if a pipeline already transformed the data properly to just the feats we want.

    Arguments:
        dataset: raw or transformed dataset
        pipeline: pipeline to transform the dataset (optional)
        model: specified model for fitting the dataset
        feats: features to be included in a dataset (optional)
        metric: metric for scoring
        show_feature_importance: show or hide eli5 feature importande
    Returns:
        transformed dataset, model predict score
    '''
    def show_weights(model, return_weights_dataframe = True):
        if return_weights_dataframe:
            return pd.read_html(eli5.show_weights(model, feature_names = X_train.columns.tolist()).data)[0]
        else:
                return pd.read_html(eli5.show_weights(model, feature_names = X_train.columns.tolist()).data)[0]['Feature'].tolist()


    if pipeline is not None:
        dataset = transform_dataset(dataset, pipeline)

    if feats is not None:
        X_train, X_valid, y_train, y_valid = make_X_y(dataset[feats])
    else:
        X_train, X_valid, y_train, y_valid = make_X_y(dataset)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid) 
    y_pred = correct_y_pred(y_pred)

    score = metric(y_valid, y_pred)
    if show_feature_importance:
        show_weights(model)
    return dataset, score


In [71]:
def save_top_results_files(filename: str, models_df):
    '''
    Save datasets names and models names to a *.npy file

    Arguments:
        filename: specified file name 
        models_df: dataset with summary of all models performence
    '''
    np.save(filename, np.array(top_results_files(models_df)))

### ***4. Evaluation metrics*** ###

---

In [72]:
def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    '''
    The Root Mean Squared Log Error (RMSLE) metric 

    Arguments: 
        y_true: the ground truth labels given in the dataset
        y_pred: our predictions
        
    Returns: 
        The RMSLE score
    '''

    return np.sqrt(mean_squared_log_error(y_true, y_pred))

### ***5. Pipelines and models*** ###

---

In [73]:
basic_pipeline = Pipeline(steps = [
    ('drop obvious', DataTransformer(drop_obvious))
], verbose= True)

In [74]:
main_preprocess = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['atemp'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('nlp',
                        DataTransformer(vectorize_weather_nlp)),
    ('add_cos_sim',
                        DataTransformer(add_cos_sim_weather)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'weather_embeddings', 'humidity', 'windspeed', 'temp'])),
], verbose=True)


In [75]:
no_nlp_pipeline = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['atemp'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('add_cos_sim',
                        DataTransformer(add_cos_sim_weather)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'humidity', 'windspeed', 'temp'])),
], verbose=True)

In [76]:
no_nlp_pipeline_no_cos_sim = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('min_max_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['atemp'], MinMaxScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season']))),
    ('tokenize_weather',
                        DataTransformer(tokenize_weather)),
    ('deconstruct_tokens',
                        DataTransformer(deconstruct_weather_tokens)),
    ('drop_columns', 
                        ColumnDroper(['weather_phenomena', 'humidity', 'windspeed', 'temp'])),
], verbose=True)

In [77]:
more_dummies_different_scaler = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('standard_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['atemp'], StandardScaler()))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season', 'weather']))),
    ('drop_columns', 
                        ColumnDroper(['humidity', 'windspeed', 'temp'])),
], verbose=True)

In [78]:
no_drop_humidity_windspeed_temp = Pipeline(steps = [
    ('make_dt_columns', 
                        DataTransformer(make_cols_from_datetime)),
    ('change_seasons', 
                        DataTransformer(seasons_change)),
    ('add_means', 
                        DataTransformer(lambda df: generate_agg_features(df, np.mean))),
    ('add_medians', 
                        DataTransformer(lambda df: generate_agg_features(df, np.median))),
    ('standard_scale', 
                        DataTransformer(lambda df: transform_columns(df, ['atemp', 'temp', 'windspeed', 'humidity'], StandardScaler()))),
    ('dummies', 
                        DataTransformer(lambda df: pd.get_dummies(df, columns = ['season', 'weather']))),
    ('drop_obvious', 
                        DataTransformer(drop_obvious)),
], verbose=True)

In [79]:
xparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE, 'verbosity':0, 'use_label_encoder': False }
xparams_more_n = {'n_estimators': 200, 'max_depth': 10, 'random_state': RANDOM_STATE, 'verbosity':0, 'use_label_encoder': False }
cparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE , 'silent': True}
lparams = {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE, 'verbosity':-1, 'silent': True}
lparams_more_depth = {'n_estimators': 100, 'max_depth': 20, 'random_state': RANDOM_STATE, 'verbosity':-1, 'silent': True}
rfparams = {'random_state': RANDOM_STATE}
dparams = {'strategy': 'median'}

In [80]:
pipelines = [
    ('basic_pipeline', basic_pipeline, 'No transfomations (other then essential drop columns'),
    ('main_preprocess', main_preprocess, 'Default preprocess pipeline with nlp and with "humidity", "casual", "windspeed" columns dropped'),
    ('no_nlp', no_nlp_pipeline, 'Same as default pipeline but with no nlp functions with "humidity", "casual", "windspeed" columns dropped'),
    ('no_nlp_no_cos_sim', no_nlp_pipeline_no_cos_sim, 'Same as deafult pipeline but with no nlp and no cos_sim with "humidity", "casual", "windspeed" columns dropped'),
    ('more_dummies_different_scaler_no_nlp', more_dummies_different_scaler, 'No nlp, no cos sim, dummies include: season and weather with "humidity", "casual", "windspeed" columns dropped'),
    ('more_dummies_different_scaler_no_nlp_keep_3variables', no_drop_humidity_windspeed_temp, 'No nlp, no cos sim, dummies include: season and weather; "humidity", "casual", "windspeed" not dropped but scaled'),
]

In [81]:
models = [
    ('XGBoostRegressor_100_estim', xgb.XGBRegressor(**xparams)),
    ('XGBoostRegressor_200_estim', xgb.XGBRegressor(**xparams_more_n)),
    ('CatBoostRregressor',  ctb.CatBoostRegressor(**cparams)),
    ('LGBMRegressor_max_depth_10', lgbm.LGBMRegressor(**lparams)),
    ('LGBMRegressor_max_depth_20', lgbm.LGBMRegressor(**lparams_more_depth)),
    ('RandomForest', RandomForestRegressor()),
    ('DummyRegressor', DummyRegressor(**dparams)),
]

### ***6. Main run*** ###

---

`IMPORTANT`

Use `dump_pickled_models_to_neptune` with caution. It can send hundreds of MB of pickled models to neptune server.

In [84]:
def run_experiments(pipelines, models = models, experiments_common_name = 'test-experiments',  use_neptune = True, pickle_models = False, dump_pickled_models_to_neptune = False):
    
    # empty dataframe for locally keeping track of results
    models_df = pd.DataFrame()

    # total number of all experiments
    no_experiments = len(list(product(pipelines, models)))

    print(f'Running {no_experiments} experiments')
    print(f'{"="*60}')
    
    # initialize neptune if needed
    if use_neptune:
        run = init_neptune()
    

    e = 1 # first experiment number
    for p, pipeline_obj in enumerate(pipelines, 1):
        pipeline_name = pipeline_obj[0] # name of pipeline
        pipeline = pipeline_obj[1] # pipeline instance
        pipeline_comment = pipeline_obj[2] # pipeline description

        print(f'Transforming dataset using |{pipeline_name}|... (transformation {p}/{len(pipelines)})')
        print(f'{"="*60}')

        # Data transformation
        df = transform_dataset(df_train, pipeline) # transform dataset using a pipline
        X_train, X_valid, y_train, y_valid = make_X_y(df) # split dataset into train and valid
        
        # Model application on transformed data 
        for m, model_object in enumerate(models, 1):

            model_name = model_object[0] # custom name of a  model (like 'XGBoostRegressor')
            model = model_object[1] # model instance

            print('\n')
            print(f'Fitting... (model {m}/{len(models)})')
            print(model_name)
        
            if use_neptune:
                neptune.create_experiment(f'{experiments_common_name}-{e}') # name of experiment
            
            e += 1
            start_time = time.time() # time fitting
            model.fit(X_train, y_train) # fit the model
            end_time = time.time()

            # file pickling
            if pickle_models: 
                
                # file names
                data_file_name = f'{pipeline_name}.csv'
                model_file_name = f'{model_name}-{pipeline_name}.model'

                if 'Unnamed: 0' in df.columns: del df['Unnamed: 0']
                df.to_csv(f'{OUTPUT_DIR}{data_file_name}') # save dataframe locally
                
                # save model locally
                with open(f'{MODELS_DIR}{model_file_name}', 'wb') as f:
                    pickle.dump(model, f) # pickle a model
                
                if use_neptune and dump_pickled_models_to_neptune:
                    # if specified dump pickled files to neptune
                    neptune.log_artifact(model_file_name)
                    neptune.log_artifact(data_file_name)

            y_pred = model.predict(X_valid) # predicted values
            y_pred = correct_y_pred(y_pred)

            # metrics (scores)
            score_mae = mean_absolute_error(y_valid, y_pred)
            score_mse = mean_squared_error(y_valid, y_pred)
            score_rmse = np.sqrt(score_mse) 
            score_rmsle = rmsle(y_valid, y_pred)
            score_r2 = r2_score(y_valid, y_pred)

            model_params = str(model.get_params()) # model parameters
            
            # dictionary of all variables that are supposed to be logged
            param_dict = {
                'pipeline_name': pipeline_name,
                'pipeline_steps': str(list(pipeline.named_steps.keys())),
                'pipeline_comment': pipeline_comment,
                'feats': str(X_train.columns.tolist()),
                'n_feats': len(X_train.columns.tolist()), ### added
                'fs_method': 'no_selection',
                'fs_param': '',
                'fs_param_value': 0,
                'model': model.__class__.__name__,
                'model_custom_name': model_name,
                'model_params': model_params,
                'score_mae': score_mae,
                'score_mse': score_mse,
                'score_rmse': score_rmse,
                'score_rmsle': score_rmsle,
                'score_r2': score_r2,
                'time_elapsed': end_time - start_time
            }

            # log into neptune if needed
            if use_neptune:
                score_metrics = [elem for elem in list(param_dict.keys()) if elem.startswith('score_')] + ['time_elapsed', 'n_feats', 'fs_param_value']
                
                # log values depending on their type (str or float)
                for key, value in param_dict.items():
                    if key not in score_metrics:
                        neptune.log_text(key, value)
                    else:
                        neptune.log_metric(key, value)
                
            # add row into summary dataframe for local results
            models_df = models_df.append(pd.DataFrame(param_dict, index = [0]))
    
    models_df.reset_index(drop = True, inplace = True)

    # end neptune instance
    if use_neptune:
        neptune.stop()
    
    return models_df

In [None]:
models_df = run_experiments(pipelines, models, use_neptune= False, pickle_models= True)

In [44]:
models_df

Unnamed: 0,pipeline_name,pipeline_steps,pipeline_comment,feats,n_feats,fs_method,fs_param,fs_param_value,model,model_custom_name,model_params,score_mae,score_mse,score_rmse,score_rmsle,score_r2,time_elapsed
0,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,XGBRegressor,XGBoostRegressor_100_estim,"{'objective': 'reg:squarederror', 'base_score'...",117.568922,27413.487615,165.570189,1.30057,0.155855,2.077066
1,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,XGBRegressor,XGBoostRegressor_200_estim,"{'objective': 'reg:squarederror', 'base_score'...",120.200671,28794.651284,169.689868,1.310784,0.113325,2.688413
2,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,CatBoostRegressor,CatBoostRregressor,"{'loss_function': 'RMSE', 'silent': True, 'max...",107.201296,21331.481809,146.05301,1.276259,0.343139,1.250663
3,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,LGBMRegressor,LGBMRegressor_max_depth_10,"{'boosting_type': 'gbdt', 'class_weight': None...",107.972832,21391.046882,146.256784,1.288039,0.341305,0.556958
4,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,LGBMRegressor,LGBMRegressor_max_depth_20,"{'boosting_type': 'gbdt', 'class_weight': None...",107.718162,21386.646607,146.24174,1.287986,0.34144,0.220453
5,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,RandomForestRegressor,RandomForest,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",109.58028,23374.62336,152.887617,1.243114,0.280224,2.71323
6,basic_pipeline,['drop obvious'],No transfomations (other then essential drop c...,"['season', 'holiday', 'workingday', 'weather',...",8,no_selection,,,DummyRegressor,DummyRegressor,"{'constant': None, 'quantile': None, 'strategy...",138.770361,35329.691978,187.961943,1.462466,-0.087909,0.0
7,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocess pipeline with nlp and with ...,"['holiday', 'workingday', 'weather', 'atemp', ...",418,no_selection,,,XGBRegressor,XGBoostRegressor_100_estim,"{'objective': 'reg:squarederror', 'base_score'...",24.869353,1857.934551,43.103765,0.396175,0.942789,10.84007
8,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocess pipeline with nlp and with ...,"['holiday', 'workingday', 'weather', 'atemp', ...",418,no_selection,,,XGBRegressor,XGBoostRegressor_200_estim,"{'objective': 'reg:squarederror', 'base_score'...",25.013816,1871.048207,43.255615,0.404771,0.942385,26.722949
9,main_preprocess,"['make_dt_columns', 'change_seasons', 'add_mea...",Default preprocess pipeline with nlp and with ...,"['holiday', 'workingday', 'weather', 'atemp', ...",418,no_selection,,,CatBoostRegressor,CatBoostRregressor,"{'loss_function': 'RMSE', 'silent': True, 'max...",27.160699,1995.790765,44.674274,0.648352,0.938544,4.404709
