In [1]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook
gc.enable()

In [2]:
BASE_DIR = os.path.abspath(os.path.curdir)
TRAIN = os.path.join(BASE_DIR, "train")

In [3]:
def read_data():
    train = pd.read_csv(os.path.join(TRAIN, 'train.csv'),
                        sep=',')
    train.sort_values(["week", "center_id", "meal_id"], ascending=True, inplace=True)
    meal_info = pd.read_csv(os.path.join(TRAIN, 'meal_info.csv'),
                            sep=',')
    fc_info = pd.read_csv(os.path.join(TRAIN, 'fulfilment_center_info.csv'),
                          sep=',')
    test = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'))
    test.sort_values(["week", "center_id", "meal_id"], ascending=True, inplace=True)
    print("train records - ", train.shape)
    print("test records - ", test.shape)
    print("meal info records - ", meal_info.shape)
    print("fulfilment center records - ", fc_info.shape)
    return train, test, meal_info, fc_info


In [4]:
# Reading the dataset
train_pd, test_pd, meal_info_pd, fc_info_pd = read_data()

train records -  (456548, 9)
test records -  (32573, 8)
meal info records -  (51, 3)
fulfilment center records -  (77, 5)


In [5]:
start_week = train_pd.week.min()
end_week = test_pd.week.max()
print("Train range weeks - {} to {}".format(start_week, train_pd.week.max()))
print("Test range weeks - {} to {}".format(test_pd.week.min(), end_week))

Train range weeks - 1 to 145
Test range weeks - 146 to 155


In [6]:
week_map_pd = pd.DataFrame.from_dict({"week": np.arange(start_week, end_week + 1, 1)})

In [7]:
week_map_pd.loc[:, "month"] = week_map_pd.week.map(lambda x: np.ceil(x/4))
week_map_pd.loc[:, "quarter"] = week_map_pd.week.map(lambda x: np.ceil(x/13))

In [8]:
week_map_pd.head()

Unnamed: 0,week,month,quarter
0,1,1.0,1.0
1,2,1.0,1.0
2,3,1.0,1.0
3,4,1.0,1.0
4,5,2.0,1.0


In [9]:
train_pd.columns

Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders'],
      dtype='object')

In [10]:
def create_additional_features(data):
    data.sort_values(["week", "center_id", "meal_id"], ascending=True, inplace=True)
    data.loc[:, "base_minus_checkout"] = np.abs(data.base_price - data.checkout_price)
    data.loc[:, "cuisine_category"] = data["cuisine"] + "-" + data["category"]
    del data["category"]
    del data["cuisine"]
    center_meal_agg = data.groupby(["center_id", "meal_id"]).agg(
        {"base_minus_checkout": ["mean", "std", "max"]})
    center_meal_agg.columns = ['{}_{}'.format(col0, col1) 
                               for col0, col1 in center_meal_agg.columns.ravel()]
    center_meal_agg.reset_index(inplace=True)
    data = pd.merge(data, center_meal_agg, on=["center_id", "meal_id"], how="inner")
    data.loc[:, "base_lt_checkout"] = data["base_price"] < data["checkout_price"]    
    data.loc[:, "base_gt_checkout"] = data["base_price"] > data["checkout_price"]    
    return data

In [11]:
def correct_dtypes(data, type_dict, is_ohe=False):
    for type_ in type_dict:
        print("Processing Type - ", type_)
        for col in type_dict[type_]:
            print("processing Column - ", col)
            if type_ == "bool":
                data.loc[:, col] = data[col].astype('bool')
            elif type_ == "cat":
                if is_ohe:
                    print("Creating One Hot Encodings...")
                    data = pd.concat([data, pd.get_dummies(data[col])], axis=1)
                    del data[col]
                else:
                    print("Converting to Categorical data types...")
                    data.loc[:, col] = pd.Categorical(data[col])
    return data

In [29]:
def get_time_series_feat(data, is_test=False):
    old_length = len(data)
    data = pd.merge(data, week_map_pd, on="week", how="inner")
    new_length = len(data)
    assert (old_length == new_length), "Change in the Dataframe length after including Month & Quarter"
    if is_test:
        return data
    else:
        monthly_order_agg = data.groupby(["month", "center_id", "meal_id"]).agg({"num_orders": "mean"}).rename(
            columns={"num_orders": "num_order_monthly_mean"}).reset_index()
        quarterly_order_agg = data.groupby(["quarter", "center_id", "meal_id"]).agg({"num_orders": "mean"}).rename(
            columns={"num_orders": "num_order_quarterly_mean"}).reset_index()
        return data, monthly_order_agg, quarterly_order_agg

In [13]:
# temp = get_time_series_feat(train_pd)
# get_average(13)

In [14]:
def get_window(window, stop, start=1):
    start = start
    while start <= stop:
        if stop - start < window:
            yield (start, stop)
        else:
            yield (start, start + window)
        start = start + window


def get_average(window, start=1, stop=end_week+1):
    windows_list = get_window(window, stop, start)
    window_week_map = {}
    for counter, range_ in enumerate(windows_list):
        start, stop = range_
        window_week_map[counter + 1] = list(range(start, stop))
    return window_week_map

In [15]:
dtypes_dict = {'bool': ["emailer_for_promotion", "homepage_featured"],
               'cat': ["cuisine_category", "center_type"]}
joined_pd = pd.merge(train_pd, meal_info_pd, on="meal_id", how="left")
joined_pd = pd.merge(joined_pd, fc_info_pd, on="center_id", how="left")
joined_pd.sort_values(["week", "center_id", "meal_id"], ascending=True, inplace=True)
print("All the data merged shape - ", joined_pd.shape)
joined_pd = create_additional_features(joined_pd)

All the data merged shape -  (456548, 15)


In [16]:
joined_pd = correct_dtypes(joined_pd, dtypes_dict, is_ohe=True)

Processing Type -  bool
processing Column -  emailer_for_promotion
processing Column -  homepage_featured
Processing Type -  cat
processing Column -  cuisine_category
Creating One Hot Encodings...
processing Column -  center_type
Creating One Hot Encodings...


In [17]:
joined_pd.shape

(456548, 38)

In [20]:
joined_pd.head().T

Unnamed: 0,0,1,2,3,4
id,1436842,1205013,1447751,1014968,1003563
week,1,2,3,4,5
center_id,10,10,10,10,10
meal_id,1062,1062,1062,1062,1062
checkout_price,181.39,183.36,184.36,182.36,183.39
base_price,181.39,182.36,182.36,183.36,181.39
emailer_for_promotion,False,False,False,False,False
homepage_featured,False,False,False,False,False
num_orders,865,782,851,1202,958
city_code,590,590,590,590,590


In [22]:
joined_pd, month_order_agg, quarter_order_agg = get_time_series_feat(joined_pd)
joined_pd = pd.merge(joined_pd, month_order_agg, on=["month", "center_id", "meal_id"], how="left")
joined_pd = pd.merge(joined_pd, quarter_order_agg, on=["quarter", "center_id", "meal_id"], how="left")

In [23]:
joined_pd.shape

(456548, 42)

In [25]:
joined_pd.head().T

Unnamed: 0,0,1,2,3,4
id,1436842,1142165,1421626,1260561,1411421
week,1,1,1,1,1
center_id,10,10,10,10,10
meal_id,1062,1109,1198,1207,1216
checkout_price,181.39,231.86,129.04,322.07,455.93
base_price,181.39,276.48,196,382.18,454.93
emailer_for_promotion,False,True,False,False,False
homepage_featured,False,False,False,False,False
num_orders,865,2672,269,769,54
city_code,590,590,590,590,590


In [26]:
# print(joined_pd.month.value_counts(dropna=False))
# print(joined_pd.quarter.value_counts(dropna=False))

In [27]:
joined_test = pd.merge(test_pd, meal_info_pd, on="meal_id", how="left")
joined_test = pd.merge(joined_test, fc_info_pd, on="center_id", how="left")
joined_test.sort_values(["week", "center_id", "meal_id"], ascending=True, inplace=True)
print("All the TEST data merged shape - ", joined_test.shape)

All the TEST data merged shape -  (32573, 14)


In [28]:
joined_test = create_additional_features(joined_test)
joined_test = correct_dtypes(joined_test, dtypes_dict, is_ohe=True)
joined_test.shape

Processing Type -  bool
processing Column -  emailer_for_promotion
processing Column -  homepage_featured
Processing Type -  cat
processing Column -  cuisine_category
Creating One Hot Encodings...
processing Column -  center_type
Creating One Hot Encodings...


(32573, 37)

In [30]:
joined_test = get_time_series_feat(joined_test, is_test=True)

In [31]:
joined_test = pd.merge(joined_test, month_order_agg, on=["month", "center_id", "meal_id"], how="left")
joined_test = pd.merge(joined_test, quarter_order_agg, on=["quarter", "center_id", "meal_id"], how="left")

In [32]:
joined_test.shape

(32573, 41)

In [34]:
def train_val_index(data, val_weeks=10):
    val_start = data.week.max() - val_weeks
    data.reset_index(drop=True, inplace=True)
    train_index = data[data.week <= val_start].index.tolist()
    val_index = data[data.week > val_start].index.tolist()
    return train_index, val_index

In [35]:
train_records, val_records = train_val_index(joined_pd)

In [36]:
len(train_records), len(val_records)

(423727, 32821)

In [37]:
def data_transform(data_pd, rem_cols, is_test=False):
    if is_test:
        remove_cols = rem_cols
        predictor = None
    else:
        remove_cols = rem_cols + ["num_orders"]
        # ["id", 'week', "base_price", "center_id", "meal_id"]
        predictor = "num_orders"

    req_cols = []
    for col in data_pd.columns:
        if col not in remove_cols:
            req_cols.append(col)
    print("columns used for prediction - ", req_cols)
    dataset1 = data_pd[req_cols]

    if is_test:
        target1 = None
    else:
        target1 = data_pd[predictor].values
        print("Target shape - ", target1.shape)
    print("Input data shape - ", dataset1.shape)
    return dataset1, target1

In [38]:
joined_pd.shape

(456548, 42)

In [39]:
joined_pd.isnull().sum()[joined_pd.isnull().sum() > 0]

base_minus_checkout_std    3
dtype: int64

In [40]:
joined_pd.base_minus_checkout_std.fillna(0.0, inplace=True)

In [41]:
# joined_pd[(joined_pd["region_code"] == 85) & 
#           (joined_pd["Continental-Fish"] == 1) & 
#           (joined_pd["week"] == 85) & 
#           (joined_pd["TYPE_C"] == 1)]["base_minus_checkout_std"]

In [32]:
# joined_pd[joined_pd.base_minus_checkout_std.isnull()].T

In [43]:
joined_test.base_minus_checkout_std.fillna(0.0, inplace=True)

In [44]:
joined_test.isnull().sum()[joined_test.isnull().sum() > 0]

num_order_monthly_mean      23246
num_order_quarterly_mean      425
dtype: int64

In [35]:
# meal_info_pd.head()

In [36]:
# month_meal = pd.merge(month_order_agg, meal_info_pd, on=["meal_id"], how="inner")
# month_meal = pd.merge(month_meal, fc_info_pd, on=["center_id"], how="inner")

In [37]:
# quarter_meal = pd.merge(month_order_agg, meal_info_pd, on=["meal_id"], how="inner")
# month_meal = pd.merge(month_meal, fc_info_pd, on=["center_id"], how="inner")

In [38]:
# joined_test[joined_test.num_order_monthly_mean.isnull()].T

In [39]:
joined_pd.columns

Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders', 'city_code',
       'region_code', 'op_area', 'base_minus_checkout',
       'base_minus_checkout_mean', 'base_minus_checkout_std',
       'base_minus_checkout_max', 'base_lt_checkout', 'base_gt_checkout',
       'Continental-Beverages', 'Continental-Fish', 'Continental-Pizza',
       'Continental-Seafood', 'Indian-Beverages', 'Indian-Biryani',
       'Indian-Desert', 'Indian-Rice Bowl', 'Italian-Beverages',
       'Italian-Pasta', 'Italian-Salad', 'Italian-Sandwich', 'Thai-Beverages',
       'Thai-Extras', 'Thai-Other Snacks', 'Thai-Soup', 'Thai-Starters',
       'TYPE_A', 'TYPE_B', 'TYPE_C', 'month', 'quarter',
       'num_order_monthly_mean', 'num_order_quarterly_mean'],
      dtype='object')

In [40]:
cols_to_remove = ['id', 'week', 'center_id', 'meal_id', 'num_order_monthly_mean', 'num_order_quarterly_mean']

In [41]:
train_set, train_target = data_transform(joined_pd.copy(), rem_cols=cols_to_remove)
test_set, _ = data_transform(joined_test.copy(), rem_cols=cols_to_remove, is_test=True)

columns used for prediction -  ['checkout_price', 'base_price', 'emailer_for_promotion', 'homepage_featured', 'city_code', 'region_code', 'op_area', 'base_minus_checkout', 'base_minus_checkout_mean', 'base_minus_checkout_std', 'base_minus_checkout_max', 'base_lt_checkout', 'base_gt_checkout', 'Continental-Beverages', 'Continental-Fish', 'Continental-Pizza', 'Continental-Seafood', 'Indian-Beverages', 'Indian-Biryani', 'Indian-Desert', 'Indian-Rice Bowl', 'Italian-Beverages', 'Italian-Pasta', 'Italian-Salad', 'Italian-Sandwich', 'Thai-Beverages', 'Thai-Extras', 'Thai-Other Snacks', 'Thai-Soup', 'Thai-Starters', 'TYPE_A', 'TYPE_B', 'TYPE_C', 'month', 'quarter']
Target shape -  (456548,)
Input data shape -  (456548, 35)
columns used for prediction -  ['checkout_price', 'base_price', 'emailer_for_promotion', 'homepage_featured', 'city_code', 'region_code', 'op_area', 'base_minus_checkout', 'base_minus_checkout_mean', 'base_minus_checkout_std', 'base_minus_checkout_max', 'base_lt_checkout', 

In [42]:
from sklearn.model_selection import ShuffleSplit

shuffle = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2019)
train_index, test_index = list(shuffle.split(train_set))[2]
print("Train - Val split | ", len(train_index), " - ", len(test_index))

Train - Val split |  365238  -  91310


In [43]:
# splitting train and test on weeks, val on lasty 10 weeks
X_train = train_set.iloc[train_records].values
X_val = train_set.iloc[val_records].values
y_train = train_target[train_records]
y_val = train_target[val_records]

X_test = test_set.values

print("TRAIN SHAPE || data - {} | target - {}".format(X_train.shape,
                                                      y_train.shape))
print("VAL SHAPE || data - {} | target - {}".format(X_val.shape,
                                                    y_val.shape))
print("TEST SHAPE || data - {} ".format(X_test.shape))

TRAIN SHAPE || data - (423727, 35) | target - (423727,)
VAL SHAPE || data - (32821, 35) | target - (32821,)
TEST SHAPE || data - (32573, 35) 


In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def rms_log_error(true, pred):
    return np.sqrt(mean_squared_error(true, pred))


def model_results(model):
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    print("Train In-sample metric : ", rms_log_error(y_train,
                                                     y_train_pred))
    print("Validation metric : ", rms_log_error(y_val,
                                                y_val_pred))
    
    print("R-Square values || TRAIN - {} || VAL - {}".format(model.score(X_train, y_train), 
                                                             model.score(X_val, y_val)))


In [45]:
rf_config = {"n_estimators": 200, 
             "max_depth": 8,
             "random_state": 111, "n_jobs": 4,"verbose": 1}
rf = RandomForestRegressor(**rf_config)
rf.fit(X_train, y_train)
model_results(rf)
gc.collect()

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   29.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


Train In-sample metric :  215.10472930989474
Validation metric :  177.05798606624418


18

In [46]:
rf.score(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.7s finished


0.7159095263834951

In [47]:
rf.score(X_val, y_val)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


0.5902239667556687

In [51]:
rf_config = {"n_estimators": 300, 
             "max_depth": 8,
             "random_state": 111, "n_jobs": 4}
rf = RandomForestRegressor(**rf_config)
rf.fit(X_train, y_train)
model_results(rf)
gc.collect()

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   28.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  3.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.3s finished


Train In-sample metric :  215.2485837122776
Validation metric :  176.95605423354874


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


R-Square values || TRAIN - 0.7155294200896225 || VAL - 0.590695644986891


[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.3s finished


12

In [52]:
rf_config = {"n_estimators": 500, 
             "max_depth": 8,
             "random_state": 111, "n_jobs": 4}
rf = RandomForestRegressor(**rf_config)
rf.fit(X_train, y_train)
model_results(rf)
gc.collect()

Train In-sample metric :  215.36754843107454
Validation metric :  177.02309083864606
R-Square values || TRAIN - 0.7152148878038522 || VAL - 0.5903854711214442


30

In [75]:
# Train In-sample metric :  166.52392347627202
# Validation metric :  163.69684683292664
rf_config = {"n_estimators": 1000,
             "max_depth": 8,
             "min_samples_split": 0.003,
             "max_features": 'sqrt',
             "random_state": 111,
             "n_jobs": 4,
             "verbose": 1}

rf = RandomForestRegressor(**rf_config)

rf.fit(X_train, y_train)

model_results(rf)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   44.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   12.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Paral

Train In-sample metric :  166.52392347627202
Validation metric :  163.69684683292664


[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.9s finished


In [None]:
# Train In-sample metric :  105.15903034359839
# Validation metric :  136.502376993227
rf_config = {"n_estimators": 800,
             "max_depth": 20,
             "max_features": 'sqrt',
             "random_state": 111,
             "n_jobs": 4,
             "verbose": 1}

rf = RandomForestRegressor(**rf_config)

rf.fit(X_train, y_train)

model_results(rf)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  3.4min


In [None]:
rf_config = {"n_estimators": 1500,
             "max_depth": 15,
             "max_features": 'log2',
             "min_samples_split": 0.003, 
             "random_state": 2019,
             "n_jobs": 4,
             "verbose": 1}

rf = RandomForestRegressor(**rf_config)

rf.fit(X_train, y_train)

model_results(rf)

In [87]:
# max_depth = 12
# Train In-sample metric :  197.20422421330088
# Validation metric :  188.3555629074162
rf_config = {"n_estimators": 1500,
             "max_depth": 10,
             "max_features": 'log2',
             "min_samples_split": 0.003, 
             "random_state": 2019,
             "n_jobs": 4,
             "verbose": 1}

rf = RandomForestRegressor(**rf_config)

rf.fit(X_train, y_train)

model_results(rf)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   49.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  5.7min
[Parallel(n_jobs=4)]: Done 1500 out of 1500 | elapsed:  7.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:   21.8s
[Parallel(n_jobs=4)]: Done 1500 out of 1500 | elapsed:   26.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Par

Train In-sample metric :  198.0867908210644
Validation metric :  189.17593860992582


[Parallel(n_jobs=4)]: Done 1500 out of 1500 | elapsed:    6.5s finished


In [79]:
rf_config = {"n_estimators": 700,
             "max_depth": 12,
             "max_features": 'sqrt',
             "random_state": 111,
             "n_jobs": 4,
             "verbose": 1}

rf = RandomForestRegressor(**rf_config)

rf.fit(X_train, y_train)

model_results(rf)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:   14.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s


Train In-sample metric :  128.43748620593917
Validation metric :  143.0822366528206


[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    3.5s finished
