In [21]:
import pandas as pd
import numpy as np
from datetime import datetime
import tqdm
import re
from sklearn.multioutput import MultiOutputRegressor
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import simplefilter

import matplotlib.pyplot as plt
import lightgbm as lgb

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

In [22]:
# Set Matplotlib defaults
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'


def plot_multistep(y, every=1, ax=None, palette_kwargs=None):
    """Draw every `every`-th row of `y` as its own line on a shared axis.

    Each selected row is re-indexed with a period range that starts at the
    row's index label and has one period per value in the row, then plotted.
    Presumably each row holds a multi-step forecast issued on that date —
    confirm against the caller.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame whose rows are plotted; the row label is used as the start of
        the period range.
    every : int, default 1
        Step used when slicing the rows of `y`.
    ax : matplotlib axes, optional
        Axis to draw on. A new figure/axis pair is created when omitted.
    palette_kwargs : dict, optional
        Overrides for the default `sns.color_palette` arguments
        (`palette='husl'`, `n_colors=16`, `desat=None`).

    Returns
    -------
    The axis that was drawn on.
    """
    color_options = {'palette': 'husl', 'n_colors': 16, 'desat': None}
    color_options.update(palette_kwargs if palette_kwargs is not None else {})
    line_colors = sns.color_palette(**color_options)

    target_ax = ax if ax is not None else plt.subplots()[1]
    target_ax.set_prop_cycle(plt.cycler('color', line_colors))

    for origin, horizon_values in y[::every].iterrows():
        # Spread the row's values over consecutive periods starting at its label.
        horizon_values.index = pd.period_range(start=origin, periods=len(horizon_values))
        horizon_values.plot(ax=target_ax)
    return target_ax

In [23]:
# Every [category, store] pair to model, ordered store by store with the
# three categories cycling inside each store.
groups = [
    [cat_id, store_id]
    for store_id in (
        'CA_1', 'CA_2', 'CA_3', 'CA_4',
        'TX_1', 'TX_2', 'TX_3',
        'WI_1', 'WI_2', 'WI_3',
    )
    for cat_id in ('HOBBIES', 'HOUSEHOLD', 'FOODS')
]

In [27]:
# Keyword arguments passed to every lgb.LGBMRegressor.
# Options left disabled in this notebook: "goss" boosting, num_leaves
# (2 ** 5 - 1), min_data_in_leaf (2 ** 12 - 1), max_bin (100),
# num_boost_round (1400) and num_threads (os.cpu_count()).
lgb_params = dict(
    n_estimators=1000,
    boosting_type="gbdt",
    objective="tweedie",
    tweedie_variance_power=1.1,
    metric="rmse",
    learning_rate=0.01,
    feature_fraction=0.5,
    boost_from_average=False,
    verbose=-1,
    force_row_wise=True,
    seed=42,
)

In [28]:
%%time
# Values of the `trend` column used to select rows.
# NOTE(review): these look like M5 day numbers (end of the validation-phase
# and evaluation-phase histories) — confirm against the feature builder.
LAST_TRAIN_TREND = 1913   # rows with trend <= this value are used for fitting
FORECAST_TREND = 1941     # rows with this trend produce the "_evaluation" forecast
RESULT_DIR = '../result/cat_and_store'


def _predict_with_ids(model, rows, feature_columns, target_columns):
    """Predict all targets for `rows` and attach each row's own `id`.

    The ids are copied positionally (not index-aligned) from the very rows
    that were fed to the model, so a prediction can never be paired with the
    id of a different row.
    """
    predictions = pd.DataFrame(model.predict(rows[feature_columns]), columns=target_columns)
    predictions['id'] = rows['id'].to_numpy()
    return predictions


# Create the output folder up front: each group takes a long time to fit, so
# a missing directory should not be discovered only when writing results.
os.makedirs(RESULT_DIR, exist_ok=True)

# One frame per group, concatenated once after the loop instead of growing a
# DataFrame inside it.
group_submissions = []
for cat_id, store_id in groups:
    feature_df = pd.read_csv(f'../data/cat_and_store_data/{cat_id}_and_{store_id}_features.csv')
    target_df = pd.read_csv(f'../data/cat_and_store_data/{cat_id}_and_{store_id}_target.csv')

    # Model inputs are every feature column except the row id and any column
    # whose name contains 'price'.
    price_cols = [col for col in feature_df.columns if 'price' in col]
    X_columns = feature_df.columns.drop('id').drop(price_cols)

    # assumes target_df rows line up one-to-one with the trend <= 1913
    # feature rows — TODO confirm
    X = feature_df.loc[feature_df['trend'] <= LAST_TRAIN_TREND, X_columns]
    y = target_df

    # One independent LightGBM regressor is fitted per target column.
    model = MultiOutputRegressor(lgb.LGBMRegressor(**lgb_params))
    model.fit(X, y)

    # Forecast from the latest rows; ids are kept as stored in the features.
    y_fore = _predict_with_ids(
        model, feature_df[feature_df['trend'] == FORECAST_TREND], X_columns, target_df.columns
    )

    # Forecast from the last training rows. The ids come from those same rows
    # (previously they were borrowed from the trend == 1941 rows) and are then
    # relabelled from '_evaluation' to '_validation'.
    y_valid = _predict_with_ids(
        model, feature_df[feature_df['trend'] == LAST_TRAIN_TREND], X_columns, target_df.columns
    )
    y_valid['id'] = y_valid['id'].str.replace('_evaluation', '_validation', regex=False)

    all_sub = pd.concat([y_valid, y_fore], axis=0)
    all_sub.to_csv(f'{RESULT_DIR}/lgb_{cat_id}_{store_id}_all_submission.csv', index=False)
    group_submissions.append(all_sub)

    print(f'{cat_id} and {store_id} done')

# The original prepended each group, so the last group comes first; reversing
# the list keeps that row order.
all_subs = pd.concat(group_submissions[::-1], axis=0) if group_submissions else pd.DataFrame()

HOBBIES and CA_1 done
HOUSEHOLD and CA_1 done
FOODS and CA_1 done
HOBBIES and CA_2 done
HOUSEHOLD and CA_2 done
FOODS and CA_2 done
HOBBIES and CA_3 done
HOUSEHOLD and CA_3 done
FOODS and CA_3 done
HOBBIES and CA_4 done
HOUSEHOLD and CA_4 done
FOODS and CA_4 done
HOBBIES and TX_1 done
HOUSEHOLD and TX_1 done
FOODS and TX_1 done
HOBBIES and TX_2 done
HOUSEHOLD and TX_2 done
FOODS and TX_2 done
HOBBIES and TX_3 done
HOUSEHOLD and TX_3 done
FOODS and TX_3 done
HOBBIES and WI_1 done
HOUSEHOLD and WI_1 done
FOODS and WI_1 done
HOBBIES and WI_2 done
HOUSEHOLD and WI_2 done
FOODS and WI_2 done
HOBBIES and WI_3 done
HOUSEHOLD and WI_3 done
FOODS and WI_3 done
CPU times: user 2d 16h 30min 13s, sys: 6h 1min, total: 2d 22h 31min 13s
Wall time: 12h 8min 59s


In [29]:
# Write the combined submission for all category/store groups, without the index.
all_subs.to_csv(
    path_or_buf='../result/lgb_cat_and_store_all_submission.csv',
    index=False,
)