In [2]:
"""LGBM Starter

This is a watered-down version of one of my earlier scripts.
Only very basic features are retained so hopefully it won't ruin the fun for you.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load the training rows. Column positions 1-5 are read, `date` is parsed to
# datetimes and `onpromotion` is forced to bool.
df_train = pd.read_csv(
    "../Data/train.csv",
    usecols=[1, 2, 3, 4, 5],
    # Data rows 1..66458908 are skipped (row 0, the header, is kept) so that
    # loading starts at 2016-01-01, per the original author's note.
    skiprows=range(1, 66458909),
    parse_dates=["date"],
    dtype={"onpromotion": bool},
    converters={
        # Positive sales are stored as log1p(sales); anything <= 0 becomes 0.
        "unit_sales": lambda u: np.log1p(float(u)) if float(u) > 0 else 0,
    },
)

# Load the test rows (column positions 0-4) with the same date / promotion
# handling as the training file.
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={"onpromotion": bool},
)
# Key each test row by (store, item, date).
df_test = df_test.set_index(["store_nbr", "item_nbr", "date"])

# Item metadata table, keyed by item number.
items = pd.read_csv("../Data/items.csv").set_index("item_nbr")

# Restrict the training rows to the 77 consecutive days (7 * 11) that start on
# 2017-05-31, and take a copy of that selection.
df_2017 = df_train.loc[
    df_train["date"].isin(pd.date_range("2017-05-31", periods=7 * 11))
].copy()


# Build a wide promotion table: one row per (store_nbr, item_nbr), one column
# per date, covering the training window followed by the test dates.
# NOTE(review): `promo_2017` is not referenced by the feature code visible in
# this notebook; confirm whether it is still needed.

# Training part: pivot the long `onpromotion` column so dates become columns;
# (store, item, date) combinations with no row are filled with False.
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
# Drop the outer "onpromotion" column level, leaving plain date labels.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
# Test part: df_test is already indexed by (store_nbr, item_nbr, date).
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align test rows to the training (store, item) pairs; pairs absent from the
# test data get False.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
# Place the two date ranges side by side, then free the intermediates.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Pivot sales from long to wide: one row per (store_nbr, item_nbr), one column
# per date; missing (store, item, date) combinations count as 0 sales.
# This rebinds `df_2017` from the long frame to the wide one, so the cell cannot
# simply be re-run on its own output.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the outer "unit_sales" column level, leaving plain date labels.
df_2017.columns = df_2017.columns.get_level_values(1)

# Expand the item metadata to one row per row of the wide sales table, looked
# up by the item number held in the second index level.
items = items.reindex(df_2017.index.get_level_values("item_nbr"))

def get_timespan(df, dt, minus, periods):
    """Return the columns of ``df`` for a run of consecutive days.

    The run starts ``minus`` days before ``dt`` and contains ``periods``
    daily labels; ``df`` is expected to have those dates as column labels.
    """
    first_day = dt - timedelta(days=minus)
    wanted_days = pd.date_range(first_day, periods=periods)
    return df[wanted_days]

def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) anchored at ``t2017``.

    Features are the per-row means of the module-level ``df_2017`` over the
    3, 7 and 14 days immediately before ``t2017``. The returned frame has a
    fresh default index because the means are inserted as raw arrays.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` is the array of
    ``df_2017`` values for the 16 days starting at ``t2017``; otherwise
    returns ``X`` alone.
    """
    # (feature name, look-back length in days); dict order fixes column order.
    lookbacks = (
        ("mean_3_2017", 3),
        ("mean_7_2017", 7),
        ("mean_14_2017", 14),
    )
    X = pd.DataFrame({
        name: get_timespan(df_2017, t2017, days, days).mean(axis=1).values
        for name, days in lookbacks
    })
    if not is_train:
        return X
    target_days = pd.date_range(t2017, periods=16)
    y = df_2017[target_days].values
    return X, y

print("Preparing dataset...")
# Anchor date of the first training window; each later window is one week on.
t2017 = date(2017, 6, 21)
X_l = []
y_l = []
for i in range(4):
    delta = timedelta(weeks=i)
    X_tmp, y_tmp = prepare_dataset(t2017 + delta)
    X_l += [X_tmp]
    y_l += [y_tmp]
# Stack the four windows row-wise into one training set, then drop the lists.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
# Validation window is anchored at 2017-07-26; the test window at 2017-08-16
# has no targets, so only features are built for it.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

# print("Training and predicting models...")
# params = {
#     'num_leaves': 2**5 - 1,
#     'objective': 'regression_l2',
#     'max_depth': 8,
#     'min_data_in_leaf': 50,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.75,
#     'bagging_fraction': 0.75,
#     'bagging_freq': 1,
#     'metric': 'l2',
#     'num_threads': 4
# }

# MAX_ROUNDS = 1000
# val_pred = []
# test_pred = []
# cate_vars = []
# for i in range(16):
#     print("=" * 50)
#     print("Step %d" % (i+1))
#     print("=" * 50)
#     dtrain = lgb.Dataset(
#         X_train, label=y_train[:, i],
#         categorical_feature=cate_vars,
#         weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
#     )
#     dval = lgb.Dataset(
#         X_val, label=y_val[:, i], reference=dtrain,
#         weight=items["perishable"] * 0.25 + 1,
#         categorical_feature=cate_vars)
#     bst = lgb.train(
#         params, dtrain, num_boost_round=MAX_ROUNDS,
#         valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
#     )
#     print("\n".join(("%s: %.2f" % x) for x in sorted(
#         zip(X_train.columns, bst.feature_importance("gain")),
#         key=lambda x: x[1], reverse=True
#     )))
#     val_pred.append(bst.predict(
#         X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
#     test_pred.append(bst.predict(
#         X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# print("Validation mse:", mean_squared_error(
#     y_val, np.array(val_pred).transpose()))

# print("Making submission...")
# y_test = np.array(test_pred).transpose()
# df_preds = pd.DataFrame(
#     y_test, index=df_2017.index,
#     columns=pd.date_range("2017-08-16", periods=16)
# ).stack().to_frame("unit_sales")
# df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# submission.to_csv('nopromo.csv', float_format='%.4f', index=None)


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Preparing dataset...


In [7]:
# Size check: X_train is the row-wise stack of the four weekly training windows,
# with one column per mean feature.
X_train.shape

(627160, 3)

In [6]:
# NOTE(review): `weight` is not assigned in any cell visible in this notebook
# (it appears only as a keyword argument in the commented-out training code), so
# this cell relies on leftover kernel state. TODO: define it explicitly first.
weight.shape

(156790,)

In [11]:
# Shape of the item metadata table. Going by the execution counters, this ran
# after `items` was reloaded from disk and before it was reindexed again.
items.shape

(4100, 3)

In [9]:
# Reload the item metadata from disk, replacing whatever `items` currently
# holds (same load as in the first cell).
items = pd.read_csv("../Data/items.csv").set_index("item_nbr")

In [10]:
# Shape of the freshly loaded item metadata (one row per item_nbr index entry).
items.shape

(4100, 3)

In [12]:
# Expand `items` to one row per row of df_2017, looked up by the item number in
# level 1 of df_2017's index.
# NOTE(review): this overwrites `items` with its own reindexed version, so the
# cell depends on `items` being the freshly loaded table. This is presumably why
# it was reloaded beforehand; confirm.
items = items.reindex(df_2017.index.get_level_values(1))

In [13]:
# After the reindex, `items` should have as many rows as df_2017.
items.shape

(156790, 3)

In [None]:
# Candidate training weights: the perishable column repeated four times
# end-to-end (once per training window; it is concatenated, not summed), then
# scaled so a value v becomes 0.25 * v + 1.
# Presumably `perishable` is a 0/1 flag, giving weights 1.0 / 1.25; verify.
pd.concat([items["perishable"]] * 4) * 0.25 + 1

In [14]:
# Repeat the perishable column four times end-to-end so its length can be
# compared with X_train (four stacked training windows).
x = pd.concat([items["perishable"]] * 4)

In [15]:
# Length check: this should equal the number of rows in X_train.
x.shape

(627160,)