In [120]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib as plp
import lightgbm as lgb

In [92]:
# Load the long training file. `unit_sales` is converted while parsing:
# values > 0 become log1p(value); anything that is not > 0 becomes 0.
df_train = pd.read_csv(
    '../Data/train_set_long.csv',
    usecols=[1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
    converters={
        'unit_sales': lambda raw: (
            0 if not (float(raw) > 0) else np.log1p(float(raw))
        ),
    },
)

In [93]:
# Load the test rows, then key them by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [94]:
# Load the short training file used for validation. `unit_sales` gets the
# same parse-time conversion as the long file: log1p for values > 0, else 0.
df_val = pd.read_csv(
    '../Data/train_set_short.csv',
    usecols=[1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
    converters={
        'unit_sales': lambda raw: (
            0 if not (float(raw) > 0) else np.log1p(float(raw))
        ),
    },
)

In [123]:
# Load items.csv and key the table by item_nbr.
items = pd.read_csv("../Data/items.csv").set_index("item_nbr")

In [95]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one column per date.
# (store, item, date) combinations with no record are filled with 0.
# This rebinds df_train, so the cell cannot be re-run without reloading it.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [96]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one column per date.
# (store, item, date) combinations with no record are filled with 0.
# This rebinds df_val, so the cell cannot be re-run without reloading it.
df_val = (
    df_val
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [97]:
# After the pivot each frame has a two-level column index; keep only
# level 1 (the dates) so that columns can be selected directly by date.
for _frame in (df_train, df_val):
    _frame.columns = _frame.columns.get_level_values(1)
del _frame

In [98]:
def get_timespan(df, dt, minus, periods):
    """Return the columns of `df` for a run of consecutive days.

    Args:
        df: frame whose column labels are dates.
        dt: reference date.
        minus: number of days before `dt` at which the window starts.
        periods: number of daily columns to select.

    Returns:
        The sub-frame of `df` restricted to those `periods` date columns.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

In [105]:
def prepare_dataset(t2017, is_train=True):
    """Build features (and optionally labels) from `df_train` for one window.

    Features are the per-row means of the 3 and of the 7 daily columns that
    immediately precede `t2017`.

    Args:
        t2017: first day of the 16-day label window.
        is_train: when True, also return the labels.

    Returns:
        `X` alone when `is_train` is False; otherwise `(X, y)` where `y` is
        the array of the 16 daily columns of `df_train` starting at `t2017`.
    """
    mean_3 = get_timespan(df_train, t2017, 3, 3).mean(axis=1).values
    mean_7 = get_timespan(df_train, t2017, 7, 7).mean(axis=1).values
    X = pd.DataFrame({"mean_3_2017": mean_3, "mean_7_2017": mean_7})
    if not is_train:
        return X
    label_days = pd.date_range(t2017, periods=16)
    y = df_train[label_days].values
    return X, y

In [106]:
print("Preparing dataset...")
# Anchor date of the first training window; three more windows follow,
# each shifted forward by 7 days, and all four are stacked row-wise.
t2017 = date(2017, 6, 20)
feature_frames = []
label_blocks = []
for week in range(4):
    X_week, y_week = prepare_dataset(t2017 + timedelta(days=7 * week))
    feature_frames.append(X_week)
    label_blocks.append(y_week)
X_train = pd.concat(feature_frames, axis=0)
y_train = np.concatenate(label_blocks, axis=0)
# Release the per-window lists now that the stacked copies exist.
del feature_frames, label_blocks

Preparing dataset...


In [129]:
# Size of the stacked feature matrix. The recorded output (618140, 2) is
# 4 windows x the 154535 rows of df_train, with 2 feature columns.
X_train.shape

(618140, 2)

In [108]:
# Size of the pivoted training frame: rows are (store_nbr, item_nbr) pairs,
# columns are dates.
df_train.shape

(154535, 57)

In [130]:
# Size of the stacked label matrix: one column per day of the 16-day label
# window, stacked in the same window order as X_train.
y_train.shape

(618140, 16)

In [111]:
print("Training and predicting models...")
# LightGBM settings passed unchanged to every lgb.train call below.
params = dict(
    num_leaves=2 ** 5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

Training and predicting models...


In [113]:
def prepare_val_dataset(t2017, is_train=True):
    """Build features (and optionally labels) from `df_val` for one window.

    Features are the per-row means of the 3 and of the 7 daily columns that
    immediately precede `t2017`.

    Args:
        t2017: first day of the 16-day label window.
        is_train: when True, also return the labels.

    Returns:
        `X` alone when `is_train` is False; otherwise `(X, y)` where `y` is
        the array of the 16 daily columns of `df_val` starting at `t2017`.
    """
    mean_3 = get_timespan(df_val, t2017, 3, 3).mean(axis=1).values
    mean_7 = get_timespan(df_val, t2017, 7, 7).mean(axis=1).values
    X = pd.DataFrame({"mean_3_2017": mean_3, "mean_7_2017": mean_7})
    if not is_train:
        return X
    label_days = pd.date_range(t2017, periods=16)
    y = df_val[label_days].values
    return X, y

In [115]:
# Validation features and 16-day labels, built from df_val for the window
# that starts on 2017-06-08.
X_val, y_val = prepare_val_dataset(date(2017,6,8))

In [141]:
# Load valid_set.csv; it is later pivoted and used to build X_test.
# NOTE(review): unlike the train_set_long / train_set_short loaders, no
# log1p converter is applied to `unit_sales` here. Confirm that this file
# is already on the same scale as the training features.
df_g = pd.read_csv(
    "../Data/valid_set.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)

In [142]:
# Pivot to wide form: one row per (store_nbr, item_nbr), one column per date,
# with missing (store, item, date) combinations filled with 0.
df_g = (
    df_g
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the two-level column index.
df_g.columns = df_g.columns.get_level_values(1)

In [143]:
def prepare_g_dataset(t2017, is_train=True):
    """Build features (and optionally labels) from `df_g` for one window.

    Features are the per-row means of the 3 and of the 7 daily columns that
    immediately precede `t2017`.

    Args:
        t2017: first day of the 16-day label window.
        is_train: when True, also return the labels.

    Returns:
        `X` alone when `is_train` is False; otherwise `(X, y)` where `y` is
        the array of the 16 daily columns of `df_g` starting at `t2017`.
    """
    X = pd.DataFrame({
        "mean_3_2017": get_timespan(df_g, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_g, t2017, 7, 7).mean(axis=1).values,
    })
    if is_train:
        # Labels must come from the same frame as the features. This branch
        # previously read df_train, whose rows do not correspond to the rows
        # of df_g, so X and y would have been misaligned.
        y = df_g[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [148]:
# Features for the window starting 2017-08-16, computed from df_g.
# is_train=False makes the helper return the features only (no labels).
X_test = prepare_g_dataset(date(2017, 8, 16), is_train=False)

In [151]:
MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []

# Train one independent model per forecast day (16 in total). Each model sees
# the same features and a different column of the label matrix.
# NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword
# arguments were removed from lgb.train in LightGBM 4.0; switch to the
# callback equivalents if the environment is upgraded.
for horizon in range(16):
    rule = "=" * 50
    print(rule)
    print("Step %d" % (horizon + 1))
    print(rule)

    dtrain = lgb.Dataset(
        X_train,
        label=y_train[:, horizon],
        categorical_feature=cate_vars,
    )
    dval = lgb.Dataset(
        X_val,
        label=y_val[:, horizon],
        reference=dtrain,
        categorical_feature=cate_vars,
    )
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        early_stopping_rounds=50,
        verbose_eval=50,
    )

    # Report features ordered by total gain, largest first.
    ranked_gain = sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\n".join("%s: %.2f" % pair for pair in ranked_gain))

    # Predict with the best iteration found by early stopping, falling back
    # to MAX_ROUNDS when best_iteration is falsy.
    n_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=n_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=n_rounds))

# val_pred holds one array per day; transpose to (rows, 16) to match y_val.
val_matrix = np.array(val_pred).transpose()
print("Validation mse:", mean_squared_error(y_val, val_matrix))

Step 1
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.401534	valid_1's l2: 0.39887
[100]	training's l2: 0.396613	valid_1's l2: 0.399033
Early stopping, best iteration is:
[62]	training's l2: 0.398262	valid_1's l2: 0.398025
mean_7_2017: 1734482.91
mean_3_2017: 1462235.85
Step 2
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.415356	valid_1's l2: 0.438594
[100]	training's l2: 0.409987	valid_1's l2: 0.432872
[150]	training's l2: 0.409243	valid_1's l2: 0.431943
[200]	training's l2: 0.408647	valid_1's l2: 0.431242
[250]	training's l2: 0.408138	valid_1's l2: 0.430644
[300]	training's l2: 0.407695	valid_1's l2: 0.430124
[350]	training's l2: 0.407303	valid_1's l2: 0.429676
[400]	training's l2: 0.406958	valid_1's l2: 0.429282
[450]	training's l2: 0.406652	valid_1's l2: 0.428947
[500]	training's l2: 0.406382	valid_1's l2: 0.42867
[550]	training's l2: 0.406134	valid_1's l2: 0.428421
[600]	training's l2: 0.405916	valid_1's l2: 0.428204
[650]	training's l2: 0.405723	valid_1's l2: 0.428026
[700]	training's l2: 0.405547	valid_1's l2: 0.427869
[750]	training's l2: 0.405387	valid_1's l2: 0.427747
[800]	training's l2: 0.405249	valid_1's l2: 0.427643
[850]	training's l2: 0.405124	valid_1's l2: 0.427559
[900]	training's l2: 0.405008	valid_1's l2: 0.427466
[950]	training's l2: 0.404907	valid_1's l2: 0.42



[50]	training's l2: 0.417812	valid_1's l2: 0.561624
[100]	training's l2: 0.4132	valid_1's l2: 0.54132
[150]	training's l2: 0.412463	valid_1's l2: 0.537189
[200]	training's l2: 0.411876	valid_1's l2: 0.534586
[250]	training's l2: 0.411374	valid_1's l2: 0.53212
[300]	training's l2: 0.410947	valid_1's l2: 0.529898
[350]	training's l2: 0.410574	valid_1's l2: 0.527797
[400]	training's l2: 0.410244	valid_1's l2: 0.526
[450]	training's l2: 0.409956	valid_1's l2: 0.52405
[500]	training's l2: 0.409701	valid_1's l2: 0.522428
[550]	training's l2: 0.409472	valid_1's l2: 0.520836
[600]	training's l2: 0.409271	valid_1's l2: 0.51931
[650]	training's l2: 0.40909	valid_1's l2: 0.518039
[700]	training's l2: 0.408929	valid_1's l2: 0.516725
[750]	training's l2: 0.408785	valid_1's l2: 0.515526
[800]	training's l2: 0.408654	valid_1's l2: 0.514428
[850]	training's l2: 0.408541	valid_1's l2: 0.513307
[900]	training's l2: 0.408435	valid_1's l2: 0.512333
[950]	training's l2: 0.40834	valid_1's l2: 0.511322
[1000



[50]	training's l2: 0.450831	valid_1's l2: 0.607413
[100]	training's l2: 0.445344	valid_1's l2: 0.585465
[150]	training's l2: 0.444352	valid_1's l2: 0.580797
[200]	training's l2: 0.443555	valid_1's l2: 0.577427
[250]	training's l2: 0.442872	valid_1's l2: 0.574458
[300]	training's l2: 0.442283	valid_1's l2: 0.571611
[350]	training's l2: 0.44177	valid_1's l2: 0.569089
[400]	training's l2: 0.441315	valid_1's l2: 0.566775
[450]	training's l2: 0.440915	valid_1's l2: 0.564525
[500]	training's l2: 0.44056	valid_1's l2: 0.562529
[550]	training's l2: 0.440237	valid_1's l2: 0.560553
[600]	training's l2: 0.439956	valid_1's l2: 0.558782
[650]	training's l2: 0.439704	valid_1's l2: 0.557113
[700]	training's l2: 0.439473	valid_1's l2: 0.555526
[750]	training's l2: 0.439269	valid_1's l2: 0.554166
[800]	training's l2: 0.439085	valid_1's l2: 0.552826
[850]	training's l2: 0.43892	valid_1's l2: 0.55147
[900]	training's l2: 0.438773	valid_1's l2: 0.550184
[950]	training's l2: 0.438639	valid_1's l2: 0.54912



[50]	training's l2: 0.466842	valid_1's l2: 0.476848
[100]	training's l2: 0.461166	valid_1's l2: 0.480138
Early stopping, best iteration is:
[52]	training's l2: 0.465903	valid_1's l2: 0.476784
mean_7_2017: 1939387.87
mean_3_2017: 1626904.30
Step 6
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.493442	valid_1's l2: 0.519423
Early stopping, best iteration is:
[42]	training's l2: 0.500114	valid_1's l2: 0.518182
mean_7_2017: 2015955.54
mean_3_2017: 1767131.24
Step 7




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.467606	valid_1's l2: 0.46863
[100]	training's l2: 0.462531	valid_1's l2: 0.464898
[150]	training's l2: 0.461923	valid_1's l2: 0.4648
[200]	training's l2: 0.461453	valid_1's l2: 0.464758
Early stopping, best iteration is:
[186]	training's l2: 0.461576	valid_1's l2: 0.464729
mean_7_2017: 1712882.59
mean_3_2017: 1398913.80
Step 8
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.470131	valid_1's l2: 0.458676
[100]	training's l2: 0.465193	valid_1's l2: 0.458113
Early stopping, best iteration is:
[65]	training's l2: 0.466644	valid_1's l2: 0.457705
mean_7_2017: 1575093.86
mean_3_2017: 1254822.93
Step 9
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.474263	valid_1's l2: 0.519496
[100]	training's l2: 0.468874	valid_1's l2: 0.51297
[150]	training's l2: 0.46782	valid_1's l2: 0.511144
[200]	training's l2: 0.466955	valid_1's l2: 0.509622
[250]	training's l2: 0.466211	valid_1's l2: 0.508282
[300]	training's l2: 0.46555	valid_1's l2: 0.507078
[350]	training's l2: 0.464974	valid_1's l2: 0.506026
[400]	training's l2: 0.464455	valid_1's l2: 0.505081
[450]	training's l2: 0.463994	valid_1's l2: 0.504193
[500]	training's l2: 0.463585	valid_1's l2: 0.503399
[550]	training's l2: 0.463217	valid_1's l2: 0.502712
[600]	training's l2: 0.462881	valid_1's l2: 0.50205
[650]	training's l2: 0.462588	valid_1's l2: 0.501506
[700]	training's l2: 0.462319	valid_1's l2: 0.500989
[750]	training's l2: 0.462075	valid_1's l2: 0.500516
[800]	training's l2: 0.461852	valid_1's l2: 0.500068
[850]	training's l2: 0.461656	valid_1's l2: 0.499685
[900]	training's l2: 0.461479	valid_1's l2: 0.499365
[950]	training's l2: 0.461323	valid_1's l2: 0.49905



[50]	training's l2: 0.463616	valid_1's l2: 0.694723
[100]	training's l2: 0.458944	valid_1's l2: 0.67443
[150]	training's l2: 0.457998	valid_1's l2: 0.669223
[200]	training's l2: 0.457215	valid_1's l2: 0.665172
[250]	training's l2: 0.456545	valid_1's l2: 0.661656
[300]	training's l2: 0.455957	valid_1's l2: 0.658249
[350]	training's l2: 0.455447	valid_1's l2: 0.655255
[400]	training's l2: 0.454989	valid_1's l2: 0.652468
[450]	training's l2: 0.454583	valid_1's l2: 0.649677
[500]	training's l2: 0.454223	valid_1's l2: 0.647212
[550]	training's l2: 0.453898	valid_1's l2: 0.644995
[600]	training's l2: 0.453608	valid_1's l2: 0.642856
[650]	training's l2: 0.453352	valid_1's l2: 0.640832
[700]	training's l2: 0.453117	valid_1's l2: 0.638818
[750]	training's l2: 0.452905	valid_1's l2: 0.637132
[800]	training's l2: 0.452718	valid_1's l2: 0.635476
[850]	training's l2: 0.452548	valid_1's l2: 0.633878
[900]	training's l2: 0.452395	valid_1's l2: 0.632306
[950]	training's l2: 0.452258	valid_1's l2: 0.63



[50]	training's l2: 0.494761	valid_1's l2: 0.562912
[100]	training's l2: 0.489328	valid_1's l2: 0.551244
[150]	training's l2: 0.488178	valid_1's l2: 0.54797
[200]	training's l2: 0.487228	valid_1's l2: 0.545314
[250]	training's l2: 0.486418	valid_1's l2: 0.542933
[300]	training's l2: 0.485707	valid_1's l2: 0.540763
[350]	training's l2: 0.485085	valid_1's l2: 0.53882
[400]	training's l2: 0.48453	valid_1's l2: 0.537002
[450]	training's l2: 0.484035	valid_1's l2: 0.535314
[500]	training's l2: 0.483596	valid_1's l2: 0.533803
[550]	training's l2: 0.483207	valid_1's l2: 0.532464
[600]	training's l2: 0.482857	valid_1's l2: 0.531167
[650]	training's l2: 0.482547	valid_1's l2: 0.529917
[700]	training's l2: 0.482262	valid_1's l2: 0.528883
[750]	training's l2: 0.482009	valid_1's l2: 0.527869
[800]	training's l2: 0.481774	valid_1's l2: 0.526888
[850]	training's l2: 0.481571	valid_1's l2: 0.526013
[900]	training's l2: 0.481383	valid_1's l2: 0.525189
[950]	training's l2: 0.481215	valid_1's l2: 0.5244



[50]	training's l2: 0.515477	valid_1's l2: 0.509306
[100]	training's l2: 0.510125	valid_1's l2: 0.510853
Early stopping, best iteration is:
[58]	training's l2: 0.512768	valid_1's l2: 0.508927
mean_7_2017: 1814159.00
mean_3_2017: 1512064.56
Step 13
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.539444	valid_1's l2: 0.533573
Early stopping, best iteration is:
[46]	training's l2: 0.541957	valid_1's l2: 0.53333
mean_7_2017: 1897345.92
mean_3_2017: 1655501.19
Step 14
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.507569	valid_1's l2: 0.495679
[100]	training's l2: 0.502717	valid_1's l2: 0.492535
[150]	training's l2: 0.502134	valid_1's l2: 0.492364
[200]	training's l2: 0.501683	valid_1's l2: 0.492207
[250]	training's l2: 0.501307	valid_1's l2: 0.492173
[300]	training's l2: 0.500983	valid_1's l2: 0.492135
Early stopping, best iteration is:
[272]	training's l2: 0.501159	valid_1's l2: 0.49212
mean_7_2017: 1631321.90
mean_3_2017: 1334732.91
Step 15
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.505244	valid_1's l2: 0.482926
[100]	training's l2: 0.5005	valid_1's l2: 0.48498
Early stopping, best iteration is:
[53]	training's l2: 0.504072	valid_1's l2: 0.482844
mean_7_2017: 1489872.57
mean_3_2017: 1191324.36
Step 16
Training until validation scores don't improve for 50 rounds.




[50]	training's l2: 0.507488	valid_1's l2: 0.528305
[100]	training's l2: 0.502389	valid_1's l2: 0.525155
[150]	training's l2: 0.50139	valid_1's l2: 0.524025
[200]	training's l2: 0.500558	valid_1's l2: 0.522936
[250]	training's l2: 0.499842	valid_1's l2: 0.52197
[300]	training's l2: 0.499212	valid_1's l2: 0.521111
[350]	training's l2: 0.498658	valid_1's l2: 0.520391
[400]	training's l2: 0.498158	valid_1's l2: 0.519759
[450]	training's l2: 0.497707	valid_1's l2: 0.519182
[500]	training's l2: 0.497309	valid_1's l2: 0.518654
[550]	training's l2: 0.49695	valid_1's l2: 0.518221
[600]	training's l2: 0.49663	valid_1's l2: 0.51783
[650]	training's l2: 0.496342	valid_1's l2: 0.517498
[700]	training's l2: 0.49608	valid_1's l2: 0.517182
[750]	training's l2: 0.495841	valid_1's l2: 0.516936
[800]	training's l2: 0.495622	valid_1's l2: 0.516694
[850]	training's l2: 0.495432	valid_1's l2: 0.516491
[900]	training's l2: 0.495257	valid_1's l2: 0.516341
[950]	training's l2: 0.495104	valid_1's l2: 0.516202


In [162]:
print("Making submission...")
# test_pred holds one prediction array per forecast day; transpose so that
# rows line up with df_g's (store_nbr, item_nbr) index and columns with days.
y_test = np.array(test_pred).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_g.index, columns=forecast_days)
# Back to long form: one row per (store_nbr, item_nbr, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; test rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Apply expm1 (the inverse of log1p) and clip the result to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('simple_lgbm.csv', float_format='%.4f', index=None)

Making submission...


In [1]:
# NOTE: the recorded run of this cell (execution count 1) raised NameError
# because `items` had not been loaded in that kernel session; run the cell
# that reads items.csv first.
items.shape

NameError: name 'items' is not defined