In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib as plp
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def _to_log_sales(raw_value):
    """Convert one raw ``unit_sales`` CSV field to ``log1p`` scale.

    Values that are zero or negative are mapped to 0 instead of being
    passed to ``np.log1p``.
    """
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


# Load the training sales. Only the CSV columns at positions 1-4 are read
# (presumably date, store_nbr, item_nbr and unit_sales, which are the
# columns the following cells rely on - verify against the file header).
# ``unit_sales`` is stored on a log1p scale from here on.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4],
    converters={'unit_sales': _to_log_sales},
    parse_dates=["date"],
)

In [3]:
# Load the rows to be predicted (first five CSV columns only) and key them
# by (store_nbr, item_nbr, date) so predictions can later be joined on that
# index.
df_test = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [4]:
# Item metadata table, keyed by item number.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [5]:
# Pivot the long sales table into a wide matrix: one row per
# (store_nbr, item_nbr) pair, one column per date. Store/item/date
# combinations with no record are filled with 0.
# NOTE: this cell rebinds ``df_train``, so it can only be run once per
# load of the raw data.
sales_long = df_train.set_index(["store_nbr", "item_nbr", "date"])
sales_wide = sales_long[["unit_sales"]].unstack(level=-1)
df_train = sales_wide.fillna(0)
# Keep only the date level of the (unit_sales, date) column MultiIndex.
df_train.columns = df_train.columns.get_level_values(1)
# Release the intermediates; only the wide matrix is needed afterwards.
del sales_long, sales_wide

In [6]:
# Align the item metadata row-for-row with df_train: level 1 of df_train's
# row index is item_nbr, so an item appears once per store that carries it.
item_ids = df_train.index.get_level_values(1)
items = items.reindex(item_ids)

In [7]:
def get_timespan(df, dt, minus, periods, freq="D"):
    """Select the date columns of ``df`` for a window that starts before ``dt``.

    The window begins ``minus`` days before ``dt`` and contains ``periods``
    timestamps spaced ``freq`` apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are dates (e.g. the wide sales matrix).
    dt : datetime.date or datetime.datetime
        Reference date; the window is positioned relative to it.
    minus : int
        Number of days before ``dt`` at which the window starts.
    periods : int
        Number of date columns to select.
    freq : str, optional
        Spacing between selected dates, passed to ``pandas.date_range``.
        Defaults to ``"D"`` (consecutive days), which is the previous
        hard-coded behaviour; e.g. ``"7D"`` selects one column per week.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the selected date columns, in window order.
        pandas raises ``KeyError`` if a requested date is not a column.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window]

In [8]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally targets) for forecast origin ``t2017``.

    Features are the per-row means of ``df_train`` over the 3, 7, 14 and 16
    days immediately preceding ``t2017``. They are taken as raw arrays, so
    the returned frame has a fresh positional index in ``df_train`` row order.

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day forecast window.
    is_train : bool, optional
        When true (default), also return the targets.

    Returns
    -------
    X : pandas.DataFrame
        One row per ``df_train`` row, one column per trailing-mean feature.
    y : numpy.ndarray
        Only when ``is_train`` is true: ``df_train`` values for the 16 days
        starting at ``t2017``, shape ``(n_rows, 16)``.
    """
    # (feature name, trailing window length in days)
    trailing_windows = (
        ("mean_3_2017", 3),
        ("mean_7_2017", 7),
        ("mean_14_2017", 14),
        ("mean_16_2017", 16),
    )
    feature_columns = {}
    for feature_name, n_days in trailing_windows:
        recent_sales = get_timespan(df_train, t2017, n_days, n_days)
        feature_columns[feature_name] = recent_sales.mean(axis=1).values
    X = pd.DataFrame(feature_columns)

    if not is_train:
        return X

    target_days = pd.date_range(t2017, periods=16)
    y = df_train[target_days].values
    return X, y

In [9]:
print("Preparing dataset...")
# Stack nine training samples whose forecast origins are one week apart,
# starting at t2017 (2017-05-16 ... 2017-07-11).
# NOTE(review): 2017-05-16 is a Tuesday, whereas the validation origin
# (2017-07-23) is a Sunday and the test origin (2017-08-16) a Wednesday, so
# "step k" lands on different weekdays in train, validation and test. The
# last training window's targets (through 2017-07-26) also overlap the first
# validation target days. Confirm both are intended.
t2017 = date(2017, 5, 16)
weekly_samples = [
    prepare_dataset(t2017 + timedelta(days=7 * week)) for week in range(9)
]
feature_parts, target_parts = zip(*weekly_samples)
X_train = pd.concat(list(feature_parts), axis=0)
y_train = np.concatenate(list(target_parts), axis=0)
# Drop the per-week pieces now that they have been concatenated.
del weekly_samples, feature_parts, target_parts

Preparing dataset...


In [10]:
# Hold-out sample used for early stopping and the reported validation MSE:
# features from the days before 2017-07-23, targets for the 16 days from it.
val_origin = date(2017, 7, 23)
X_val, y_val = prepare_dataset(val_origin, is_train=True)

In [11]:
# Features for the window that is actually submitted (16 days starting
# 2017-08-16); no targets exist for it, hence is_train=False.
test_origin = date(2017, 8, 16)
X_test = prepare_dataset(test_origin, is_train=False)

In [12]:
print("Training and predicting models...")
# LightGBM settings shared by every per-day model. Targets are on the log1p
# scale, so the plain L2 objective/metric is squared error in log space.
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
EARLY_STOPPING_ROUNDS = 50  # stop after this many rounds without improvement
LOG_PERIOD = 50             # print evaluation metrics every this many rounds


def _lgb_train_controls():
    """Return fresh early-stopping/logging arguments for one ``lgb.train`` call.

    LightGBM 4.0 removed the ``early_stopping_rounds`` and ``verbose_eval``
    keyword arguments of ``lgb.train`` in favour of callbacks, so callbacks
    are used whenever the installed version provides them. Older releases
    fall back to the legacy keywords. A new dict is built on every call
    because callback objects keep per-run state and must not be shared
    between models.
    """
    if hasattr(lgb, "early_stopping") and hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                lgb.log_evaluation(LOG_PERIOD),
            ]
        }
    return {
        "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
        "verbose_eval": LOG_PERIOD,
    }


val_pred = []
test_pred = []
cate_vars = []  # no categorical features are used
# One independent model per forecast day (one column of y_train).
for i in range(y_train.shape[1]):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
    )

    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars)

    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], **_lgb_train_controls()
    )

    # Report features ranked by total gain, most important first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the best early-stopped iteration; fall back to all rounds
    # if early stopping never triggered.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

# val_pred is (n_steps, n_rows); transpose to match y_val's (n_rows, n_steps).
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))



Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.368974	valid_1's l2: 0.444153
[100]	training's l2: 0.363409	valid_1's l2: 0.428594
[150]	training's l2: 0.362881	valid_1's l2: 0.427605
[200]	training's l2: 0.362513	valid_1's l2: 0.427377
[250]	training's l2: 0.362192	valid_1's l2: 0.427341
[300]	training's l2: 0.361897	valid_1's l2: 0.427318
[350]	training's l2: 0.361621	valid_1's l2: 0.42725
[400]	training's l2: 0.36137	valid_1's l2: 0.427153
[450]	training's l2: 0.361134	valid_1's l2: 0.427036
[500]	training's l2: 0.360911	valid_1's l2: 0.427055
[550]	training's l2: 0.3607	valid_1's l2: 0.426976
[600]	training's l2: 0.360476	valid_1's l2: 0.427016
Early stopping, best iteration is:
[563]	training's l2: 0.360637	valid_1's l2: 0.426958
mean_7_2017: 3807970.38
mean_14_2017: 2626975.89
mean_16_2017: 825629.61
mean_3_2017: 800933.95
Step 2




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.381349	valid_1's l2: 0.384926
[100]	training's l2: 0.375689	valid_1's l2: 0.378575
[150]	training's l2: 0.375077	valid_1's l2: 0.378182
[200]	training's l2: 0.374649	valid_1's l2: 0.37797
[250]	training's l2: 0.374294	valid_1's l2: 0.37787
[300]	training's l2: 0.373964	valid_1's l2: 0.377774
[350]	training's l2: 0.373665	valid_1's l2: 0.377685
[400]	training's l2: 0.373392	valid_1's l2: 0.377648
[450]	training's l2: 0.373133	valid_1's l2: 0.377606
[500]	training's l2: 0.372897	valid_1's l2: 0.377587
[550]	training's l2: 0.372663	valid_1's l2: 0.377565
Early stopping, best iteration is:
[540]	training's l2: 0.37271	valid_1's l2: 0.37755
mean_14_2017: 4127619.80
mean_7_2017: 3198956.93
mean_16_2017: 765655.46
mean_3_2017: 136652.06
Step 3




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.384958	valid_1's l2: 0.387765
[100]	training's l2: 0.380261	valid_1's l2: 0.378805
[150]	training's l2: 0.37975	valid_1's l2: 0.378174
[200]	training's l2: 0.379356	valid_1's l2: 0.378003
[250]	training's l2: 0.379028	valid_1's l2: 0.37794
[300]	training's l2: 0.378706	valid_1's l2: 0.377895
[350]	training's l2: 0.378434	valid_1's l2: 0.377876
[400]	training's l2: 0.37818	valid_1's l2: 0.377868
[450]	training's l2: 0.377954	valid_1's l2: 0.377844
[500]	training's l2: 0.377728	valid_1's l2: 0.37786
Early stopping, best iteration is:
[452]	training's l2: 0.377945	valid_1's l2: 0.377838
mean_14_2017: 4306384.66
mean_7_2017: 1994618.61
mean_16_2017: 522730.10
mean_3_2017: 88194.58
Step 4




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.418273	valid_1's l2: 0.382595
[100]	training's l2: 0.412792	valid_1's l2: 0.378603
[150]	training's l2: 0.41221	valid_1's l2: 0.378677
Early stopping, best iteration is:
[111]	training's l2: 0.412647	valid_1's l2: 0.378594
mean_14_2017: 5149418.74
mean_7_2017: 2210092.77
mean_16_2017: 613802.88
mean_3_2017: 62018.05
Step 5




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.423518	valid_1's l2: 0.468129
Early stopping, best iteration is:
[43]	training's l2: 0.429117	valid_1's l2: 0.467558
mean_16_2017: 4234686.73
mean_14_2017: 3533538.23
mean_7_2017: 723908.22
mean_3_2017: 685620.86
Step 6




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.447672	valid_1's l2: 0.451184
[100]	training's l2: 0.440302	valid_1's l2: 0.450174
Early stopping, best iteration is:
[62]	training's l2: 0.443156	valid_1's l2: 0.449948
mean_16_2017: 7181691.75
mean_3_2017: 1947370.09
mean_14_2017: 747747.10
mean_7_2017: 47133.66
Step 7




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.422603	valid_1's l2: 0.481494
[100]	training's l2: 0.417462	valid_1's l2: 0.470208
[150]	training's l2: 0.416957	valid_1's l2: 0.469704
[200]	training's l2: 0.416588	valid_1's l2: 0.469681
[250]	training's l2: 0.416245	valid_1's l2: 0.469566
[300]	training's l2: 0.415927	valid_1's l2: 0.469585
Early stopping, best iteration is:
[264]	training's l2: 0.416148	valid_1's l2: 0.469522
mean_16_2017: 5107342.67
mean_7_2017: 1281470.66
mean_14_2017: 1099066.60
mean_3_2017: 376471.02
Step 8




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.424692	valid_1's l2: 0.550568
[100]	training's l2: 0.419836	valid_1's l2: 0.535266
[150]	training's l2: 0.419342	valid_1's l2: 0.534293
[200]	training's l2: 0.418936	valid_1's l2: 0.53408
[250]	training's l2: 0.418581	valid_1's l2: 0.533909
[300]	training's l2: 0.418252	valid_1's l2: 0.5338
[350]	training's l2: 0.417945	valid_1's l2: 0.533734
[400]	training's l2: 0.417659	valid_1's l2: 0.533679
[450]	training's l2: 0.417392	valid_1's l2: 0.533568
Early stopping, best iteration is:
[447]	training's l2: 0.417409	valid_1's l2: 0.533538
mean_14_2017: 3425437.19
mean_16_2017: 2287314.95
mean_7_2017: 1539500.60
mean_3_2017: 101127.70
Step 9




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.429267	valid_1's l2: 0.456086
[100]	training's l2: 0.424204	valid_1's l2: 0.447393
[150]	training's l2: 0.423633	valid_1's l2: 0.446748
[200]	training's l2: 0.42321	valid_1's l2: 0.446558
[250]	training's l2: 0.422843	valid_1's l2: 0.446456
[300]	training's l2: 0.422508	valid_1's l2: 0.446415
[350]	training's l2: 0.422196	valid_1's l2: 0.446366
[400]	training's l2: 0.421911	valid_1's l2: 0.446348
[450]	training's l2: 0.42162	valid_1's l2: 0.446299
Early stopping, best iteration is:
[419]	training's l2: 0.421799	valid_1's l2: 0.446284
mean_14_2017: 4158176.36
mean_16_2017: 1762261.04
mean_7_2017: 1668213.70
mean_3_2017: 45008.26
Step 10




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.42151	valid_1's l2: 0.555233
[100]	training's l2: 0.417027	valid_1's l2: 0.539552
[150]	training's l2: 0.416486	valid_1's l2: 0.538239
[200]	training's l2: 0.416039	valid_1's l2: 0.537804
[250]	training's l2: 0.41566	valid_1's l2: 0.537534
[300]	training's l2: 0.415305	valid_1's l2: 0.537264
[350]	training's l2: 0.41498	valid_1's l2: 0.537196
[400]	training's l2: 0.414694	valid_1's l2: 0.537102
[450]	training's l2: 0.414417	valid_1's l2: 0.537128
Early stopping, best iteration is:
[414]	training's l2: 0.414616	valid_1's l2: 0.536992
mean_14_2017: 3020910.41
mean_16_2017: 2282570.14
mean_7_2017: 1224623.57
mean_3_2017: 39787.12
Step 11




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.456733	valid_1's l2: 0.479223
[100]	training's l2: 0.45151	valid_1's l2: 0.468554
[150]	training's l2: 0.450879	valid_1's l2: 0.467857
[200]	training's l2: 0.450412	valid_1's l2: 0.467703
[250]	training's l2: 0.449977	valid_1's l2: 0.46767
Early stopping, best iteration is:
[235]	training's l2: 0.450103	valid_1's l2: 0.467645
mean_16_2017: 3805722.85
mean_14_2017: 2844866.74
mean_7_2017: 948933.32
mean_3_2017: 39209.20
Step 12




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.465984	valid_1's l2: 0.474866
[100]	training's l2: 0.460127	valid_1's l2: 0.477987
Early stopping, best iteration is:
[50]	training's l2: 0.465984	valid_1's l2: 0.474866
mean_16_2017: 6893409.95
mean_14_2017: 926158.08
mean_3_2017: 471974.34
mean_7_2017: 415067.56
Step 13




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.489772	valid_1's l2: 0.479589
[100]	training's l2: 0.482043	valid_1's l2: 0.478059
[150]	training's l2: 0.480801	valid_1's l2: 0.477105
[200]	training's l2: 0.479965	valid_1's l2: 0.476348
[250]	training's l2: 0.479292	valid_1's l2: 0.475862
[300]	training's l2: 0.478793	valid_1's l2: 0.475489
[350]	training's l2: 0.478326	valid_1's l2: 0.475254
[400]	training's l2: 0.477938	valid_1's l2: 0.475127
[450]	training's l2: 0.477568	valid_1's l2: 0.474979
[500]	training's l2: 0.477227	valid_1's l2: 0.47487
[550]	training's l2: 0.476912	valid_1's l2: 0.474791
[600]	training's l2: 0.476621	valid_1's l2: 0.474791
Early stopping, best iteration is:
[558]	training's l2: 0.476869	valid_1's l2: 0.47477
mean_16_2017: 7382575.47
mean_3_2017: 1196843.01
mean_14_2017: 835283.41
mean_7_2017: 45159.32
Step 14




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.45484	valid_1's l2: 0.508133
[100]	training's l2: 0.449581	valid_1's l2: 0.498596
[150]	training's l2: 0.448914	valid_1's l2: 0.49794
[200]	training's l2: 0.448358	valid_1's l2: 0.497613
[250]	training's l2: 0.447893	valid_1's l2: 0.497359
[300]	training's l2: 0.447521	valid_1's l2: 0.497173
[350]	training's l2: 0.447156	valid_1's l2: 0.497009
[400]	training's l2: 0.446837	valid_1's l2: 0.496957
[450]	training's l2: 0.446517	valid_1's l2: 0.496833
[500]	training's l2: 0.446215	valid_1's l2: 0.496815
[550]	training's l2: 0.445939	valid_1's l2: 0.496747
[600]	training's l2: 0.445688	valid_1's l2: 0.496668
Early stopping, best iteration is:
[581]	training's l2: 0.445786	valid_1's l2: 0.496622
mean_16_2017: 6114007.08
mean_14_2017: 892541.73
mean_3_2017: 306482.52
mean_7_2017: 266556.08
Step 15




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.453672	valid_1's l2: 0.582683
[100]	training's l2: 0.448899	valid_1's l2: 0.569345
[150]	training's l2: 0.448322	valid_1's l2: 0.568518
[200]	training's l2: 0.447871	valid_1's l2: 0.568275
[250]	training's l2: 0.447454	valid_1's l2: 0.56802
[300]	training's l2: 0.447108	valid_1's l2: 0.567884
[350]	training's l2: 0.446763	valid_1's l2: 0.567597
[400]	training's l2: 0.446472	valid_1's l2: 0.567535
[450]	training's l2: 0.446183	valid_1's l2: 0.567485
[500]	training's l2: 0.445913	valid_1's l2: 0.567417
[550]	training's l2: 0.445656	valid_1's l2: 0.567248
[600]	training's l2: 0.445403	valid_1's l2: 0.567087
[650]	training's l2: 0.445173	valid_1's l2: 0.567004
[700]	training's l2: 0.44494	valid_1's l2: 0.567075
Early stopping, best iteration is:
[663]	training's l2: 0.445115	valid_1's l2: 0.566973
mean_16_2017: 5354421.58
mean_7_2017: 857986.75
mean_14_2017: 757323.21
mean_3_2017: 78267.09
Step 16




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.457156	valid_1's l2: 0.467861
[100]	training's l2: 0.452134	valid_1's l2: 0.462394
[150]	training's l2: 0.451494	valid_1's l2: 0.462099
[200]	training's l2: 0.450996	valid_1's l2: 0.461889
[250]	training's l2: 0.450589	valid_1's l2: 0.461797
[300]	training's l2: 0.450213	valid_1's l2: 0.461739
[350]	training's l2: 0.449868	valid_1's l2: 0.461642
[400]	training's l2: 0.44955	valid_1's l2: 0.461615
[450]	training's l2: 0.449255	valid_1's l2: 0.461626
Early stopping, best iteration is:
[435]	training's l2: 0.449346	valid_1's l2: 0.461579
mean_16_2017: 5397576.08
mean_14_2017: 1240679.91
mean_7_2017: 639201.74
mean_3_2017: 43652.32
Validation mse: 0.46295235234770776


In [13]:
print("Making submission...")
# test_pred holds one prediction vector per forecast day; transpose to
# (n_rows, n_days) so rows line up with df_train's (store_nbr, item_nbr) index.
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    # One column per trained model, starting at the first forecast day.
    columns=pd.date_range("2017-08-16", periods=y_test.shape[1])
).stack().to_frame("unit_sales")
# Name the stacked date level so the index matches df_test's join keys
# (assigned rather than mutated in place).
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows without a prediction (store/item pairs absent from df_train)
# get 0, which stays 0 after expm1.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and bound the result to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# ``index`` is documented as a bool; pass False explicitly instead of None.
submission.to_csv('modified_1.csv', float_format='%.4f', index=False)

Making submission...
