In [14]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib as plp
import lightgbm as lgb

In [15]:
def _log1p_sales(raw):
    """Converter for ``unit_sales``: log1p of positive values, 0 for everything else."""
    value = float(raw)
    return np.log1p(value) if value > 0 else 0


# Load columns 1-4 of the training file; `unit_sales` is moved to log1p space
# while parsing, and `date` is parsed into timestamps.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4],
    converters={'unit_sales': _log1p_sales},
    parse_dates=["date"],
)

In [16]:
# Load the first five columns of the test file and key the rows by
# (store_nbr, item_nbr, date) so predictions can be joined on that index later.
_test_raw = pd.read_csv(
    "../Data/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = _test_raw.set_index(['store_nbr', 'item_nbr', 'date'])
del _test_raw

In [17]:
# Item metadata table, indexed by item number.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [18]:
# Pivot the long sales table to wide form: one row per (store_nbr, item_nbr),
# one column per date, with missing (store, item, date) combinations filled with 0.
# NOTE: this rebinds `df_train`, so the cell is not safe to run twice in a row.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level="date")
    .fillna(0)
)
# Drop the outer "unit_sales" column level so columns are plain dates.
df_train.columns = df_train.columns.get_level_values("date")

In [19]:
# Align the item table row-for-row with the (store_nbr, item_nbr) rows of df_train.
_item_order = df_train.index.get_level_values("item_nbr")
items = items.reindex(_item_order)

In [20]:
def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from ``df``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    daily columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are dates.
    dt : datetime.date
        Reference date.
    minus : int
        How many days before ``dt`` the window begins.
    periods : int
        Number of daily columns to select.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``df``.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods)
    return df[window]

In [21]:
def prepare_dataset(t2017, is_train=True):
    """Build trailing-mean features (and optionally targets) anchored at ``t2017``.

    Reads the module-level ``df_train`` frame. Each feature is the row-wise
    mean over the N daily columns immediately preceding ``t2017``, for
    N in (3, 7, 14, 16).

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day target window; feature windows end the day before.
    is_train : bool, default True
        When true, the targets are returned alongside the features.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of ``df_train`` (fresh positional index), with columns
        ``mean_3_2017``, ``mean_7_2017``, ``mean_14_2017``, ``mean_16_2017``.
    y : numpy.ndarray
        Only when ``is_train`` is true: values of the 16 daily columns of
        ``df_train`` starting at ``t2017``.
    """
    # Feature name -> trailing window length in days.
    window_days = {
        "mean_3_2017": 3,
        "mean_7_2017": 7,
        "mean_14_2017": 14,
        "mean_16_2017": 16,
    }
    X = pd.DataFrame({
        name: get_timespan(df_train, t2017, days, days).mean(axis=1).values
        for name, days in window_days.items()
    })
    if not is_train:
        return X
    target_days = pd.date_range(t2017, periods=16)
    y = df_train[target_days].values
    return X, y

In [22]:
print("Preparing dataset...")
# Nine training windows, one per week, starting from this anchor date.
# NOTE(review): 2017-05-16 is a Tuesday, while the validation anchor (2017-07-23)
# is a Sunday and the test anchor (2017-08-16) is a Wednesday, so horizon step k
# falls on different weekdays in train / validation / test - confirm this is intended.
# NOTE(review): the last window is anchored at 2017-07-11, so its 16-day targets run
# through 2017-07-26 and overlap the validation targets that start on 2017-07-23.
t2017 = date(2017, 5, 16)
_weekly = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(9)
]
# Stack the weekly feature frames / target arrays row-wise.
X_train = pd.concat([features for features, _ in _weekly], axis=0)
y_train = np.concatenate([targets for _, targets in _weekly], axis=0)
del _weekly

Preparing dataset...


In [23]:
# Validation set: features from the days before the anchor, targets for the 16 days from it.
_val_anchor = date(2017, 7, 23)
X_val, y_val = prepare_dataset(_val_anchor)

In [24]:
# Test features only: no targets are requested for the forecast window.
_test_anchor = date(2017, 8, 16)
X_test = prepare_dataset(_test_anchor, is_train=False)

In [25]:
print("Training and predicting models...")
# LightGBM settings shared by every per-horizon model.
params = dict([
    ('num_leaves', 80),
    ('objective', 'regression'),
    ('min_data_in_leaf', 200),
    ('learning_rate', 0.02),
    ('feature_fraction', 0.8),
    ('bagging_fraction', 0.7),
    ('bagging_freq', 1),
    ('metric', 'l2'),
    ('num_threads', 16),
])

MAX_ROUNDS = 5000   # upper bound on boosting rounds; early stopping usually ends sooner
val_pred, test_pred = [], []
cate_vars = []      # no categorical features are used

# One independent model per forecast horizon (target column 0..15).
for step in range(16):
    banner = "=" * 50
    print(banner)
    print("Step %d" % (step + 1))
    print(banner)

    dtrain = lgb.Dataset(
        X_train,
        label=y_train[:, step],
        categorical_feature=cate_vars,
    )
    dval = lgb.Dataset(
        X_val,
        label=y_val[:, step],
        reference=dtrain,
        categorical_feature=cate_vars,
    )

    # Stop once the validation metric has not improved for 50 rounds;
    # log the metrics every 50 rounds.
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        early_stopping_rounds=50,
        verbose_eval=50,
    )

    # Report features ordered by total gain, largest first.
    ranked_gain = sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\n".join("%s: %.2f" % pair for pair in ranked_gain))

    # Predict with the best iteration, falling back to MAX_ROUNDS if none was recorded.
    n_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=n_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=n_rounds))

# val_pred is a list of per-step vectors; transpose so it lines up with y_val.
val_matrix = np.array(val_pred).transpose()
print("Validation mse:", mean_squared_error(y_val, val_matrix))



Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.45991	valid_1's l2: 0.577641
[100]	training's l2: 0.376471	valid_1's l2: 0.459938
[150]	training's l2: 0.364744	valid_1's l2: 0.43592
[200]	training's l2: 0.362864	valid_1's l2: 0.429516
[250]	training's l2: 0.36235	valid_1's l2: 0.42762
[300]	training's l2: 0.362052	valid_1's l2: 0.426967
[350]	training's l2: 0.361826	valid_1's l2: 0.426736
[400]	training's l2: 0.361616	valid_1's l2: 0.426613
[450]	training's l2: 0.361423	valid_1's l2: 0.426609
[500]	training's l2: 0.361236	valid_1's l2: 0.426563
[550]	training's l2: 0.361051	valid_1's l2: 0.426514
Early stopping, best iteration is:
[544]	training's l2: 0.361074	valid_1's l2: 0.426506
mean_7_2017: 7853531.27
mean_14_2017: 7769878.44
mean_16_2017: 1755879.00
mean_3_2017: 1138635.64
Step 2




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.473812	valid_1's l2: 0.481605
[100]	training's l2: 0.388944	valid_1's l2: 0.393762
[150]	training's l2: 0.377053	valid_1's l2: 0.380619
[200]	training's l2: 0.375119	valid_1's l2: 0.378371
[250]	training's l2: 0.374575	valid_1's l2: 0.377851
[300]	training's l2: 0.374244	valid_1's l2: 0.377687
[350]	training's l2: 0.373999	valid_1's l2: 0.377623
[400]	training's l2: 0.373772	valid_1's l2: 0.377551
[450]	training's l2: 0.373567	valid_1's l2: 0.377535
[500]	training's l2: 0.373367	valid_1's l2: 0.377518
[550]	training's l2: 0.373182	valid_1's l2: 0.377526
Early stopping, best iteration is:
[518]	training's l2: 0.373298	valid_1's l2: 0.37751
mean_14_2017: 11673125.24
mean_7_2017: 5071917.07
mean_16_2017: 1957731.69
mean_3_2017: 191834.47
Step 3




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.462622	valid_1's l2: 0.482599
[100]	training's l2: 0.39142	valid_1's l2: 0.397672
[150]	training's l2: 0.381451	valid_1's l2: 0.382537
[200]	training's l2: 0.379809	valid_1's l2: 0.379085
[250]	training's l2: 0.379331	valid_1's l2: 0.378069
[300]	training's l2: 0.379034	valid_1's l2: 0.377709
[350]	training's l2: 0.378796	valid_1's l2: 0.377564
[400]	training's l2: 0.378578	valid_1's l2: 0.37749
[450]	training's l2: 0.378378	valid_1's l2: 0.377443
[500]	training's l2: 0.37819	valid_1's l2: 0.377438
[550]	training's l2: 0.378009	valid_1's l2: 0.377423
[600]	training's l2: 0.377827	valid_1's l2: 0.377406
[650]	training's l2: 0.377642	valid_1's l2: 0.377423
Early stopping, best iteration is:
[605]	training's l2: 0.377807	valid_1's l2: 0.377403
mean_14_2017: 10835362.60
mean_7_2017: 3041544.80
mean_16_2017: 1862618.77
mean_3_2017: 158002.93
Step 4




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.508974	valid_1's l2: 0.46811
[100]	training's l2: 0.425847	valid_1's l2: 0.389473
[150]	training's l2: 0.414194	valid_1's l2: 0.379587
[200]	training's l2: 0.412274	valid_1's l2: 0.378598
[250]	training's l2: 0.411716	valid_1's l2: 0.378617
Early stopping, best iteration is:
[216]	training's l2: 0.412041	valid_1's l2: 0.378571
mean_14_2017: 12615241.44
mean_7_2017: 3497484.27
mean_16_2017: 2248396.26
mean_3_2017: 130618.17
Step 5




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.528933	valid_1's l2: 0.515605
[100]	training's l2: 0.432522	valid_1's l2: 0.466733
[150]	training's l2: 0.419156	valid_1's l2: 0.468946
Early stopping, best iteration is:
[111]	training's l2: 0.427098	valid_1's l2: 0.466389
mean_16_2017: 10953109.11
mean_14_2017: 8162612.71
mean_3_2017: 1072435.43
mean_7_2017: 961183.63
Step 6




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.561106	valid_1's l2: 0.527797
[100]	training's l2: 0.457057	valid_1's l2: 0.454558
[150]	training's l2: 0.442026	valid_1's l2: 0.448312
[200]	training's l2: 0.439342	valid_1's l2: 0.44857
Early stopping, best iteration is:
[160]	training's l2: 0.441162	valid_1's l2: 0.448206
mean_16_2017: 16365474.76
mean_3_2017: 3739305.54
mean_14_2017: 2634265.24
mean_7_2017: 128582.07
Step 7




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.511244	valid_1's l2: 0.596117
[100]	training's l2: 0.430123	valid_1's l2: 0.494688
[150]	training's l2: 0.418874	valid_1's l2: 0.475733
[200]	training's l2: 0.417049	valid_1's l2: 0.471112
[250]	training's l2: 0.416542	valid_1's l2: 0.469751
[300]	training's l2: 0.416248	valid_1's l2: 0.469336
[350]	training's l2: 0.41601	valid_1's l2: 0.469168
[400]	training's l2: 0.415789	valid_1's l2: 0.469129
[450]	training's l2: 0.415588	valid_1's l2: 0.469089
[500]	training's l2: 0.415379	valid_1's l2: 0.46909
[550]	training's l2: 0.415185	valid_1's l2: 0.469076
[600]	training's l2: 0.414995	valid_1's l2: 0.469078
Early stopping, best iteration is:
[567]	training's l2: 0.415121	valid_1's l2: 0.46905
mean_16_2017: 13179435.69
mean_14_2017: 3345064.91
mean_7_2017: 1226764.68
mean_3_2017: 377222.64
Step 8




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.507179	valid_1's l2: 0.679314
[100]	training's l2: 0.431653	valid_1's l2: 0.567121
[150]	training's l2: 0.421154	valid_1's l2: 0.54324
[200]	training's l2: 0.419434	valid_1's l2: 0.536633
[250]	training's l2: 0.418944	valid_1's l2: 0.534521
[300]	training's l2: 0.418647	valid_1's l2: 0.533714
[350]	training's l2: 0.418397	valid_1's l2: 0.533389
[400]	training's l2: 0.41817	valid_1's l2: 0.533216
[450]	training's l2: 0.417955	valid_1's l2: 0.533082
[500]	training's l2: 0.417749	valid_1's l2: 0.533037
[550]	training's l2: 0.41755	valid_1's l2: 0.53301
[600]	training's l2: 0.417348	valid_1's l2: 0.532961
[650]	training's l2: 0.417152	valid_1's l2: 0.532916
Early stopping, best iteration is:
[636]	training's l2: 0.41721	valid_1's l2: 0.532881
mean_14_2017: 9052491.13
mean_16_2017: 5862662.70
mean_7_2017: 1858650.17
mean_3_2017: 143329.92
Step 9




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.514912	valid_1's l2: 0.556123
[100]	training's l2: 0.436428	valid_1's l2: 0.466052
[150]	training's l2: 0.425472	valid_1's l2: 0.45077
[200]	training's l2: 0.423651	valid_1's l2: 0.447508
[250]	training's l2: 0.423127	valid_1's l2: 0.446602
[300]	training's l2: 0.4228	valid_1's l2: 0.446282
[350]	training's l2: 0.422541	valid_1's l2: 0.446167
[400]	training's l2: 0.422295	valid_1's l2: 0.446095
[450]	training's l2: 0.422075	valid_1's l2: 0.446067
[500]	training's l2: 0.421865	valid_1's l2: 0.446048
[550]	training's l2: 0.421662	valid_1's l2: 0.446047
[600]	training's l2: 0.421466	valid_1's l2: 0.446088
Early stopping, best iteration is:
[551]	training's l2: 0.421658	valid_1's l2: 0.446045
mean_14_2017: 11123039.71
mean_16_2017: 4243678.16
mean_7_2017: 2072276.99
mean_3_2017: 115100.41
Step 10




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.495088	valid_1's l2: 0.673137
[100]	training's l2: 0.42762	valid_1's l2: 0.569866
[150]	training's l2: 0.418128	valid_1's l2: 0.547016
[200]	training's l2: 0.41651	valid_1's l2: 0.540501
[250]	training's l2: 0.416013	valid_1's l2: 0.538313
[300]	training's l2: 0.415696	valid_1's l2: 0.53743
[350]	training's l2: 0.415437	valid_1's l2: 0.537069
[400]	training's l2: 0.415203	valid_1's l2: 0.5368
[450]	training's l2: 0.414987	valid_1's l2: 0.536674
[500]	training's l2: 0.414772	valid_1's l2: 0.53662
[550]	training's l2: 0.414566	valid_1's l2: 0.536571
[600]	training's l2: 0.414368	valid_1's l2: 0.536554
[650]	training's l2: 0.414173	valid_1's l2: 0.5365
Early stopping, best iteration is:
[634]	training's l2: 0.414237	valid_1's l2: 0.53647
mean_14_2017: 8016836.62
mean_16_2017: 6025628.15
mean_7_2017: 972654.89
mean_3_2017: 103478.75
Step 11




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.542547	valid_1's l2: 0.587291
[100]	training's l2: 0.463839	valid_1's l2: 0.490933
[150]	training's l2: 0.45275	valid_1's l2: 0.473377
[200]	training's l2: 0.450839	valid_1's l2: 0.469259
[250]	training's l2: 0.450258	valid_1's l2: 0.468063
[300]	training's l2: 0.449905	valid_1's l2: 0.467667
[350]	training's l2: 0.44962	valid_1's l2: 0.46748
[400]	training's l2: 0.449358	valid_1's l2: 0.467394
[450]	training's l2: 0.449108	valid_1's l2: 0.46737
[500]	training's l2: 0.448878	valid_1's l2: 0.467371
Early stopping, best iteration is:
[492]	training's l2: 0.448914	valid_1's l2: 0.467364
mean_16_2017: 9886940.67
mean_14_2017: 7053177.83
mean_7_2017: 559494.18
mean_3_2017: 111604.68
Step 12




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.565252	valid_1's l2: 0.533663
[100]	training's l2: 0.474453	valid_1's l2: 0.476373
[150]	training's l2: 0.461692	valid_1's l2: 0.474454
Early stopping, best iteration is:
[128]	training's l2: 0.464611	valid_1's l2: 0.47404
mean_16_2017: 16208218.43
mean_14_2017: 3185080.79
mean_3_2017: 487866.76
mean_7_2017: 171993.92
Step 13




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.59607	valid_1's l2: 0.552281
[100]	training's l2: 0.497824	valid_1's l2: 0.482309
[150]	training's l2: 0.483304	valid_1's l2: 0.475947
[200]	training's l2: 0.480487	valid_1's l2: 0.475728
[250]	training's l2: 0.479528	valid_1's l2: 0.475657
[300]	training's l2: 0.479014	valid_1's l2: 0.475504
[350]	training's l2: 0.478622	valid_1's l2: 0.475402
[400]	training's l2: 0.478302	valid_1's l2: 0.475304
[450]	training's l2: 0.477991	valid_1's l2: 0.475125
[500]	training's l2: 0.477708	valid_1's l2: 0.475007
[550]	training's l2: 0.477456	valid_1's l2: 0.474942
[600]	training's l2: 0.477224	valid_1's l2: 0.474912
[650]	training's l2: 0.477	valid_1's l2: 0.474844
[700]	training's l2: 0.476785	valid_1's l2: 0.474824
Early stopping, best iteration is:
[696]	training's l2: 0.476802	valid_1's l2: 0.474812
mean_16_2017: 16765772.59
mean_14_2017: 2871061.30
mean_3_2017: 2005837.98
mean_7_2017: 110979.07
Step 14




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.539635	valid_1's l2: 0.612142
[100]	training's l2: 0.461833	valid_1's l2: 0.51955
[150]	training's l2: 0.450751	valid_1's l2: 0.502711
[200]	training's l2: 0.448773	valid_1's l2: 0.498588
[250]	training's l2: 0.448164	valid_1's l2: 0.497312
[300]	training's l2: 0.44781	valid_1's l2: 0.496861
[350]	training's l2: 0.447512	valid_1's l2: 0.496634
[400]	training's l2: 0.447245	valid_1's l2: 0.496537
[450]	training's l2: 0.446985	valid_1's l2: 0.496402
[500]	training's l2: 0.446744	valid_1's l2: 0.496341
[550]	training's l2: 0.446513	valid_1's l2: 0.49627
[600]	training's l2: 0.446299	valid_1's l2: 0.496227
[650]	training's l2: 0.446089	valid_1's l2: 0.496173
[700]	training's l2: 0.445889	valid_1's l2: 0.496149
[750]	training's l2: 0.445698	valid_1's l2: 0.496126
Early stopping, best iteration is:
[740]	training's l2: 0.445738	valid_1's l2: 0.496105
mean_16_2017: 14118852.01
mean_14_2017: 2759440.48
mean_7_2



Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.532407	valid_1's l2: 0.700643
[100]	training's l2: 0.460248	valid_1's l2: 0.597684
[150]	training's l2: 0.450073	valid_1's l2: 0.576263
[200]	training's l2: 0.448307	valid_1's l2: 0.570487
[250]	training's l2: 0.447751	valid_1's l2: 0.568552
[300]	training's l2: 0.447413	valid_1's l2: 0.567805
[350]	training's l2: 0.447138	valid_1's l2: 0.567502
[400]	training's l2: 0.446886	valid_1's l2: 0.567286
[450]	training's l2: 0.446642	valid_1's l2: 0.567162
[500]	training's l2: 0.446414	valid_1's l2: 0.56705
[550]	training's l2: 0.44619	valid_1's l2: 0.566965
[600]	training's l2: 0.445967	valid_1's l2: 0.566895
[650]	training's l2: 0.445753	valid_1's l2: 0.566813
[700]	training's l2: 0.445553	valid_1's l2: 0.566739
[750]	training's l2: 0.445356	valid_1's l2: 0.566759
Early stopping, best iteration is:
[707]	training's l2: 0.445526	valid_1's l2: 0.566726
mean_16_2017: 13096858.84
mean_14_2017: 2554206.61
mean_7_



Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.539131	valid_1's l2: 0.552223
[100]	training's l2: 0.464018	valid_1's l2: 0.475322
[150]	training's l2: 0.453404	valid_1's l2: 0.464057
[200]	training's l2: 0.451575	valid_1's l2: 0.462185
[250]	training's l2: 0.451001	valid_1's l2: 0.461728
[300]	training's l2: 0.450644	valid_1's l2: 0.461588
[350]	training's l2: 0.450362	valid_1's l2: 0.461529
[400]	training's l2: 0.450099	valid_1's l2: 0.461486
[450]	training's l2: 0.449853	valid_1's l2: 0.461442
[500]	training's l2: 0.449614	valid_1's l2: 0.46142
[550]	training's l2: 0.449384	valid_1's l2: 0.461414
[600]	training's l2: 0.449165	valid_1's l2: 0.461376
[650]	training's l2: 0.44896	valid_1's l2: 0.46133
[700]	training's l2: 0.448749	valid_1's l2: 0.461322
[750]	training's l2: 0.448549	valid_1's l2: 0.461329
Early stopping, best iteration is:
[701]	training's l2: 0.448745	valid_1's l2: 0.46132
mean_16_2017: 13547175.80
mean_14_2017: 2891551.01
mean_7_20

In [26]:
print("Making submission...")
# One row per df_train row, one column per forecast day.
y_test = np.array(test_pred).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
df_preds = (
    pd.DataFrame(y_test, index=df_train.index, columns=forecast_days)
    .stack()
    .to_frame("unit_sales")
)
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Attach predictions to the test ids; rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and clip to [0, 1000].
unit_sales = np.expm1(submission["unit_sales"])
submission["unit_sales"] = np.clip(unit_sales, 0, 1000)
submission.to_csv('modified_1.csv', float_format='%.4f', index=None)

Making submission...
