In [5]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib as plp
import lightgbm as lgb

In [6]:
def _log1p_positive(raw):
    """Converter for the raw ``unit_sales`` field: log1p of positive values, else 0."""
    value = float(raw)
    return np.log1p(value) if value > 0 else 0


# Load the training rows. Only columns 1-4 of the file are read
# (presumably skipping an id column at position 0 -- TODO confirm);
# unit_sales is log1p-transformed on the fly, with non-positive values mapped to 0.
df_train = pd.read_csv(
    '../Data/train_set.csv',
    usecols=[1, 2, 3, 4],
    parse_dates=["date"],
    converters={'unit_sales': _log1p_positive},
)

In [7]:
# Load the first five columns of the test file and key every row by
# (store_nbr, item_nbr, date) so predictions can later be joined on that index.
df_test = (
    pd.read_csv(
        "../Data/test.csv",
        usecols=list(range(5)),
        parse_dates=["date"],
        dtype={'onpromotion': bool},
    )
    .set_index(['store_nbr', 'item_nbr', 'date'])
)

In [8]:
# Item metadata, indexed by item number so it can be aligned with the sales table.
items = pd.read_csv("../Data/items.csv")
items = items.set_index("item_nbr")

In [9]:
# Pivot the long sales table to wide form: one row per (store_nbr, item_nbr),
# one column per date. "date" is the last index level, so unstack(level=-1)
# moves it into the columns; combinations with no record are filled with 0.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the (unit_sales, date) column MultiIndex.
df_train.columns = df_train.columns.get_level_values(1)

In [10]:
# Align the item metadata row-for-row with df_train: level 1 of the
# df_train index holds the item number of each (store, item) row.
item_nbr_per_row = df_train.index.get_level_values(1)
items = items.reindex(item_nbr_per_row)

In [11]:
def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from ``df``.

    The window starts ``minus`` days before ``dt`` and spans ``periods`` days.

    Args:
        df: DataFrame whose column labels are dates.
        dt: Reference date.
        minus: Number of days before ``dt`` at which the window starts.
        periods: Number of consecutive days to select.

    Returns:
        The sub-frame of ``df`` restricted to the selected date columns.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

In [12]:
def prepare_dataset(t2017, is_train=True):
    """Build rolling-mean features (and optionally targets) anchored at ``t2017``.

    Features are the per-row means of ``df_train`` over the 3, 7, 14 and 16
    days immediately preceding ``t2017``.

    Args:
        t2017: First day of the 16-day target window.
        is_train: When true, also return the targets.

    Returns:
        ``X`` alone when ``is_train`` is false; otherwise ``(X, y)`` where
        ``y`` holds the ``df_train`` values for the 16 days starting at
        ``t2017`` (one column per day).
    """
    features = {}
    for window in (3, 7, 14, 16):
        span = get_timespan(df_train, t2017, window, window)
        features["mean_%d_2017" % window] = span.mean(axis=1).values
    X = pd.DataFrame(features)

    if not is_train:
        return X

    target_days = pd.date_range(t2017, periods=16)
    y = df_train[target_days].values
    return X, y

In [13]:
print("Preparing dataset...")
# Nine training windows, each shifted one week later than the previous one,
# starting from 2017-05-16.
t2017 = date(2017, 5, 16)
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week))
    for week in range(9)
]
# Stack the weekly feature frames / target arrays row-wise.
X_train = pd.concat([features for features, _ in weekly_sets], axis=0)
y_train = np.concatenate([targets for _, targets in weekly_sets], axis=0)
del weekly_sets

Preparing dataset...


In [14]:
# Validation window: features from the days before 2017-07-23, targets for the
# 16 days starting on it.
# NOTE(review): 2017-07-23 is a Sunday, while the training windows start on
# 2017-05-16 + 7k days (Tuesdays) -- confirm that evaluating the per-step
# models on a different weekday alignment is intended.
val_start = date(2017, 7, 23)
X_val, y_val = prepare_dataset(val_start, is_train=True)

In [15]:
# Test features only (no targets exist): the forecast horizon starts on
# 2017-08-16, matching the date range used when building the submission.
# NOTE(review): 2017-08-16 is a Wednesday, while the training windows start on
# Tuesdays -- confirm the weekday offset between training and test is intended.
test_start = date(2017, 8, 16)
X_test = prepare_dataset(test_start, is_train=False)

In [17]:
print("Training and predicting models...")
# LightGBM hyper-parameters shared by every per-day model.
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

# Sample weights are identical for every forecast step, so build them once
# instead of re-concatenating inside the loop.
# Each row is weighted 1 + 0.25 * perishable (i.e. 1.25 for perishable items,
# assuming "perishable" is a 0/1 flag -- TODO confirm).
item_weights = items["perishable"] * 0.25 + 1
# X_train is a stack of identically-ordered weekly blocks; derive how many
# from its length rather than hard-coding the count used to build it.
n_train_blocks, leftover_rows = divmod(len(X_train), len(item_weights))
if leftover_rows:
    raise ValueError(
        "X_train has %d rows, which is not a multiple of the %d item rows"
        % (len(X_train), len(item_weights))
    )
train_weights = pd.concat([item_weights] * n_train_blocks)

# One independent model per day of the 16-day horizon; column i of
# y_train / y_val is the target for day i.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weights
    )

    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        categorical_feature=cate_vars,
        weight=item_weights
    )

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)] instead.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )

    # Report features ordered by total gain, most important first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Fall back to the full round budget when early stopping never triggered.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

# val_pred is a list of 16 per-day vectors; transpose to (rows, days) to match y_val.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.464951	valid_1's l2: 0.579554
[100]	training's l2: 0.380001	valid_1's l2: 0.460505
[150]	training's l2: 0.368057	valid_1's l2: 0.436342
[200]	training's l2: 0.366136	valid_1's l2: 0.429946
[250]	training's l2: 0.365604	valid_1's l2: 0.42805
[300]	training's l2: 0.365295	valid_1's l2: 0.427396
[350]	training's l2: 0.365064	valid_1's l2: 0.427158
[400]	training's l2: 0.364848	valid_1's l2: 0.42704
[450]	training's l2: 0.364648	valid_1's l2: 0.427032
[500]	training's l2: 0.364455	valid_1's l2: 0.426979
[550]	training's l2: 0.364261	valid_1's l2: 0.426946
[600]	training's l2: 0.364072	valid_1's l2: 0.426988
Early stopping, best iteration is:
[551]	training's l2: 0.364257	valid_1's l2: 0.426942
mean_14_2017: 8769569.52
mean_7_2017: 8301834.56
mean_16_2017: 1758688.03
mean_3_2017: 1083723.72
Step 2




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.478981	valid_1's l2: 0.483198
[100]	training's l2: 0.391964	valid_1's l2: 0.394729
[150]	training's l2: 0.379774	valid_1's l2: 0.381855
[200]	training's l2: 0.377792	valid_1's l2: 0.379788
[250]	training's l2: 0.37723	valid_1's l2: 0.379337
[300]	training's l2: 0.376895	valid_1's l2: 0.379195
[350]	training's l2: 0.37664	valid_1's l2: 0.379143
[400]	training's l2: 0.376406	valid_1's l2: 0.379091
[450]	training's l2: 0.376189	valid_1's l2: 0.379078
[500]	training's l2: 0.375986	valid_1's l2: 0.379069
[550]	training's l2: 0.375791	valid_1's l2: 0.379066
Early stopping, best iteration is:
[524]	training's l2: 0.37589	valid_1's l2: 0.379055
mean_14_2017: 12773027.38
mean_7_2017: 5310976.57
mean_16_2017: 2173881.34
mean_3_2017: 200379.92
Step 3




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.465703	valid_1's l2: 0.487723
[100]	training's l2: 0.393105	valid_1's l2: 0.40121
[150]	training's l2: 0.382938	valid_1's l2: 0.38582
[200]	training's l2: 0.381266	valid_1's l2: 0.382328
[250]	training's l2: 0.380777	valid_1's l2: 0.381295
[300]	training's l2: 0.38047	valid_1's l2: 0.380925
[350]	training's l2: 0.380228	valid_1's l2: 0.380773
[400]	training's l2: 0.380005	valid_1's l2: 0.380698
[450]	training's l2: 0.379799	valid_1's l2: 0.380651
[500]	training's l2: 0.379608	valid_1's l2: 0.380632
[550]	training's l2: 0.379423	valid_1's l2: 0.380599
[600]	training's l2: 0.379238	valid_1's l2: 0.380586
[650]	training's l2: 0.379054	valid_1's l2: 0.380597
Early stopping, best iteration is:
[602]	training's l2: 0.37923	valid_1's l2: 0.380582
mean_14_2017: 11649181.52
mean_7_2017: 3350611.02
mean_16_2017: 1955409.53
mean_3_2017: 162224.98
Step 4




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.512816	valid_1's l2: 0.47343
[100]	training's l2: 0.427904	valid_1's l2: 0.392492
[150]	training's l2: 0.416008	valid_1's l2: 0.382224
[200]	training's l2: 0.414037	valid_1's l2: 0.381161
[250]	training's l2: 0.413455	valid_1's l2: 0.381167
Early stopping, best iteration is:
[216]	training's l2: 0.413797	valid_1's l2: 0.381118
mean_14_2017: 13698272.51
mean_7_2017: 4021390.86
mean_16_2017: 2086858.15
mean_3_2017: 137450.68
Step 5




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.530927	valid_1's l2: 0.518435
[100]	training's l2: 0.433195	valid_1's l2: 0.468258
[150]	training's l2: 0.419643	valid_1's l2: 0.470275
Early stopping, best iteration is:
[112]	training's l2: 0.427302	valid_1's l2: 0.467844
mean_16_2017: 11250356.31
mean_14_2017: 9304527.95
mean_3_2017: 1155191.75
mean_7_2017: 936043.16
Step 6




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.563707	valid_1's l2: 0.531267
[100]	training's l2: 0.458219	valid_1's l2: 0.456205
[150]	training's l2: 0.442993	valid_1's l2: 0.449587
[200]	training's l2: 0.440278	valid_1's l2: 0.449733
Early stopping, best iteration is:
[160]	training's l2: 0.442114	valid_1's l2: 0.449453
mean_16_2017: 17509311.11
mean_3_2017: 4154135.20
mean_14_2017: 2670963.08
mean_7_2017: 143780.23
Step 7




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.514785	valid_1's l2: 0.595608
[100]	training's l2: 0.432278	valid_1's l2: 0.493438
[150]	training's l2: 0.420839	valid_1's l2: 0.474511
[200]	training's l2: 0.418986	valid_1's l2: 0.469938
[250]	training's l2: 0.418465	valid_1's l2: 0.468615
[300]	training's l2: 0.418167	valid_1's l2: 0.468189
[350]	training's l2: 0.417924	valid_1's l2: 0.468027
[400]	training's l2: 0.4177	valid_1's l2: 0.467973
[450]	training's l2: 0.417492	valid_1's l2: 0.467921
[500]	training's l2: 0.41729	valid_1's l2: 0.467906
[550]	training's l2: 0.417091	valid_1's l2: 0.467886
[600]	training's l2: 0.416899	valid_1's l2: 0.467907
Early stopping, best iteration is:
[567]	training's l2: 0.417026	valid_1's l2: 0.467863
mean_16_2017: 14221102.35
mean_14_2017: 3519601.89
mean_7_2017: 1339508.50
mean_3_2017: 391676.33
Step 8




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.511959	valid_1's l2: 0.679019
[100]	training's l2: 0.434904	valid_1's l2: 0.565912
[150]	training's l2: 0.424197	valid_1's l2: 0.541992
[200]	training's l2: 0.422446	valid_1's l2: 0.535444
[250]	training's l2: 0.421939	valid_1's l2: 0.533356
[300]	training's l2: 0.421626	valid_1's l2: 0.532534
[350]	training's l2: 0.421372	valid_1's l2: 0.532201
[400]	training's l2: 0.421141	valid_1's l2: 0.532008
[450]	training's l2: 0.420927	valid_1's l2: 0.531893
[500]	training's l2: 0.420719	valid_1's l2: 0.531824
[550]	training's l2: 0.420515	valid_1's l2: 0.531795
[600]	training's l2: 0.420305	valid_1's l2: 0.531749
[650]	training's l2: 0.420105	valid_1's l2: 0.531717
Early stopping, best iteration is:
[636]	training's l2: 0.420164	valid_1's l2: 0.53168
mean_14_2017: 9789079.32
mean_16_2017: 6185340.78
mean_7_2017: 2099501.64
mean_3_2017: 150769.69
Step 9




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.519939	valid_1's l2: 0.555399
[100]	training's l2: 0.439304	valid_1's l2: 0.464933
[150]	training's l2: 0.428044	valid_1's l2: 0.450063
[200]	training's l2: 0.426172	valid_1's l2: 0.447028
[250]	training's l2: 0.425632	valid_1's l2: 0.446224
[300]	training's l2: 0.425299	valid_1's l2: 0.44595
[350]	training's l2: 0.425037	valid_1's l2: 0.445861
[400]	training's l2: 0.424786	valid_1's l2: 0.445786
[450]	training's l2: 0.424559	valid_1's l2: 0.445771
[500]	training's l2: 0.424347	valid_1's l2: 0.445748
[550]	training's l2: 0.424141	valid_1's l2: 0.445763
Early stopping, best iteration is:
[513]	training's l2: 0.424294	valid_1's l2: 0.445741
mean_14_2017: 12107218.81
mean_16_2017: 4483061.54
mean_7_2017: 2321017.35
mean_3_2017: 122636.89
Step 10




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.497946	valid_1's l2: 0.674721
[100]	training's l2: 0.429068	valid_1's l2: 0.570367
[150]	training's l2: 0.419386	valid_1's l2: 0.547475
[200]	training's l2: 0.417734	valid_1's l2: 0.540965
[250]	training's l2: 0.417225	valid_1's l2: 0.538758
[300]	training's l2: 0.416908	valid_1's l2: 0.537851
[350]	training's l2: 0.416652	valid_1's l2: 0.53745
[400]	training's l2: 0.416414	valid_1's l2: 0.537191
[450]	training's l2: 0.41619	valid_1's l2: 0.537043
[500]	training's l2: 0.415974	valid_1's l2: 0.536992
[550]	training's l2: 0.415767	valid_1's l2: 0.536935
[600]	training's l2: 0.415564	valid_1's l2: 0.53693
Early stopping, best iteration is:
[564]	training's l2: 0.415714	valid_1's l2: 0.536891
mean_14_2017: 9060676.90
mean_16_2017: 6050216.37
mean_7_2017: 1064236.13
mean_3_2017: 108180.34
Step 11




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.546371	valid_1's l2: 0.589764
[100]	training's l2: 0.465891	valid_1's l2: 0.491757
[150]	training's l2: 0.454557	valid_1's l2: 0.473985
[200]	training's l2: 0.452608	valid_1's l2: 0.469844
[250]	training's l2: 0.452007	valid_1's l2: 0.468629
[300]	training's l2: 0.451646	valid_1's l2: 0.468243
[350]	training's l2: 0.451353	valid_1's l2: 0.468075
[400]	training's l2: 0.451088	valid_1's l2: 0.467986
[450]	training's l2: 0.450836	valid_1's l2: 0.467951
[500]	training's l2: 0.450602	valid_1's l2: 0.467945
Early stopping, best iteration is:
[492]	training's l2: 0.450637	valid_1's l2: 0.467936
mean_16_2017: 9462939.16
mean_14_2017: 8744602.28
mean_7_2017: 683615.88
mean_3_2017: 121536.15
Step 12




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.567054	valid_1's l2: 0.535253
[100]	training's l2: 0.474947	valid_1's l2: 0.476747
[150]	training's l2: 0.462016	valid_1's l2: 0.474724
Early stopping, best iteration is:
[128]	training's l2: 0.46497	valid_1's l2: 0.474308
mean_16_2017: 17361822.42
mean_14_2017: 3390148.20
mean_3_2017: 524462.36
mean_7_2017: 205392.81
Step 13




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.598509	valid_1's l2: 0.555177
[100]	training's l2: 0.498834	valid_1's l2: 0.48316
[150]	training's l2: 0.484125	valid_1's l2: 0.47638
[200]	training's l2: 0.481281	valid_1's l2: 0.47602
[250]	training's l2: 0.480306	valid_1's l2: 0.475944
[300]	training's l2: 0.479792	valid_1's l2: 0.475775
[350]	training's l2: 0.479394	valid_1's l2: 0.475639
[400]	training's l2: 0.479071	valid_1's l2: 0.475542
[450]	training's l2: 0.478758	valid_1's l2: 0.475373
[500]	training's l2: 0.478474	valid_1's l2: 0.47527
[550]	training's l2: 0.478214	valid_1's l2: 0.475184
[600]	training's l2: 0.477973	valid_1's l2: 0.475156
[650]	training's l2: 0.477737	valid_1's l2: 0.475105
[700]	training's l2: 0.477518	valid_1's l2: 0.475061
Early stopping, best iteration is:
[690]	training's l2: 0.477559	valid_1's l2: 0.475042
mean_16_2017: 17897166.47
mean_14_2017: 3028807.72
mean_3_2017: 2262064.54
mean_7_2017: 118613.21
Step 14




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.543072	valid_1's l2: 0.611643
[100]	training's l2: 0.463885	valid_1's l2: 0.517841
[150]	training's l2: 0.452618	valid_1's l2: 0.500924
[200]	training's l2: 0.450613	valid_1's l2: 0.496807
[250]	training's l2: 0.44999	valid_1's l2: 0.495556
[300]	training's l2: 0.449629	valid_1's l2: 0.495117
[350]	training's l2: 0.449325	valid_1's l2: 0.494891
[400]	training's l2: 0.449055	valid_1's l2: 0.49477
[450]	training's l2: 0.44879	valid_1's l2: 0.494665
[500]	training's l2: 0.448542	valid_1's l2: 0.494612
[550]	training's l2: 0.448309	valid_1's l2: 0.49454
[600]	training's l2: 0.448088	valid_1's l2: 0.494503
[650]	training's l2: 0.447875	valid_1's l2: 0.494454
[700]	training's l2: 0.447676	valid_1's l2: 0.49442
[750]	training's l2: 0.447487	valid_1's l2: 0.494396
Early stopping, best iteration is:
[730]	training's l2: 0.44756	valid_1's l2: 0.494378
mean_16_2017: 15180815.65
mean_14_2017: 2931841.63
mean_7_2017



Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.537159	valid_1's l2: 0.700827
[100]	training's l2: 0.4635	valid_1's l2: 0.596536
[150]	training's l2: 0.45312	valid_1's l2: 0.574926
[200]	training's l2: 0.451327	valid_1's l2: 0.569131
[250]	training's l2: 0.450767	valid_1's l2: 0.567222
[300]	training's l2: 0.450424	valid_1's l2: 0.566455
[350]	training's l2: 0.450145	valid_1's l2: 0.566142
[400]	training's l2: 0.449883	valid_1's l2: 0.565936
[450]	training's l2: 0.449638	valid_1's l2: 0.565838
[500]	training's l2: 0.449398	valid_1's l2: 0.565756
[550]	training's l2: 0.449164	valid_1's l2: 0.565675
[600]	training's l2: 0.448937	valid_1's l2: 0.565613
[650]	training's l2: 0.448718	valid_1's l2: 0.565506
[700]	training's l2: 0.448507	valid_1's l2: 0.565434
[750]	training's l2: 0.448301	valid_1's l2: 0.565451
Early stopping, best iteration is:
[712]	training's l2: 0.448458	valid_1's l2: 0.565419
mean_16_2017: 14109061.89
mean_14_2017: 2736958.14
mean_7_2



Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.54421	valid_1's l2: 0.552233
[100]	training's l2: 0.466982	valid_1's l2: 0.474582
[150]	training's l2: 0.45608	valid_1's l2: 0.463594
[200]	training's l2: 0.454199	valid_1's l2: 0.4619
[250]	training's l2: 0.453606	valid_1's l2: 0.461543
[300]	training's l2: 0.453242	valid_1's l2: 0.461429
[350]	training's l2: 0.452952	valid_1's l2: 0.461393
[400]	training's l2: 0.452687	valid_1's l2: 0.461351
[450]	training's l2: 0.452441	valid_1's l2: 0.461326
[500]	training's l2: 0.452195	valid_1's l2: 0.461313
[550]	training's l2: 0.451963	valid_1's l2: 0.461294
[600]	training's l2: 0.451737	valid_1's l2: 0.461281
[650]	training's l2: 0.45153	valid_1's l2: 0.461257
Early stopping, best iteration is:
[649]	training's l2: 0.451535	valid_1's l2: 0.461253
mean_16_2017: 14740615.48
mean_14_2017: 3069550.53
mean_7_2017: 360494.40
mean_3_2017: 121108.33
Validation mse: 0.4620377217114561


In [18]:
print("Making submission...")
# test_pred is a list of 16 per-day prediction vectors; transpose so rows
# line up with df_train's (store_nbr, item_nbr) index and columns with days.
y_test = np.array(test_pred).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
wide_preds = pd.DataFrame(y_test, index=df_train.index, columns=forecast_days)

# Long format: one row per (store_nbr, item_nbr, date).
df_preds = wide_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Left-join onto the test ids; test rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied at load time and bound the result to [0, 1000].
submission["unit_sales"] = np.clip(
    np.expm1(submission["unit_sales"]), 0, 1000
)
submission.to_csv('modified_weight.csv', float_format='%.4f', index=None)

Making submission...
