# Data Preprocessing and Feature Engineering

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Loading-data-and-Libraries" data-toc-modified-id="Loading-data-and-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Loading data and Libraries</a></span></li><li><span><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Scale-numeric-variable" data-toc-modified-id="Scale-numeric-variable-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Scale numeric variable</a></span></li><li><span><a href="#Manipulate-data-in-a-time-series-manner" data-toc-modified-id="Manipulate-data-in-a-time-series-manner-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Manipulate data in a time series manner</a></span></li></ul></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature Engineering</a></span><ul class="toc-item"><li><span><a href="#Numeric-Feature" data-toc-modified-id="Numeric-Feature-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Numeric Feature</a></span></li><li><span><a href="#Categorical-Feature" data-toc-modified-id="Categorical-Feature-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Categorical Feature</a></span></li></ul></li><li><span><a href="#Training,-Validation,-Testing-split" data-toc-modified-id="Training,-Validation,-Testing-split-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training, Validation, Testing split</a></span><ul class="toc-item"><li><span><a href="#Training-set" data-toc-modified-id="Training-set-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Training set</a></span></li><li><span><a href="#Validation-and-Test-Set" data-toc-modified-id="Validation-and-Test-Set-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Validation and Test Set</a></span></li></ul></li></ul></div>

## Loading data and Libraries

In [1]:
import numpy as np
import pandas as pd
import feather
import seaborn as sns
import matplotlib.pyplot as plt
import time
from datetime import date, timedelta
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [2]:
# Read the raw training frame from its feather file.
train_raw = feather.read_dataframe("./data/train_raw.feather")

In [3]:
# Load the test set, indexed by (store_nbr, item_nbr, date).
# read_csv converters receive every cell as a *string*, so the promotion flag
# must be compared against the text "true". Comparing the string with the
# boolean True is always False and would silently turn every test-set
# promotion into 0.
df_test = pd.read_csv(
    "./data/test.csv",
    converters={"onpromotion": lambda p: int(str(p).strip().lower() == "true")},
    parse_dates=["date"]
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

In [4]:
# Item and store metadata, each keyed by its identifier column.
items = pd.read_csv("./data/items.csv", index_col="item_nbr")
stores = pd.read_csv("./data/stores.csv", index_col="store_nbr")

In [5]:
# Peek at the first rows of the raw training frame.
train_raw.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,88042205,2016-08-15,1,103665,1.0,0
1,88042206,2016-08-15,1,105574,1.0,0
2,88042207,2016-08-15,1,105575,19.0,0
3,88042208,2016-08-15,1,105577,1.0,0
4,88042209,2016-08-15,1,105693,1.0,0


In [6]:
# Peek at the first rows of the test frame (indexed by store_nbr, item_nbr, date).
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,0
1,99197,2017-08-16,125497041,0
1,103501,2017-08-16,125497042,0
1,103520,2017-08-16,125497043,0
1,103665,2017-08-16,125497044,0


## Data Preprocessing

### Scale numeric variable

There is only one numeric variable in the training set (`unit_sales`). We apply a log transform to this variable so that it is more normalized. <span style="color:red">We do not perform min-max scaling (scaling the variable to the range 0-1) here, because doing so before the training/validation split would risk leaking information from the validation set into the training set.</span>

In [7]:
# log1p-transform positive sales; non-positive and missing values become 0.
# Vectorised: clip + fillna + np.log1p yields the same values as a per-row
# lambda without a Python-level call for every row.
# NOTE: this overwrites the column in place, so re-running the cell applies
# the transform a second time.
train_raw['unit_sales'] = np.log1p(
    train_raw['unit_sales'].astype(float).clip(lower=0).fillna(0)
)

### Manipulate data in a time series manner

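Pivot the data so that each row is a (store, item) pair and each column is a date. Fill NaN with zero, meaning there is no record (no sale / no promotion) of that item on that day.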

In [8]:
# Wide promotion matrix for the training period: one row per (store_nbr, item_nbr),
# one column per date; dates without a record are filled with 0.
promo_2017_train = (
    train_raw
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the (variable, date) column labels.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

Do the same for the test set.

In [9]:
# Same pivot for the test period: dates become columns, missing entries become 0.
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the (variable, date) column labels.
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [10]:
# Preview the wide promotion matrix: rows are (store_nbr, item_nbr), columns are dates.
promo_2017_train.head()

Unnamed: 0_level_0,date,2016-08-15 00:00:00,2016-08-16 00:00:00,2016-08-17 00:00:00,2016-08-18 00:00:00,2016-08-19 00:00:00,2016-08-20 00:00:00,2016-08-21 00:00:00,2016-08-22 00:00:00,2016-08-23 00:00:00,2016-08-24 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,105574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Report (rows, columns) of both promotion matrices.
print("Shape of time series in training set: ({0}, {1})".format(*promo_2017_train.shape))
print("Shape of time series in testing set: ({0}, {1})".format(*promo_2017_test.shape))

Shape of time series in training set: (170810, 365)
Shape of time series in testing set: (210654, 16)


***Notice that the shape of the training dataframe is (170810, 365), while the shape of the testing dataframe is (210654, 16). That means there are some (store, item) pairs in the testing set that we have not seen in the training set.***

Reorder the rows of the testing set so that its (store, item) pairs are in the same order as in the training set (pairs that only appear in the testing set are dropped). Then concatenate the training and testing frames column-wise, so that they merge into one big matrix that we are going to use to train our model.

In [12]:
# Align the test promotions to the training rows; pairs that have no test
# record get 0, and pairs that only exist in the test set are dropped.
promo_2017_test = promo_2017_test.reindex(index=promo_2017_train.index).fillna(0)
# Place the train and test dates side by side: one row per pair, one column per day.
promo_2017 = pd.concat((promo_2017_train, promo_2017_test), axis="columns")
promo_2017.head()

Unnamed: 0_level_0,date,2016-08-15 00:00:00,2016-08-16 00:00:00,2016-08-17 00:00:00,2016-08-18 00:00:00,2016-08-19 00:00:00,2016-08-20 00:00:00,2016-08-21 00:00:00,2016-08-22 00:00:00,2016-08-23 00:00:00,2016-08-24 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,105574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
start_time = time.time()
# Wide sales matrix: one row per (store_nbr, item_nbr), one column per date,
# with 0 for days that have no record.
df_2017 = (
    train_raw
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Keep only the date level of the (variable, date) column labels.
df_2017.columns = df_2017.columns.get_level_values(1)
elapsed_minutes = (time.time() - start_time) / 60
print("It tooks {0} minutes to manipulate".format(elapsed_minutes))

It tooks 0.7984540700912476 minutes to manipulate


In [17]:
# Preview the wide (log-transformed) sales matrix.
df_2017.head()

Unnamed: 0_level_0,date,2016-08-15 00:00:00,2016-08-16 00:00:00,2016-08-17 00:00:00,2016-08-18 00:00:00,2016-08-19 00:00:00,2016-08-20 00:00:00,2016-08-21 00:00:00,2016-08-22 00:00:00,2016-08-23 00:00:00,2016-08-24 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.0,0.693147,0.693147,0.0,0.0,0.693147,0.0,1.098612,0.0,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.693147,0.0,1.94591,0.693147,1.386294,2.197225,1.386294,2.197225,1.098612,0.0,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.693147,1.386294,1.386294,0.0,1.386294,0.0,1.609438,2.564949,0.0,1.386294,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [76]:
# Cache the wide sales matrix on disk.
# NOTE(review): df_2017 carries a (store_nbr, item_nbr) MultiIndex and date column
# labels — confirm the feather writer in use preserves them (a reset_index() and
# string column names may be required).
feather.write_dataframe(df_2017, "./data/df_2017.feather")

## Feature Engineering

### Numeric Feature

In [15]:
"""
This is an important helper function. Provided by senkin13 from kaggle. 

It returns a data frame, composing of some portion of original dataframe df. 
"""

def get_timespan(df, dt, minus, periods, freq='D'):
    """Select a run of date columns from a wide, date-labelled frame.

    Parameters
    ----------
    df : DataFrame whose column labels are dates.
    dt : reference date.
    minus : number of days before ``dt`` at which the selection starts.
    periods : number of date columns to select.
    freq : spacing between the selected dates (pandas offset alias); daily by default.

    Returns
    -------
    The columns of ``df`` for the ``periods`` dates starting at ``dt - minus`` days.
    """
    first_day = dt - timedelta(days=minus)
    wanted_dates = pd.date_range(first_day, periods=periods, freq=freq)
    return df[wanted_dates]

In [67]:
import pdb  # debugging aid; kept in case other cells rely on it


def prepare_dataset(df, promo, t2017, is_train=True, pre=None):
    """Build the feature matrix (and optionally the targets) for one forecast date.

    Built on the function provided by senkin13 on Kaggle, which already contained:
    - total promotions in the 14/60/140 days before t2017
    - mean sales of the last 3/7/14/30/60/140 days before t2017
    - mean Mon/Tue/Wed/... sales of the previous weeks
    - promotion values of the 16 days starting at t2017

    Features added here:
    - raw sales of each of the 15 days before t2017
    - max/min/std/skew of sales over the last 3/7/14/30/60/140 days
    - mean day-over-day change of sales inside each of those windows
    - the same window statistics for windows ending one week earlier ("_7ago")
    - mean sales per day of week over the last 4/8/16 weeks
    - number of days with a promotion in the last 7/14/30/60/140 days

    Parameters
    ----------
    df : wide sales frame, one column per date.
    promo : wide promotion frame, one column per date, same rows as ``df``.
    t2017 : first day of the 16-day forecast horizon (a ``datetime.date``).
    is_train : when True, also return the targets.
    pre : optional prefix prepended (with "_") to every feature name.

    Returns
    -------
    ``X`` (DataFrame of features) when ``is_train`` is False, otherwise ``(X, y)``
    where ``y`` holds the values of ``df`` for the 16 days starting at ``t2017``.
    """
    X = {
        # Total promotion count per row over the 14/60/140 days before t2017.
        "promo_14_2017": get_timespan(promo, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo, t2017, 140, 140).sum(axis=1).values,
    }

    # Raw sales of each of the 15 days preceding t2017 (day_1 is the day before).
    for i in range(1, 16):
        X['day_%s' % i] = get_timespan(df, t2017, i, 1).values.ravel()

    # Promotion values for t2017 and the 15 days after it.
    # NOTE(review): uint8 wraps above 255 — fine for 0/1 flags; confirm it also
    # holds for the summed (per-item / per-class) promotion frames passed in.
    for i in range(16):
        X["promo_{}".format(i)] = promo[t2017 + timedelta(days=i)].values.astype(np.uint8)

    # Window statistics, first for windows ending at t2017, then for windows
    # ending one week earlier. Each window is sliced once and reused.
    for suffix, anchor in (("", t2017), ("_7ago", t2017 - timedelta(days=7))):
        for i in [3, 7, 14, 30, 60, 140]:
            window = get_timespan(df, anchor, i, i)
            X['mean_{0}{1}'.format(i, suffix)] = window.mean(axis=1).values
            X['max_{0}{1}'.format(i, suffix)] = window.max(axis=1).values
            X['min_{0}{1}'.format(i, suffix)] = window.min(axis=1).values
            X['std_{0}{1}'.format(i, suffix)] = window.std(axis=1).values
            X['diff_{0}{1}'.format(i, suffix)] = window.diff(axis=1).mean(axis=1).values
            X['skew_{0}{1}'.format(i, suffix)] = window.skew(axis=1).values

    # Mean sales on the same day of week over the last 4/8/16 weeks.
    for i in range(7):
        X['mean_4_dow{}'.format(i)] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_8_dow{}'.format(i)] = get_timespan(df, t2017, 56-i, 8, freq='7D').mean(axis=1).values
        X['mean_16_dow{}'.format(i)] = get_timespan(df, t2017, 112-i, 16, freq='7D').mean(axis=1).values

    # Number of days with at least one promotion in the last 7/14/30/60/140 days.
    for i in [7, 14, 30, 60, 140]:
        X["num_pro_{}".format(i)] = (get_timespan(promo, t2017, i, i) > 0).sum(axis=1).values

    X = pd.DataFrame(X)

    # Apply the prefix before branching so that it is honoured for training
    # frames too (it used to be silently ignored when is_train=True).
    if pre:
        X.columns = ["%s_%s" % (pre, c) for c in X.columns]

    if is_train:
        y = df[pd.date_range(t2017, periods=16)].values
        return X, y

    return X

### Categorical Feature

Reindex the stores and items dataframes so that they have the same number of rows as df_2017, in the same (store, item) order.

In [68]:
# Repeat the metadata rows so that each frame has one row per row of df_2017,
# in the same order (store metadata by store_nbr, item metadata by item_nbr).
stores = stores.reindex(df_2017.index.get_level_values("store_nbr"))
items = items.reindex(df_2017.index.get_level_values("item_nbr"))

In [69]:
# Preview the store metadata after it has been expanded to df_2017's rows.
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,18,12,3,13
1,18,12,3,13
1,18,12,3,13
1,18,12,3,13
1,18,12,3,13


In [70]:
# Preview the item metadata after it has been expanded to df_2017's rows.
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,12,1093,0
99197,12,1067,0
103520,12,1028,0
103665,5,2712,1
105574,12,1045,0


In [71]:
# Integer-encode the categorical metadata columns. A single encoder object is
# refit for every column, so afterwards it only remembers the last one.
label = LabelEncoder()
items['family'] = label.fit_transform(items['family'].values)
for store_column in ('city', 'state', 'type'):
    stores[store_column] = label.fit_transform(stores[store_column].values)

- How many sales/promotions there are each day for each item.

In [23]:
# Daily totals per item, summed over all stores: sales and promotion counts.
df_item = df_2017.groupby(level='item_nbr').sum()
df_item_promo = promo_2017.groupby(level="item_nbr").sum()

In [24]:
# Same preview with store_nbr / item_nbr moved from the index into ordinary columns.
df_2017.head().reset_index()

date,store_nbr,item_nbr,2016-08-15 00:00:00,2016-08-16 00:00:00,2016-08-17 00:00:00,2016-08-18 00:00:00,2016-08-19 00:00:00,2016-08-20 00:00:00,2016-08-21 00:00:00,2016-08-22 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
0,1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
2,1,103520,0.0,0.0,0.693147,0.693147,0.0,0.0,0.693147,0.0,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
3,1,103665,0.693147,0.0,1.94591,0.693147,1.386294,2.197225,1.386294,2.197225,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
4,1,105574,0.693147,1.386294,1.386294,0.0,1.386294,0.0,1.609438,2.564949,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


- Sales of each class in each store.

In [25]:
# Start from the wide sales matrix with store_nbr / item_nbr as ordinary columns.
df_2017_store_class = df_2017.reset_index()
# Positional assignment: relies on `items` already being reindexed to df_2017's row order.
df_2017_store_class['class'] = items['class'].values
# Keep the per-row (class, store_nbr) labels; they are used later to map the
# aggregated features back onto the individual rows.
df_2017_store_class_index = df_2017_store_class[['class', 'store_nbr']] 
# Sum the daily sales over all rows sharing the same (class, store_nbr) pair.
df_2017_store_class = df_2017_store_class.groupby(['class', 'store_nbr'])[df_2017.columns].sum()


- How many promotions there are for each class in each store.

In [26]:
# Start from the wide promotion matrix with store_nbr / item_nbr as ordinary columns.
df_2017_promo_store_class = promo_2017.reset_index()
# Positional assignment: relies on `items` already being reindexed to the same row order.
df_2017_promo_store_class['class'] = items['class'].values
# Per-row (class, store_nbr) labels of the promotion matrix.
df_2017_promo_store_class_index = df_2017_promo_store_class[['class', 'store_nbr']]
# Sum the daily promotion values over all rows sharing the same (class, store_nbr) pair.
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()

## Training, Validation, Testing split

### Training set

In [27]:
# Drop the separate train/test promotion matrices; promo_2017 was built from both.
del promo_2017_test, promo_2017_train

In [72]:
start_date = date(2017, 5, 31)
start_time = time.time()
num_nodes = 8
X_TRAIN, Y_TRAIN = [], []

# NOTE(review): with num_nodes = 8 the last forecast dates are 2017-07-12 and
# 2017-07-19, whose 16-day targets run into the validation horizon that starts
# on 2017-07-26 — confirm this overlap is intended.
for i in range(num_nodes):
    # Slide the forecast date forward by one week per iteration.
    delta = timedelta(days=7 * i)
    window_date = start_date + delta

    X_sales, y_sales = prepare_dataset(df_2017, promo_2017, window_date)

    # Per-item aggregates, broadcast back onto the (store, item) rows.
    X_item = prepare_dataset(df_item, df_item_promo, window_date, is_train=False, pre="item")
    X_item.index = df_item.index
    X_item = X_item.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    # Per-(class, store) aggregates for the SAME forecast date as the other
    # features. This used a fixed date(2017, 7, 26), which gave every training
    # window features computed at the validation date.
    X_store = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, window_date, is_train=False, pre='store_class')
    X_store.index = df_2017_store_class.index
    X_store = X_store.reindex(df_2017_store_class_index).reset_index(drop=True)

    # Add all categorical features including family, perishable, ... from items and stores.
    X_train = pd.concat([X_sales, X_item, X_store, items.reset_index(), stores.reset_index()], axis=1)

    X_TRAIN.append(X_train)
    Y_TRAIN.append(y_sales)

print("It tooks {0} seconds to run".format((time.time()-start_time)))

It tooks 140.55507898330688 seconds to run


In [81]:
# Stack the weekly windows into one training matrix. Every window carries the
# same 0..n-1 row labels, so ignore_index=True is used to get a clean default
# index instead of duplicated labels before the frame is written to feather.
X_TRAIN = pd.concat(X_TRAIN, axis=0, ignore_index=True)
# Targets are plain arrays, stacked in the same window order as X_TRAIN.
Y_TRAIN = np.concatenate(Y_TRAIN, axis=0)

Save the preprocessed training set, so that we won't need to process it again and again during training. 

In [85]:
# Wrap the stacked target array in a DataFrame before it is written to feather below.
Y_TRAIN = pd.DataFrame(Y_TRAIN)

In [88]:
# Persist the training features and targets.
feather.write_dataframe(X_TRAIN, "./data/df_x_train.feather")
feather.write_dataframe(Y_TRAIN, "./data/df_y_train.feather")

In [89]:
# Drop the in-memory training matrices once they have been written to disk.
del X_TRAIN, Y_TRAIN

### Validation and Test Set

In [90]:
# Preview the combined (train + test dates) promotion matrix.
promo_2017.head()

Unnamed: 0_level_0,date,2016-08-15 00:00:00,2016-08-16 00:00:00,2016-08-17 00:00:00,2016-08-18 00:00:00,2016-08-19 00:00:00,2016-08-20 00:00:00,2016-08-21 00:00:00,2016-08-22 00:00:00,2016-08-23 00:00:00,2016-08-24 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,103665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,105574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
def _aggregate_features(forecast_date):
    """Item-level and (class, store)-level features aligned to df_2017's rows.

    Both aggregated feature frames are computed for ``forecast_date`` and then
    reindexed so that they have one row per row of ``df_2017``, in the same order.

    Returns
    -------
    (item_features, store_class_features)
    """
    item_features = prepare_dataset(df_item, df_item_promo, forecast_date, is_train=False, pre='item')
    item_features.index = df_item.index
    item_features = item_features.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    store_class_features = prepare_dataset(
        df_2017_store_class, df_2017_promo_store_class, forecast_date, is_train=False, pre='store_class')
    store_class_features.index = df_2017_store_class.index
    store_class_features = store_class_features.reindex(df_2017_store_class_index).reset_index(drop=True)

    return item_features, store_class_features


# Validation: the 16 days starting 2017-07-26 (targets are available).
val_date = date(2017, 7, 26)
X_VAL, y_VAL = prepare_dataset(df_2017, promo_2017, val_date)
val_items, val_stores = _aggregate_features(val_date)
X_VAL = pd.concat([X_VAL, val_items, val_stores, items.reset_index(), stores.reset_index()], axis=1)

# Test: the 16 days starting 2017-08-16 (no targets, features only).
test_date = date(2017, 8, 16)
X_TEST = prepare_dataset(df_2017, promo_2017, test_date, is_train=False)
test_items, test_stores = _aggregate_features(test_date)
X_TEST = pd.concat([X_TEST, test_items, test_stores, items.reset_index(), stores.reset_index()], axis=1)

In [106]:
# Wrap the validation target array in a DataFrame before it is written to feather below.
y_VAL = pd.DataFrame(y_VAL)

In [107]:
#X_VAL = pd.concat([X_VAL, val_items, val_stores, items.reset_index(), stores.reset_index()], axis=1)
#X_TEST = pd.concat([X_TEST, test_items, test_stores, items.reset_index(), stores.reset_index()], axis=1)

In [245]:
# Persist the validation features/targets and the test features.
feather.write_dataframe(X_VAL, "./data/X_val.feather")
feather.write_dataframe(y_VAL, "./data/y_val.feather")
feather.write_dataframe(X_TEST, "./data/X_test.feather")