In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from fastai import *
from fastai.tabular import *

# Rossmann

## Data Preparation

To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run `rossman_data_clean.ipynb`. One important step that deals with time series is this
    
```python
add_datepart(train, \"Date\", drop=False)
add_datepart(test, \"Date\", drop=False)
```

In [4]:
path = Path('data/rossmann/')
train_df = pd.read_pickle(path/'train_clean')

In [5]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
Store,1,2,3,4,5
DayOfWeek,5,5,5,5,5
Date,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00
Sales,5263,6064,8314,13995,4822
Customers,555,625,821,1498,559
Open,1,1,1,1,1
Promo,1,1,1,1,1
StateHoliday,False,False,False,False,False
SchoolHoliday,1,1,1,1,1


In [6]:
n = len(train_df); n

844338

# Experimenting with a sample

In [7]:
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]

In [8]:
small_train_df.head()


Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
539,541,5,,650.0,61,9015
1117,5,4,,29910.0,51,4943
1281,169,4,"Feb,May,Aug,Nov",980.0,73,7626
1333,221,4,,13530.0,86,6865
1698,587,4,"Jan,Apr,Jul,Oct",330.0,59,11542


In [9]:
small_test_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
419105,536,4,"Feb,May,Aug,Nov",4700.0,56,9049
419206,637,4,"Feb,May,Aug,Nov",9790.0,67,9102
421738,943,2,,18020.0,47,7382
421842,1047,2,"Feb,May,Aug,Nov",3750.0,47,6311
422800,891,1,"Feb,May,Aug,Nov",350.0,54,13924


In [10]:
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)

In [11]:
small_test_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
419105,536.0,4,"Feb,May,Aug,Nov",4700.0,56,9049
419206,,4,"Feb,May,Aug,Nov",9790.0,67,9102
421738,943.0,2,,18020.0,47,7382
421842,1047.0,2,"Feb,May,Aug,Nov",3750.0,47,6311
422800,,1,"Feb,May,Aug,Nov",350.0,54,13924


In [12]:
small_train_df.PromoInterval.cat.categories

Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')

In [13]:
small_train_df['PromoInterval'].cat.codes[:5]

539    -1
1117   -1
1281    0
1333   -1
1698    1
dtype: int8

In [14]:
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)

Exception: There are nan values in field CompetitionDistance but there were none in the training set. 
                Please fix those manually.

In [15]:
small_train_df[small_train_df['CompetitionDistance_na'] == True]

KeyError: 'CompetitionDistance_na'