In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

# Import all utility functions
import utils

N_PROC = 4

stores = pd.read_csv('../data/store.csv')
train = pd.read_csv('../data/train.csv', parse_dates = ['Date'])
test = pd.read_csv('../data/test.csv', parse_dates = ['Date'])

  interactivity=interactivity, compiler=compiler, result=result)


## Data Pre-Processing and Feature Extraction

In [2]:
def process(input_data, store_data, max_comp_distance=100000, sort_by=None):
    
    # Create a copy of the data
    data = input_data.copy()
    
    if sort_by:
        data.sort_values(by=sort_by, inplace=True)
    
    # Merge the Store information to the data
    data = data.merge(store_data, on='Store')
    # data.drop(['Store'], axis=1, inplace=True)
    
    # Process the Date field
    data['year'] = data.Date.apply(lambda x: x.year)
    data['month'] = data.Date.apply(lambda x: x.month)
    data['day'] = data.Date.apply(lambda x: x.day)
    data['woy'] = data.Date.apply(lambda x: x.weekofyear)
    data.drop(['Date'], axis = 1, inplace=True)
    
    # Normalize Competition Distance
    data['CompetitionDistance'] = data.CompetitionDistance.fillna(max_comp_distance)
    
    # Process the Competition Open fields
    # data['CompetitionOpen'] = 12 * (data.year - data.CompetitionOpenSinceYear) + (data.month - data.CompetitionOpenSinceMonth)
    # data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], axis=1, inplace=True)
    
    # Process the Promo Open field
    # data['PromoOpen'] = 12 * (data.year - data.Promo2SinceYear) + (data.woy - data.Promo2SinceWeek) / float(4)
    # data['PromoOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['Promo2SinceYear', 'Promo2SinceWeek'], axis=1, inplace=True)
    
    # Normalize State Holiday field
    data['StateHoliday'] = data.StateHoliday.apply(lambda x: x if x in ['a', 'b', 'c'] else 0)
    
    # Dummy Coding
    for dummy in ['StateHoliday', 'StoreType', 'Assortment']:
        # Create dummy columns
        data = pd.get_dummies(data, columns=[dummy])
        
        # Remove original column
        if dummy in data.columns:
            data.drop([dummy], axis=1, inplace=True)
    
    # Fix State Holiday columns, some values are not present in the testing data
    for col in ['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']:
        if col not in data.columns:
            data[col] = np.zeros(len(data.index))
    
    # Drop unused Columns
    data.drop(['PromoInterval'], axis=1, inplace=True)
    
    # Make sure columns are sorted
    data = data.reindex_axis(sorted(data.columns), axis=1)
    
    # training data
    if 'Sales' in data.columns:
        
        # Remove NaN values
        data.fillna(0, inplace=True)
    
        # Consider only open stores for training. Closed stores wont count into the score
        data = data[data.Open != 0]
    
        # Use only Sales bigger then zero
        data = data[data.Sales > 0]

        return data.drop(['Sales', 'Customers'], axis=1), data.Sales
    
    # testing data
    else:
        # Remove NaN values
        data.Open.fillna(1, inplace=True)
        data.fillna(0, inplace=True)
        
        return data.drop(['Id'], axis=1),

In [3]:
X_train, y_train = process(train, stores)

## Regression - Cross Validation

In [None]:
from sklearn import cross_validation as cv
from sklearn.ensemble import RandomForestRegressor

# Random Forest Classifier
clf = RandomForestRegressor(n_estimators=100, random_state=42)

folds = cv.KFold(n=len(y_train), n_folds=4, shuffle=True, random_state=42)
scores = cv.cross_val_score(clf, X_train.values, y_train.values, scoring=utils.rmspe_scorer, cv=folds, n_jobs=N_PROC)
scores.mean()

In [11]:
from sklearn import cross_validation as cv
from sklearn.neighbors import KNeighborsRegressor

# KNN Classifier
clf = KNeighborsRegressor(n_neighbors=5)

folds = cv.KFold(n=len(y_train), n_folds=4, shuffle=True, random_state=42)
scores = cv.cross_val_score(clf, X_train.values, y_train.values, scoring=utils.rmspe_scorer, cv=folds, n_jobs=N_PROC)
scores.mean()

-0.30693382770253269

## Regression - Full Training

In [10]:
X_test, = process(test, stores)

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Classifier
clf = RandomForestRegressor(n_estimators=30)

clf.fit(X_train.values, y_train.values)
y_pred = clf.predict(X_test.values)

In [9]:
import time, datetime

result = pd.DataFrame({'Id': test['Id'], 'Sales': y_pred})

st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
result.to_csv('submission_%s.csv' % st, index=False)

In [18]:
y_train.values[0:30]

array([5263, 5020, 4782, 5011, 6102,    0, 4364, 3706, 3769, 3464, 3558,
       4395,    0, 4406, 4852, 4427, 4767, 5042, 5054,    0, 3530, 3808,
       3897, 3797, 3650, 4359,    0, 4797, 4665, 5558])

In [20]:
X_train

Unnamed: 0,Assortment_a,Assortment_b,Assortment_c,CompetitionDistance,DayOfWeek,Open,Promo,Promo2,SchoolHoliday,StateHoliday_0,...,StateHoliday_c,Store,StoreType_a,StoreType_b,StoreType_c,StoreType_d,day,month,woy,year
0,1,0,0,1270,5,1,1,0,1,1,...,0,1,0,0,1,0,31,7,31,2015
1,1,0,0,1270,4,1,1,0,1,1,...,0,1,0,0,1,0,30,7,31,2015
2,1,0,0,1270,3,1,1,0,1,1,...,0,1,0,0,1,0,29,7,31,2015
3,1,0,0,1270,2,1,1,0,1,1,...,0,1,0,0,1,0,28,7,31,2015
4,1,0,0,1270,1,1,1,0,1,1,...,0,1,0,0,1,0,27,7,31,2015
5,1,0,0,1270,7,0,0,0,0,1,...,0,1,0,0,1,0,26,7,30,2015
6,1,0,0,1270,6,1,0,0,0,1,...,0,1,0,0,1,0,25,7,30,2015
7,1,0,0,1270,5,1,0,0,0,1,...,0,1,0,0,1,0,24,7,30,2015
8,1,0,0,1270,4,1,0,0,0,1,...,0,1,0,0,1,0,23,7,30,2015
9,1,0,0,1270,3,1,0,0,0,1,...,0,1,0,0,1,0,22,7,30,2015
