In [1]:
# https://www.kaggle.com/siavrez/simple-eda-with-croston-method

In [2]:
# imports
import pandas as pd
import numpy as np

In [3]:
# Read the three input tables in one pass; the file names are relative to
# the notebook's working directory.
sell_prices, calendar, sales_train_validation = [
    pd.read_csv(csv_name)
    for csv_name in ('sell_prices.csv', 'calendar.csv', 'sales_train_validation.csv')
]

# Sample submission file, loaded alongside the input tables.
sample_subm = pd.read_csv('sample_submission.csv')

In [4]:
#https://medium.com/analytics-vidhya/croston-forecast-model-for-intermittent-demand-360287a17f5f
def Croston(ts, extra_periods, alpha):
    """Forecast intermittent demand with Croston's method.

    Two quantities are tracked: the demand level (``a``) and the periodicity
    between positive demands (``p``). Both are exponentially smoothed with
    factor ``alpha`` and are only updated in periods where demand is
    positive; otherwise the previous values are carried forward. The
    forecast for each period is ``level / periodicity``.

    Parameters
    ----------
    ts : array-like
        Historical demand, one value per period.
    extra_periods : int
        Number of future periods to append after the history (>= 0).
        ``0`` returns the in-sample forecasts only.
    alpha : float
        Smoothing factor applied to both level and periodicity
        (presumably in [0, 1] -- it is not validated here).

    Returns
    -------
    pandas.DataFrame
        ``len(ts) + extra_periods`` rows with columns ``Demand``,
        ``Forecast``, ``Period``, ``Level`` and ``Error``
        (``Demand - Forecast``). Future rows have NaN demand and repeat the
        last computed level, periodicity and forecast. If ``ts`` is empty,
        every column is NaN.

    Raises
    ------
    ValueError
        If ``extra_periods`` is negative.
    """
    if extra_periods < 0:
        raise ValueError("extra_periods must be non-negative")

    d = np.array(ts)  # Transform the input into a numpy array
    cols = len(d)  # Historical period length
    total = cols + extra_periods
    # Append np.nan into the demand array to cover future periods
    d = np.append(d, [np.nan] * extra_periods)

    # level (a), periodicity (p) and forecast (f)
    a, p, f = np.full((3, total), np.nan)
    q = 1  # periods since last demand observation

    if cols > 0:
        # Initialization: seed the level with the first positive demand and
        # the periodicity with how long it took to see it. np.argmax returns
        # 0 when there is no positive demand, so an all-zero history starts
        # (and stays) at a zero forecast.
        first_occurence = np.argmax(d[:cols] > 0)
        a[0] = d[first_occurence]
        p[0] = 1 + first_occurence
        f[0] = a[0] / p[0]

        # Create all the t+1 forecasts. When extra_periods == 0 there is no
        # slot for the forecast following the last observation, so stop one
        # step early instead of writing past the end of the arrays.
        for t in range(min(cols, total - 1)):
            if d[t] > 0:
                a[t + 1] = alpha * d[t] + (1 - alpha) * a[t]
                p[t + 1] = alpha * q + (1 - alpha) * p[t]
                f[t + 1] = a[t + 1] / p[t + 1]
                q = 1
            else:
                a[t + 1] = a[t]
                p[t + 1] = p[t]
                f[t + 1] = f[t]
                q += 1

        # Future forecast: hold the last computed values flat.
        if extra_periods > 0:
            a[cols + 1:] = a[cols]
            p[cols + 1:] = p[cols]
            f[cols + 1:] = f[cols]

    df = pd.DataFrame.from_dict(
        {"Demand": d, "Forecast": f, "Period": p, "Level": a, "Error": d - f}
    )
    return df

In [5]:
# Select the daily demand columns d_1 ... d_1913 from the training frame.
days = range(1, 1914)
time_series_columns = ['d_' + str(day) for day in days]
time_series_data = sales_train_validation.loc[:, time_series_columns]


def _croston_tail(row):
    # Keep only the 28 appended future-period forecasts for one row.
    return Croston(row, extra_periods=28, alpha=0.1)['Forecast'].tail(28)


# One Croston fit per row; the result has one column per forecast period.
forecast_ = time_series_data.apply(_croston_tail, axis=1)

# Rename the forecast columns to F1 ... F28.
cols = [f'F{step}' for step in range(1, 29)]
forecast_.columns = cols

# Every validation id gets a matching evaluation id, and the same forecasts
# are written for both halves.
validation_ids = sales_train_validation['id'].values
evaluation_ids = [row_id.replace('validation', 'evaluation') for row_id in validation_ids]
ids = np.concatenate((validation_ids, evaluation_ids))
predictions = pd.DataFrame({'id': ids})
forecast = pd.concat([forecast_, forecast_], ignore_index=True)
predictions = pd.concat((predictions, forecast), axis=1)
predictions.to_csv('submission.csv', index=False)