In [1]:
import numpy as np
import pandas as pd

from sklearn.gaussian_process import GaussianProcessRegressor
import sklearn.gaussian_process.kernels as K

import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Window configuration.
#   n   - number of history days fed to each model (the fitting code uses the first 280 rows)
#   h   - number of days forecast after the history window (F1..F28)
#   end - exclusive upper row position of the calendar slice
#         (presumably the number of rows in calendar.csv -- confirm)
# `start` is derived so that the slice [start, end) spans exactly n + h days.
n, h = 280, 28
end = 1969
start = end - (n + h)

## Training data

In [3]:
# Load the daily unit-sales table.
#
# The day columns in the CSV are named "d_<N>" (later cells slice
# 'd_1662':'d_1941'), so the dtype mapping has to use that prefix.  The previous
# keys were bare numbers ("350", "351", ...), which match no column, so the
# float32 request was silently ignored.
tr_last = 1913      # last day column that gets the float32 dtype
max_lags = 57       # not used in this cell; kept for any later code that reads it
start_day = 350     # first day column that gets the float32 dtype

numcols = ["d_" + str(day) for day in range(start_day, tr_last + 1)]

dtype = {numcol: "float32" for numcol in numcols}

train = pd.read_csv("m5-forecasting-accuracy/sales_train_evaluation.csv", dtype=dtype)

In [4]:
# First part of the output table: the id column next to the observed values
# for days d_1914..d_1941 (28 columns).  Judging by the variable name these
# values are placeholders -- confirm before relying on them.
ids = train[['id']]                                # single-column DataFrame
useless_slice = train.loc[:, 'd_1914':'d_1941']    # label slice, inclusive at both ends

# Both pieces share train's index, so an index join lines them up column-wise.
df = ids.join(useless_slice)

## Calendar 

In [5]:
# Column dtypes requested when reading calendar.csv.
CAL_DTYPES = {
    "weekday": "category",
    'wm_yr_wk': 'int16',
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "snap_CA": "float32",
    'snap_TX': 'float32',
    'snap_WI': 'float32',
}

calendar = pd.read_csv("m5-forecasting-accuracy/calendar.csv", dtype=CAL_DTYPES)

# Parse the date strings into datetime64 values.
calendar["date"] = pd.to_datetime(calendar["date"])

# Replace every categorical column by its integer category code (int16).
# Missing values get code -1; subtracting the column minimum shifts the codes
# so that the smallest one becomes 0.
for col, col_dtype in CAL_DTYPES.items():
    if col_dtype != "category":
        continue
    codes = calendar[col].cat.codes.astype("int16")
    calendar[col] = codes - codes.min()

# One list of event codes per calendar row in positions [start, end).
event_columns = calendar.loc[:, 'event_name_1':'event_type_2']
events = event_columns.iloc[start:end].values.tolist()

In [6]:
#print(len(events))

In [7]:
# SNAP indicator per state for calendar rows in positions [start, end).
# Each value is wrapped in its own one-element list so it can be appended
# to a per-day feature row with list concatenation.
snap_CA, snap_TX, snap_WI = (
    [[flag] for flag in calendar[snap_col].iloc[start:end].tolist()]
    for snap_col in ('snap_CA', 'snap_TX', 'snap_WI')
)

In [8]:
#print(len(snap_CA))

# Second 30490 rows of submission

In [9]:
# Per-series lookup columns (this rebinds `ids` to a Series).
ids = train['id']
store_ids = train['store_id']

# Sales history used for fitting: day columns d_1662 through d_1941 inclusive.
training_data = train.loc[:, 'd_1662':'d_1941']

In [10]:
# GP covariance: an RBF term plus the product of a periodic (ExpSineSquared)
# term and a RationalQuadratic term.  Written with the explicit Sum/Product
# kernel classes, which is what the `+` and `*` operators construct.
periodic_component = K.Product(K.ExpSineSquared(), K.RationalQuadratic())
kernel = K.Sum(K.RBF(), periodic_component)

In [11]:
# Fit one Gaussian-process regressor per series and collect its forecast.
#
# For every series i the feature row of a day is
#     [calendar row position] + [event codes] + [SNAP flag of the series' state]
# The model is fitted on the first `n` days of the window and asked to predict
# the whole window; the predictions after the first `n` days are kept as the
# forecast.  Each entry of `matrix` is [series id, forecast_1, ..., forecast_h].
#
# Changes relative to the previous version of this cell:
#   * the store is looked up for row i (it was always read from row 0);
#   * the train/forecast split uses `n` instead of a repeated literal 280;
#   * the SNAP column is chosen from the state prefix of store_id, and an
#     unknown state raises instead of silently dropping the SNAP feature;
#   * the time/event features do not depend on i and are built once;
#   * the unused t1/t2 timing variables are gone;
#   * random_state is fixed so the optimizer restarts are reproducible.

n_series = 11  # number of rows of training_data to forecast

# Loop-invariant part of the design matrix.
base_features = [
    [day] + event_codes
    for day, event_codes in zip(range(start, end), events)
]

# SNAP indicator lists keyed by the state prefix of store_id ("CA_1" -> "CA").
snap_by_state = {'CA': snap_CA, 'TX': snap_TX, 'WI': snap_WI}

matrix = []

for i in range(n_series):
    store_id = store_ids.loc[i]
    state = store_id.split('_')[0]
    if state not in snap_by_state:
        raise ValueError("no SNAP column for store_id %r" % (store_id,))

    # Full window (history + forecast days) for this series.
    x = [
        features + snap_flag
        for features, snap_flag in zip(base_features, snap_by_state[state])
    ]

    # History part only; must line up with the n columns of training_data.
    X = x[:n]
    Y = training_data.loc[i, :].tolist()

    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=9,
        random_state=0,  # restarts draw random initial hyperparameters
    )
    gp.fit(X, Y)

    Y_pred = gp.predict(x, return_std=False)

    # Everything after the history window is the forecast.
    predictions = Y_pred[n:]
    row = [ids.loc[i]] + predictions.tolist()

    matrix.append(row)

In [12]:
# Turn the collected rows into a table: the id followed by F1..F28.
forecast_columns = ["F{}".format(step) for step in range(1, 29)]
frame = pd.DataFrame(matrix, columns=["id"] + forecast_columns)
frame.shape

(11, 29)

## Writing dataframe to .xlsx

### First part:

In [13]:
print(df.shape)

# Give the first part the same header as the forecast frame: id, F1..F28.
columns = ["id"]
columns.extend("F" + str(step) for step in range(1, 29))
df.columns = columns

(30490, 29)


### Combining:

In [14]:
# Stack the forecast rows under the first part.
# ignore_index=True renumbers the rows 0..len-1; without it the rows coming
# from `frame` keep their own labels, which duplicate labels already in `df`.
# NOTE: this rebinds `df`, so running the cell a second time appends again.
df = pd.concat([df, frame], ignore_index=True)

df.shape

(30501, 29)

In [15]:
# Write the combined table to output.xlsx without the index column.
# The context manager saves and closes the workbook on exit, including when
# to_excel raises; ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, index=False)

print('DataFrame is written successfully to Excel File.')

DataFrame is written successfully to Excel File.
