In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns

# Importing, Cleaning, & Preprocessing Training Data

In [2]:
# Importing Data
data=pd.read_csv('train.csv', sep=',')

In [3]:
# Cleaning
data[['country','store','product']]=data[['country','store','product']].astype('category')
data['date']=pd.to_datetime(data['date'])

In [4]:
#Preprocessing
## Creating Day of week, month, year, day of year, week of year columns. 
data['dayofweek']=data['date'].dt.weekday
data['month']=data['date'].dt.month
data['year']=data['date'].dt.year
data['dayofyear']=data['date'].dt.dayofyear
data['weekofyear']=data['date'].dt.isocalendar().week
data['weekofyear']=data['weekofyear'].astype('int')
data['date']=data['date'].astype('int')



In [5]:
## Replacing NaN numbers sold with 0. 
data['num_sold']=data['num_sold'].fillna(0)

In [6]:
# Creating Time Series Split for Cross Validation
tscv=TimeSeriesSplit(n_splits=5)
for train_index, validation_index in tscv.split(data):
    train = data.iloc[train_index]    
    validation = data.iloc[validation_index]
# Creating () & () for training & validation set
x_train = train[['date','country', 'store', 'product', 'dayofweek', 'month', 'year', 'dayofyear', 'weekofyear']]
y_train = train['num_sold']
x_validation=validation[['date','country', 'store', 'product', 'dayofweek', 'month', 'year', 'dayofyear', 'weekofyear']]
y_validation = validation['num_sold']


In [7]:
## Converting data over to Dmatrix, so categorical data can be used. 
dtrain=xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
dvalid=xgb.DMatrix(x_validation, label=y_validation, enable_categorical=True)

# Model Creating

In [8]:
# setting up parameters to be used in model
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',  # Use 'reg:squarederror' instead of 'reg:linear' (deprecated)
    'max_depth': 3,
    'learning_rate': 0.01,
    'base_score': 0.5,
    'eval_metric': 'mape',  # Metric for evaluation
    'enable_categorical': True,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [9]:
# Creating Model
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
    evals=watchlist
)

[0]	train-mape:inf	eval-mape:inf
[1]	train-mape:inf	eval-mape:inf
[2]	train-mape:inf	eval-mape:inf
[3]	train-mape:inf	eval-mape:inf
[4]	train-mape:inf	eval-mape:inf
[5]	train-mape:inf	eval-mape:inf
[6]	train-mape:inf	eval-mape:inf
[7]	train-mape:inf	eval-mape:inf
[8]	train-mape:inf	eval-mape:inf
[9]	train-mape:inf	eval-mape:inf
[10]	train-mape:inf	eval-mape:inf
[11]	train-mape:inf	eval-mape:inf
[12]	train-mape:inf	eval-mape:inf
[13]	train-mape:inf	eval-mape:inf
[14]	train-mape:inf	eval-mape:inf
[15]	train-mape:inf	eval-mape:inf
[16]	train-mape:inf	eval-mape:inf
[17]	train-mape:inf	eval-mape:inf
[18]	train-mape:inf	eval-mape:inf
[19]	train-mape:inf	eval-mape:inf
[20]	train-mape:inf	eval-mape:inf
[21]	train-mape:inf	eval-mape:inf
[22]	train-mape:inf	eval-mape:inf
[23]	train-mape:inf	eval-mape:inf
[24]	train-mape:inf	eval-mape:inf
[25]	train-mape:inf	eval-mape:inf
[26]	train-mape:inf	eval-mape:inf
[27]	train-mape:inf	eval-mape:inf
[28]	train-mape:inf	eval-mape:inf
[29]	train-mape:inf	eval

Parameters: { "enable_categorical" } are not used.



[41]	train-mape:inf	eval-mape:inf
[42]	train-mape:inf	eval-mape:inf
[43]	train-mape:inf	eval-mape:inf
[44]	train-mape:inf	eval-mape:inf
[45]	train-mape:inf	eval-mape:inf
[46]	train-mape:inf	eval-mape:inf
[47]	train-mape:inf	eval-mape:inf
[48]	train-mape:inf	eval-mape:inf
[49]	train-mape:inf	eval-mape:inf
[50]	train-mape:inf	eval-mape:inf


In [10]:
# Creating Submission
test=pd.read_csv('test.csv', sep=',')

## Cleaning
test[['country','store','product']]=test[['country','store','product']].astype('category')
test['date']=pd.to_datetime(test['date'])

## Creating Day of week, month, year, day of year, week of year columns. 
test['dayofweek']=test['date'].dt.weekday
test['month']=test['date'].dt.month
test['year']=test['date'].dt.year
test['dayofyear']=test['date'].dt.dayofyear
test['weekofyear']=test['date'].dt.isocalendar().week
test['weekofyear']=test['weekofyear'].astype('int')
test['date']=test['date'].astype('int')

## Preprocessing
data2test=test[['date','country', 'store', 'product', 'dayofweek', 'month', 'year', 'dayofyear', 'weekofyear']]

Eval = xgb.DMatrix(data2test, enable_categorical=True)

## Creating Predictions
predict = model.predict(Eval)

## Submission Dataframe
submission= pd.DataFrame({'id':test['id'],
                     'num_sold': predict
                    })
submission=submission.set_index('id')

## Submission File
submission.to_csv('submission.csv')

