# Training of Model

## Import libraries

In [1]:
import pandas as pd

## Load data

In [2]:
trainD = pd.read_csv('Data/train.csv', parse_dates=['date'])
testD = pd.read_csv('Data/test.csv', parse_dates=['date'])
isHoliday = pd.read_csv('Data/holidays_events.csv', parse_dates=['date'])

In [3]:
trainD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [4]:
isHoliday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


## Filters Earthquake dates

In [5]:
trainDWOEarthQuake = trainD[(trainD['date'] < '2016-04-16') | (trainD['date'] >= '2016-07-16')]

## Adds Holiday column

In [6]:
#Drops the rows related to holidays that were transferred
isHoliday = isHoliday.drop(isHoliday[isHoliday.transferred == True].index)

In [7]:
#New column which assigns 1 if the date is holiday or 0 if is not.
trainDWOEarthQuake = trainDWOEarthQuake.assign(is_holiday=0)

#Makes a list with the ordered dates in holidays
isHoliday = isHoliday.sort_values('date',ascending=True)
holiday_dates = isHoliday['date'].to_list()

i = 0

for index, row in trainDWOEarthQuake.iterrows():
    if row['date'].strftime('%Y-%m-%d') > holiday_dates[i].strftime('%Y-%m-%d'):
        i+=1
    if row['date'].strftime('%Y-%m-%d') == holiday_dates[i].strftime('%Y-%m-%d') and i < len(holiday_dates):
        trainDWOEarthQuake['is_holiday'].loc[index]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Encoding

In [8]:
#Categorical encoding for variables with categories

#family
Xaux = trainDWOEarthQuake['family'].astype('category').cat.codes
trainDWOEarthQuake['family'] = Xaux

In [9]:
#New column month of sale type int
trainDWOEarthQuake['month']=trainDWOEarthQuake.apply(lambda row: int(row['date'].strftime('%m')),axis=1)


## Final training (X) and result (y) data

In [10]:
y = trainDWOEarthQuake['sales']

X = trainDWOEarthQuake.drop(['date','sales'],axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2838726 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype
---  ------       -----
 0   id           int64
 1   store_nbr    int64
 2   family       int8 
 3   onpromotion  int64
 4   is_holiday   int64
 5   month        int64
dtypes: int64(5), int8(1)
memory usage: 132.7 MB


## Training model

In [11]:
# Min max normalization.  
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
X = sc.fit_transform(X)


#Split estratificado 80-20 entre datos de entreanmiento y de testeo
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 




#MLP Regressor
#from sklearn.neural_network import MLPRegressor
#regr = MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',activation='logistic').fit(X_train, y_train)

#Gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
regr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, 
                                 random_state=0,loss='squared_error').fit(X_train, y_train)

In [12]:
#Test
from sklearn.metrics import mean_squared_log_error
#regr.score(X_test, y_test)
print(regr.predict(X_test))
print(y_test)
mean_squared_log_error(y_test,y_pred)

[1334.81130389  338.14187636  233.4443923  ... 1080.30504704   93.26034225
  338.14187636]
2648060     841.000
385345        0.000
1416524       4.000
2002569    6974.713
353888       95.009
             ...   
486856     1095.000
1324788     478.000
1796985    6359.000
394144      117.000
1416963     626.000
Name: sales, Length: 567746, dtype: float64
