# Training of Model

## Import libraries

In [74]:
import pandas as pd

## Load data

In [75]:
trainD = pd.read_csv('Data/train.csv', parse_dates=['date'])
testD = pd.read_csv('Data/test.csv', parse_dates=['date'])
isHoliday = pd.read_csv('Data/holidays_events.csv', parse_dates=['date'])

In [76]:
trainD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [77]:
isHoliday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


## Filters Earthquake dates

In [78]:
trainDWOEarthQuake = trainD[(trainD['date'] < '2016-04-16') | (trainD['date'] >= '2016-07-16')]

## Adds Holiday column

In [79]:
#Drops the rows related to holidays that were transferred
isHoliday = isHoliday.drop(isHoliday[isHoliday.transferred == True].index)

In [7]:
#New column which assigns 1 if the date is holiday or 0 if is not.
trainDWOEarthQuake = trainDWOEarthQuake.assign(is_holiday=0)

#Makes a list with the ordered dates in holidays
isHoliday = isHoliday.sort_values('date',ascending=True)
holiday_dates = isHoliday['date'].to_list()

i = 0

for index, row in trainDWOEarthQuake.iterrows():
    if row['date'].strftime('%Y-%m-%d') > holiday_dates[i].strftime('%Y-%m-%d'):
        i+=1
    if row['date'].strftime('%Y-%m-%d') == holiday_dates[i].strftime('%Y-%m-%d') and i < len(holiday_dates):
        trainDWOEarthQuake['is_holiday'].loc[index]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Adds store "type" column

In [80]:
stores = pd.read_csv('Data/stores.csv')
stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   store_nbr  54 non-null     int64 
 1   city       54 non-null     object
 2   state      54 non-null     object
 3   type       54 non-null     object
 4   cluster    54 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.2+ KB


In [81]:
def type_fun(row):  
    return stores['type'].loc[stores['store_nbr']==row['store_nbr']].to_list()[0]


#stores.loc[stores['store_nbr']==trainDWOEarthQuake.loc[trainDWOEarthQuake['id']==0]['store_nbr']]
#stores['type'].loc[stores['store_nbr']==trainDWOEarthQuake.loc[trainDWOEarthQuake['id']==0]['store_nbr'].to_list()[0]]
trainDWOEarthQuake['type']=trainDWOEarthQuake.apply(lambda row:type_fun(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDWOEarthQuake['type']=trainDWOEarthQuake.apply(lambda row:type_fun(row),axis=1)


In [82]:
trainDWOEarthQuake.head(30)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,type
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,D
1,1,2013-01-01,1,BABY CARE,0.0,0,D
2,2,2013-01-01,1,BEAUTY,0.0,0,D
3,3,2013-01-01,1,BEVERAGES,0.0,0,D
4,4,2013-01-01,1,BOOKS,0.0,0,D
5,5,2013-01-01,1,BREAD/BAKERY,0.0,0,D
6,6,2013-01-01,1,CELEBRATION,0.0,0,D
7,7,2013-01-01,1,CLEANING,0.0,0,D
8,8,2013-01-01,1,DAIRY,0.0,0,D
9,9,2013-01-01,1,DELI,0.0,0,D


## Encoding

In [8]:
#Categorical encoding for variables with categories

#family
Xaux = trainDWOEarthQuake['family'].astype('category').cat.codes
trainDWOEarthQuake['family'] = Xaux

In [25]:
#New column day of year type int.  (Month-1)*31 + day
trainDWOEarthQuake['month']=trainDWOEarthQuake.apply(lambda row: (int(row['date'].strftime('%m'))-1)*31 + int(row['date'].strftime('%d')),axis=1)


## Final training (X) and result (y) data

In [55]:
y = trainDWOEarthQuake['sales']

X = trainDWOEarthQuake.drop(['id','date','sales','month','is_holiday'],axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2838726 entries, 0 to 3000887
Data columns (total 4 columns):
 #   Column       Dtype
---  ------       -----
 0   store_nbr    int64
 1   family       int8 
 2   onpromotion  int64
 3   is_holiday   int64
dtypes: int64(3), int8(1)
memory usage: 89.3 MB


## Training model

In [56]:
# Min max normalization.  
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
X = sc.fit_transform(X)


#Split estratificado 80-20 entre datos de entreanmiento y de testeo
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

In [57]:
#MLP Regressor
#from sklearn.neural_network import MLPRegressor
#regr = MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',activation='logistic').fit(X_train, y_train)

#Gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
regr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, 
                                 random_state=0,loss='squared_error').fit(X_train, y_train)

In [58]:
#Test
from sklearn.metrics import mean_squared_log_error
#regr.score(X_test, y_test)
y_pred = regr.predict(X_test)
print(y_test,y_pred)

for i in range(0,len(y_pred)):
    if y_pred[i] < 0:
        y_pred[i]=0

print(mean_squared_log_error(y_test,y_pred))
print(regr.feature_importances_)

1702631      99.466
2062195     962.000
2359340     521.000
515435        0.000
2347645     459.000
             ...   
738129        0.000
1068217    1271.000
1788578      60.471
2935765       2.000
1618553       0.000
Name: sales, Length: 567746, dtype: float64 [ 34.45078257 253.86437109 860.74668377 ... 337.0813111   -5.78959528
 -72.27787151]
4.792218220177323
[1.50979536e-01 5.04060592e-01 3.44925934e-01 3.39376800e-05]


## Results
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,                            random_state=0,loss='squared_error')  - 11.481437551723506
- GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=1,random_state=0,loss='squared_error') - Negative values
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, max_depth=1,random_state=0,loss='squared_error') - 10.742646071889427
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, max_depth=1,                            random_state=0,loss='squared_error').fit(X_train, y_train) -  10.513907932848417.  store_nbr, family, and discount  
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.2,random_state=0,loss='squared_error') - 4.851900010933088
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=1.0,random_state=0,loss='squared_error') - 5.314085562035126
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.5, random_state=0,loss='squared_error') - 5.035771953379746
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, random_state=0,loss='squared_error') - 4.970273218735189
- store_nbr, family, discount, and is_holiday. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, random_state=0,loss='squared_error') - 4.970273218735189
