# Training of Model

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
trainD = pd.read_csv('Data/train.csv', parse_dates=['date'])
testD = pd.read_csv('Data/test.csv', parse_dates=['date'])
isHoliday = pd.read_csv('Data/holidays_events.csv', parse_dates=['date'])

In [3]:
trainD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [4]:
isHoliday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


## Filters Earthquake dates

In [5]:
trainDWOEarthQuake = trainD[(trainD['date'] < '2016-04-16') | (trainD['date'] >= '2016-07-16')]

## Adds Holiday column - Dropped as is not taken into account in the training

In [6]:
#Drops the rows related to holidays that were transferred
isHoliday = isHoliday.drop(isHoliday[isHoliday.transferred == True].index)

In [7]:
#New column which assigns 1 if the date is holiday or 0 if is not.
#trainDWOEarthQuake = trainDWOEarthQuake.assign(is_holiday=0)

#Makes a list with the ordered dates in holidays
#isHoliday = isHoliday.sort_values('date',ascending=True)
#holiday_dates = isHoliday['date'].to_list()

#i = 0

#for index, row in trainDWOEarthQuake.iterrows():
#    if row['date'].strftime('%Y-%m-%d') > holiday_dates[i].strftime('%Y-%m-%d'):
#        i+=1
#    if row['date'].strftime('%Y-%m-%d') == holiday_dates[i].strftime('%Y-%m-%d') and i < len(holiday_dates):
#        trainDWOEarthQuake['is_holiday'].loc[index]=1

## Adds store "type" column

In [8]:
stores = pd.read_csv('Data/stores.csv')
stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   store_nbr  54 non-null     int64 
 1   city       54 non-null     object
 2   state      54 non-null     object
 3   type       54 non-null     object
 4   cluster    54 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.2+ KB


In [9]:
trainDWOEarthQuake['type']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['type'].to_dict())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDWOEarthQuake['type']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['type'].to_dict())


## Adds store "city" column

In [10]:
trainDWOEarthQuake['city']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['city'].to_dict())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDWOEarthQuake['city']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['city'].to_dict())


## Adds "cluster"

In [11]:
trainDWOEarthQuake['cluster']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['cluster'].to_dict())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDWOEarthQuake['cluster']=trainDWOEarthQuake['store_nbr'].map(stores.set_index('store_nbr')['cluster'].to_dict())


## Adds "oil"

In [12]:
oil = pd.read_csv('Data/oil.csv',parse_dates=['date'])
oil = oil.fillna(method='ffill')
oil = oil.fillna(method='bfill')
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        1218 non-null   datetime64[ns]
 1   dcoilwtico  1218 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.2 KB


In [13]:
trainDWOEarthQuake['oil']=trainDWOEarthQuake['date'].map(oil.set_index('date')['dcoilwtico'].to_dict())        
trainDWOEarthQuake = trainDWOEarthQuake.fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDWOEarthQuake['oil']=trainDWOEarthQuake['date'].map(oil.set_index('date')['dcoilwtico'].to_dict())


In [14]:
trainDWOEarthQuake.info()
print(trainDWOEarthQuake.isnull().sum())
trainDWOEarthQuake.tail(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2838726 entries, 0 to 3000887
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   type         object        
 7   city         object        
 8   cluster      int64         
 9   oil          float64       
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 238.2+ MB
id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
type           0
city           0
cluster        0
oil            0
dtype: int64


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,type,city,cluster,oil
3000878,3000878,2017-08-15,9,MAGAZINES,11.0,0,B,Quito,6,47.57
3000879,3000879,2017-08-15,9,MEATS,449.228,0,B,Quito,6,47.57
3000880,3000880,2017-08-15,9,PERSONAL CARE,522.0,11,B,Quito,6,47.57
3000881,3000881,2017-08-15,9,PET SUPPLIES,6.0,0,B,Quito,6,47.57
3000882,3000882,2017-08-15,9,PLAYERS AND ELECTRONICS,6.0,0,B,Quito,6,47.57
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,B,Quito,6,47.57
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,B,Quito,6,47.57
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,B,Quito,6,47.57
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8,B,Quito,6,47.57
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0,B,Quito,6,47.57


In [15]:
#trainDWOEarthQuake.to_csv('train_sliced.csv')

## Encoding

In [16]:
#Categorical encoding for variables with categories

#family
Xaux = trainDWOEarthQuake['family'].astype('category').cat.codes
trainDWOEarthQuake['family'] = Xaux

#type
Xaux = trainDWOEarthQuake['type'].astype('category').cat.codes
trainDWOEarthQuake['type'] = Xaux

#city
Xaux = trainDWOEarthQuake['city'].astype('category').cat.codes
trainDWOEarthQuake['city'] = Xaux

In [17]:
#New column day of year type int.  (Month-1)*31 + day
#trainDWOEarthQuake['month']=trainDWOEarthQuake.apply(lambda row: (int(row['date'].strftime('%m'))-1)*31 + int(row['date'].strftime('%d')),axis=1)


## Final training (X) and result (y) data

In [19]:
y = trainDWOEarthQuake['sales']
X = trainDWOEarthQuake.drop(['id','date','sales'],axis=1)

# X = trainDWOEarthQuake.drop(['id','date'],axis=1)
# X_aux = X.loc[X['family']==1]
# y = X_aux['sales']
# X = X_aux.drop(['family','sales'],axis=1)

## Training model

In [20]:
# Min max normalization.  
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
X = sc.fit_transform(X)


#Split estratificado 80-20 entre datos de entreanmiento y de testeo
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

#y_train = y_train.to_numpy()
#X_train = np.concatenate((X_train, y_train[:, None]), axis=1)
#X_train = pd.DataFrame(X_train, columns = ['store_nbr', 'onpromotion', 'type', 'city', 'cluster', 'oil','y'])
#X_train

In [21]:
#MLP Regressor
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',activation='logistic').fit(X_train, y_train)

#Gradient boosting regressor
#from sklearn.ensemble import GradientBoostingRegressor
#regr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, 
#                                random_state=0,loss='squared_error').fit(X_train, y_train)

#Auto arima
# from statsforecast.core import StatsForecast
# from statsforecast.models import auto_arima

# models = [(auto_arima, 1)]

# fcst = StatsForecast(
#      X_train, #your data 
#      models=[auto_arima], 
#      freq='D', # frequency of your data
#      n_jobs=7, # you can also define the number of cores used for parallelizing
# )




In [22]:
#Test
from sklearn.metrics import mean_squared_log_error
#regr.score(X_test, y_test)
y_pred = regr.predict(X_test)
print(y_test,y_pred)

for i in range(0,len(y_pred)):
    if y_pred[i] < 0:
        y_pred[i]=0

print(mean_squared_log_error(y_test,y_pred))
print(regr.feature_importances_)

1737999       9.000
671127       10.000
25404         0.000
1165714      83.000
450993        0.000
             ...   
2570631    1207.568
464156        7.000
31743         0.000
748471        0.000
1527250     254.000
Name: sales, Length: 567746, dtype: float64 [ 95.0497868  407.69888751  19.92911649 ... 571.59699995 151.8986317
 333.78432587]
7.379342660020622


AttributeError: 'MLPRegressor' object has no attribute 'feature_importances_'

## Results
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,                            random_state=0,loss='squared_error')  - 11.481437551723506
- GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=1,random_state=0,loss='squared_error') - Negative values
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, max_depth=1,random_state=0,loss='squared_error') - 10.742646071889427
- GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, max_depth=1,                            random_state=0,loss='squared_error').fit(X_train, y_train) -  10.513907932848417.  store_nbr, family, and discount  
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.2,random_state=0,loss='squared_error') - 4.851900010933088
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=1.0,random_state=0,loss='squared_error') - 5.314085562035126
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.5, random_state=0,loss='squared_error') - 5.035771953379746
- store_nbr, family, and discount. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, random_state=0,loss='squared_error') - 4.970273218735189
- store_nbr, family, discount, and is_holiday. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, random_state=0,loss='squared_error') - 4.970273218735189
- store_nbr, family, discount, and type. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, random_state=0,loss='squared_error') - 4.641656860905568
- store_nbr, family, discount, type, city, cluster. Negative values are taken as zero. GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, random_state=0,loss='squared_error') - 4.979573595403314
- store_nbr, family, discount, type, city, cluster. Negative values are taken as zero. MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',activation='logistic') - 4.979573595403314
