In [39]:
import pickle
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from typing import List


In [105]:
# Load the raw sales rows and the per-store table, left-join them on 'Store',
# and keep only the rows where Open == 1 (copied so later column assignments
# write to an independent frame).
ross_df = pd.read_csv('data/train.csv', low_memory=False)
store_df = pd.read_csv('data/store.csv')
merged_df = ross_df.merge(store_df, how='left', on='Store')
df = merged_df[merged_df["Open"] == 1].copy()

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so these artifacts must come from a trusted source.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been loaded (the bare open(...) calls used before leaked it).

# Load the model:
with open('src/model.pkl', 'rb') as pkl_file:
    model_sales = pickle.load(pkl_file)

# Load the scaler:
with open('src/scaler_model.pkl', 'rb') as pkl_file:
    scaler_model = pickle.load(pkl_file)

# Load the encoder:
with open('src/encoder.pkl', 'rb') as pkl_file:
    encoder_model = pickle.load(pkl_file)


# Feature-engineering helper functions
def split_date(df):
    """Parse ``Date`` and add calendar-part columns to ``df`` in place.

    The ``Date`` column is converted with ``pd.to_datetime`` and written back.
    ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (the ISO calendar week)
    are then derived from it, in that order. Nothing is returned.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates

    calendar = parsed_dates.dt
    derived_columns = {
        'Year': calendar.year,
        'Month': calendar.month,
        'Day': calendar.day,
        'WeekOfYear': calendar.isocalendar().week,
    }
    for column_name, values in derived_columns.items():
        df[column_name] = values
    
def comp_months(df):
    """Add ``CompetitionOpen`` to ``df`` in place.

    The value is the number of months between the row's ``Year``/``Month`` and
    ``CompetitionOpenSinceYear``/``CompetitionOpenSinceMonth``. Negative
    results are floored at 0 and missing results (NaN in either "since"
    column) are replaced with 0. Nothing is returned.
    """
    months_open = (
        12 * (df['Year'] - df['CompetitionOpenSinceYear'])
        + (df['Month'] - df['CompetitionOpenSinceMonth'])
    )
    # Vectorised floor-at-zero; clip leaves NaN untouched, so fillna still
    # decides what a missing opening date maps to.
    df['CompetitionOpen'] = months_open.clip(lower=0).fillna(0)
    
# Month number -> abbreviation used when matching against PromoInterval.
# September is spelled 'Sept' (presumably matching the dataset's spelling).
# Built once at import time instead of once per row.
_MONTH_ABBREVIATIONS = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its ``PromoInterval``, else 0.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) providing
            ``PromoInterval`` (comma-separated month abbreviations, or a
            missing value), ``Promo2Open`` and ``Month`` (1-12).

    Returns:
        1 when ``Promo2Open`` is truthy and the abbreviation of ``Month``
        appears in ``PromoInterval``; 0 otherwise, including when the interval
        is missing or the row is malformed.
    """
    try:
        interval = row['PromoInterval']
        if not isinstance(interval, str):
            # Missing schedule (NaN/None): handled explicitly rather than by
            # letting ``.split`` raise and swallowing the exception.
            return 0
        if not row['Promo2Open']:
            return 0
        return 1 if _MONTH_ABBREVIATIONS[row['Month']] in interval.split(',') else 0
    except (LookupError, TypeError, ValueError):
        # Best-effort fallback kept from the original behaviour: a missing
        # column, an unknown month number or an ambiguous truth value yields 0.
        return 0

def promo_cols(df):
    """Add ``Promo2Open`` and ``IsPromo2Month`` to ``df`` in place.

    ``Promo2Open`` is the number of months since Promo2 started, floored at 0,
    with missing values set to 0, and multiplied by the ``Promo2`` column.
    ``IsPromo2Month`` is the per-row result of ``check_promo_month``, also
    multiplied by ``Promo2``. Nothing is returned.
    """
    def _floor_at_zero(value):
        # Element-wise on purpose: keeps the exact NaN handling of the
        # original map-based implementation.
        return 0 if value < 0 else value

    # Months since Promo2 started: 12 per elapsed year plus the elapsed weeks
    # converted to months (7 days per week, 30.5 days per month).
    years_elapsed = df.Year - df.Promo2SinceYear
    weeks_elapsed = df.WeekOfYear - df.Promo2SinceWeek
    df['Promo2Open'] = 12 * years_elapsed + weeks_elapsed * 7 / 30.5
    df['Promo2Open'] = df['Promo2Open'].map(_floor_at_zero).fillna(0) * df['Promo2']

    # Whether a new round of promotions starts in the row's month; must run
    # after Promo2Open is set because check_promo_month reads it.
    promo_month_flags = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = promo_month_flags * df['Promo2']
    
# Column lists:
# Columns kept from the engineered frame: the model inputs plus 'Sales',
# which is later shown next to the predictions.
input_cols = [
    'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
    'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpen',
    'Day', 'Month', 'Year', 'WeekOfYear',
    'Promo2', 'Promo2Open', 'IsPromo2Month',
    'Sales',
]

# Columns passed to the scaler, in this order.
numeric_cols = [
    'Store', 'Promo', 'SchoolHoliday',
    'CompetitionDistance', 'CompetitionOpen',
    'Promo2', 'Promo2Open', 'IsPromo2Month',
    'Day', 'Month', 'Year', 'WeekOfYear',
]

# Columns passed to the one-hot encoder, in this order.
categorical_cols = [
    'DayOfWeek',
    'StateHoliday',
    'StoreType',
    'Assortment',
]


# TODO: wrap the steps below in a function, e.g.
#def predict_sales(input_values: List[float]):

# Add the date, competition and Promo2 features to df (each helper mutates
# df in place and returns None).
split_date(df)
comp_months(df)
promo_cols(df)

# Fill value for stores with no recorded competitor distance.
# NOTE(review): presumably the largest CompetitionDistance seen in training - confirm.
max_distance = 75860

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['CompetitionDistance']: the chained in-place form operates on a temporary
# object, emits a warning on recent pandas versions and does not update df
# under Copy-on-Write.
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(max_distance)




In [106]:
# Preview the first three rows after the feature-engineering step.
df.head(3)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,Promo2Open,IsPromo2Month
0,1,5,2015-07-31,5263,555,1,1,0,1,c,...,,,,2015,7,31,31,82.0,0.0,0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,...,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,31,92.0,64.131148,1
2,3,5,2015-07-31,8314,821,1,1,0,1,a,...,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31,103.0,51.901639,1


In [107]:
# Keep only the columns listed in input_cols. The explicit copy makes df an
# independent frame, so the later column assignments (scaling, encoding,
# predictions) do not write into a slice and trigger SettingWithCopyWarning.
df = df[input_cols].copy()  #.reset_index(drop=True)

df.columns

Index(['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen',
       'Day', 'Month', 'Year', 'WeekOfYear', 'Promo2', 'Promo2Open',
       'IsPromo2Month', 'Sales'],
      dtype='object')

In [108]:
# Preview the first five rows of the column-reduced frame.
df.head(5)

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpen,Day,Month,Year,WeekOfYear,Promo2,Promo2Open,IsPromo2Month,Sales
0,1,5,1,0,1,c,a,1270.0,82.0,31,7,2015,31,0,0.0,0,5263
1,2,5,1,0,1,a,a,570.0,92.0,31,7,2015,31,1,64.131148,1,6064
2,3,5,1,0,1,a,a,14130.0,103.0,31,7,2015,31,1,51.901639,1,8314
3,4,5,1,0,1,c,c,620.0,70.0,31,7,2015,31,0,0.0,0,13995
4,5,5,1,0,1,a,a,29910.0,3.0,31,7,2015,31,0,0.0,0,4822


In [109]:
# List the distinct StateHoliday values present in the data.
df.StateHoliday.unique()

array(['0', 'a', 'b', 'c'], dtype=object)

In [110]:
# List the distinct DayOfWeek values present in the data.
df.DayOfWeek.unique()

array([5, 4, 3, 2, 1, 7, 6], dtype=int64)

In [111]:
# Show dtypes and non-null counts per column; per the output, WeekOfYear is
# the nullable UInt32 dtype while the other integer columns are int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844392 entries, 0 to 1017190
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Store                844392 non-null  int64  
 1   DayOfWeek            844392 non-null  int64  
 2   Promo                844392 non-null  int64  
 3   StateHoliday         844392 non-null  object 
 4   SchoolHoliday        844392 non-null  int64  
 5   StoreType            844392 non-null  object 
 6   Assortment           844392 non-null  object 
 7   CompetitionDistance  844392 non-null  float64
 8   CompetitionOpen      844392 non-null  float64
 9   Day                  844392 non-null  int64  
 10  Month                844392 non-null  int64  
 11  Year                 844392 non-null  int64  
 12  WeekOfYear           844392 non-null  UInt32 
 13  Promo2               844392 non-null  int64  
 14  Promo2Open           844392 non-null  float64
 15  IsPromo2Month   

In [87]:
# Show which scaler was unpickled (a MinMaxScaler, per the output).
print(scaler_model)

MinMaxScaler()


In [98]:
# Show the unpickled encoder's settings (per the output: OneHotEncoder with
# handle_unknown='ignore' and sparse=False).
encoder_model

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [88]:
from sklearn import preprocessing

In [89]:
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
#from sklearn.preprocessing import 

In [90]:
# Preview the numeric columns before they are scaled.
df[numeric_cols].head(3)

Unnamed: 0,Store,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpen,Promo2,Promo2Open,IsPromo2Month,Day,Month,Year,WeekOfYear
0,1,1,1,1270.0,82.0,0,0.0,0,31,7,2015,31
1,2,1,1,570.0,92.0,1,64.131148,1,31,7,2015,31
2,3,1,1,14130.0,103.0,1,51.901639,1,31,7,2015,31


In [91]:
# Inspect the numeric columns as a NumPy array.
# NOTE(review): the output has dtype=object, presumably because WeekOfYear is
# a nullable UInt32 column mixed with int64/float64 columns - confirm.
df[numeric_cols].values

array([[1, 1, 1, ..., 7, 2015, 31],
       [2, 1, 1, ..., 7, 2015, 31],
       [3, 1, 1, ..., 7, 2015, 31],
       ...,
       [769, 0, 1, ..., 1, 2013, 1],
       [948, 0, 1, ..., 1, 2013, 1],
       [1097, 0, 1, ..., 1, 2013, 1]], dtype=object)

In [71]:
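# Disabled preview of the scaled numeric features; the array shown below is
# left over from an earlier run of this cell.
# NOTE(review): in that output the Year column (second to last) is -1.0 for
# the 2013 rows, i.e. outside [0, 1]; presumably the pickled scaler was fitted
# on data that did not include 2013 - confirm.
#from sklearn import preprocessing
#scaler_model.transform(df[numeric_cols])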

array([[ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         5.45454545e-01,  1.00000000e+00,  5.88235294e-01],
       [ 8.97666068e-04,  1.00000000e+00,  1.00000000e+00, ...,
         5.45454545e-01,  1.00000000e+00,  5.88235294e-01],
       [ 1.79533214e-03,  1.00000000e+00,  1.00000000e+00, ...,
         5.45454545e-01,  1.00000000e+00,  5.88235294e-01],
       ...,
       [ 6.89407540e-01,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00, -1.00000000e+00,  0.00000000e+00],
       [ 8.50089767e-01,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00, -1.00000000e+00,  0.00000000e+00],
       [ 9.83842011e-01,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00, -1.00000000e+00,  0.00000000e+00]])

In [92]:

# Names of the one-hot columns the encoder produces for categorical_cols.
encoded_cols = list(encoder_model.get_feature_names_out(categorical_cols))

# Scale the numeric columns and append the one-hot columns to df.
# The guard makes the cell safe to re-run: without it a second execution
# would scale the already-scaled values again. The presence of the encoded
# columns is used as the marker that the transform has been applied.
if not set(encoded_cols).issubset(df.columns):
    # Work on an independent copy so the assignments below never write into a
    # slice of another frame (avoids SettingWithCopyWarning).
    df = df.copy()
    df[numeric_cols] = scaler_model.transform(df[numeric_cols])
    df[encoded_cols] = encoder_model.transform(df[categorical_cols])

In [93]:
# Preview the frame after scaling and one-hot encoding.
df.head(3)

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpen,Day,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,0.0,5,1.0,0,1.0,c,a,0.016482,0.059163,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.000898,5,1.0,0,1.0,a,a,0.007252,0.066378,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.001795,5,1.0,0,1.0,a,a,0.18605,0.074315,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [94]:
# Feature columns passed to the model, in this order: the numeric columns
# (same order as numeric_cols) followed by the one-hot columns for DayOfWeek,
# StateHoliday, StoreType and Assortment.
var_final = (
    [
        'Store', 'Promo', 'SchoolHoliday',
        'CompetitionDistance', 'CompetitionOpen',
        'Promo2', 'Promo2Open', 'IsPromo2Month',
        'Day', 'Month', 'Year', 'WeekOfYear',
    ]
    # DayOfWeek_1.0 ... DayOfWeek_7.0
    + [f'DayOfWeek_{day}.0' for day in range(1, 8)]
    + ['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']
    + ['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d']
    + ['Assortment_a', 'Assortment_b', 'Assortment_c']
)

In [95]:
# Run the loaded model on the var_final feature columns and store the
# predictions in a new 'preds' column, then preview the result.
df['preds'] = model_sales.predict(df[var_final])
df.head(3)

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpen,Day,...,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,preds
0,0.0,5,1.0,0,1.0,c,a,0.016482,0.059163,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,5466.051758
1,0.000898,5,1.0,0,1.0,a,a,0.007252,0.066378,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6043.572754
2,0.001795,5,1.0,0,1.0,a,a,0.18605,0.074315,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8324.673828


In [101]:
# Preview the exact feature matrix that is passed to the model.
df[var_final].head(3)

Unnamed: 0,Store,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpen,Promo2,Promo2Open,IsPromo2Month,Day,Month,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,0.0,1.0,1.0,0.016482,0.059163,0.0,0.0,0.0,1.0,0.545455,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.000898,1.0,1.0,0.007252,0.066378,1.0,0.89071,1.0,1.0,0.545455,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.001795,1.0,1.0,0.18605,0.074315,1.0,0.720856,1.0,1.0,0.545455,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [102]:
# Spot-check a subset of the model features (the full var_final preview is
# truncated with '...' in its output).
df[['Promo2Open', 'IsPromo2Month', 'Day',
       'Month', 'Year', 'WeekOfYear', 'DayOfWeek_1.0', 'DayOfWeek_2.0']].head(3)

Unnamed: 0,Promo2Open,IsPromo2Month,Day,Month,Year,WeekOfYear,DayOfWeek_1.0,DayOfWeek_2.0
0,0.0,0.0,1.0,0.545455,1.0,0.588235,0.0,0.0
1,0.89071,1.0,1.0,0.545455,1.0,0.588235,0.0,0.0
2,0.720856,1.0,1.0,0.545455,1.0,0.588235,0.0,0.0


In [103]:
# Spot-check the remaining DayOfWeek one-hot columns and the first
# StateHoliday ones.
df[['DayOfWeek_3.0', 'DayOfWeek_4.0', 'DayOfWeek_5.0', 'DayOfWeek_6.0',
       'DayOfWeek_7.0', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b']].head(3)

Unnamed: 0,DayOfWeek_3.0,DayOfWeek_4.0,DayOfWeek_5.0,DayOfWeek_6.0,DayOfWeek_7.0,StateHoliday_0,StateHoliday_a,StateHoliday_b
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [104]:
# Spot-check the last StateHoliday column and the StoreType / Assortment
# one-hot columns.
df[['StateHoliday_c', 'StoreType_a', 'StoreType_b', 'StoreType_c',
       'StoreType_d', 'Assortment_a', 'Assortment_b', 'Assortment_c']].head(3)

Unnamed: 0,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [96]:
# Compare the actual Sales with the model predictions for the first rows.
df[['Sales','preds']].head(3)

Unnamed: 0,Sales,preds
0,5263,5466.051758
1,6064,6043.572754
2,8314,8324.673828


In [78]:
# List every column now present in df (inputs, one-hot columns and 'preds').
df.columns

Index(['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen',
       'Day', 'Month', 'Year', 'WeekOfYear', 'Promo2', 'Promo2Open',
       'IsPromo2Month', 'Sales', 'DayOfWeek_1.0', 'DayOfWeek_2.0',
       'DayOfWeek_3.0', 'DayOfWeek_4.0', 'DayOfWeek_5.0', 'DayOfWeek_6.0',
       'DayOfWeek_7.0', 'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b',
       'StateHoliday_c', 'StoreType_a', 'StoreType_b', 'StoreType_c',
       'StoreType_d', 'Assortment_a', 'Assortment_b', 'Assortment_c', 'preds'],
      dtype='object')

In [112]:
# Show the one-hot column names reported by the encoder.
encoded_cols

['DayOfWeek_1.0',
 'DayOfWeek_2.0',
 'DayOfWeek_3.0',
 'DayOfWeek_4.0',
 'DayOfWeek_5.0',
 'DayOfWeek_6.0',
 'DayOfWeek_7.0',
 'StateHoliday_0',
 'StateHoliday_a',
 'StateHoliday_b',
 'StateHoliday_c',
 'StoreType_a',
 'StoreType_b',
 'StoreType_c',
 'StoreType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c']

In [113]:
# Preview the raw categorical columns that feed the one-hot encoder.
df[categorical_cols].head(3)

Unnamed: 0,DayOfWeek,StateHoliday,StoreType,Assortment
0,5,0,c,a
1,5,0,a,a
2,5,0,a,a
