In [3]:
import modulo_conn_sql as mcq
import numpy as np
import pandas as pd 
import datetime 
from pandas.tseries.offsets import MonthEnd
from pandas.tseries.offsets import MonthBegin

import sqlalchemy as sa
import urllib

from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from xgboost import XGBRegressor

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from livelossplot import PlotLossesKeras
import keras.optimizers as opts

from pickle import dump
from pickle import load
from keras.models import load_model

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def querySQL(query, parametros):
    """Run a parameterized query against the SQL Server database and return a DataFrame.

    Args:
        query: SQL text (or ODBC ``{CALL ...}``) using ``?`` placeholders.
        parametros: Parameter values forwarded unchanged to ``cursor.execute``.

    Returns:
        pandas.DataFrame whose columns are named after the first result set's
        ``cursor.description``. Rows from any further result sets are stacked
        underneath. If no rows come back, an empty DataFrame with those column
        names is returned instead of failing.
    """
    cursor = conectarSQL()
    try:
        cursor.execute(query, parametros)
        # Column names are taken from the first result set
        names = [columna[0] for columna in cursor.description]

        # Gather the rows of every result set into one flat list
        filas = []
        rows = cursor.fetchall()
        while rows:
            filas.extend(rows)
            rows = cursor.fetchall() if cursor.nextset() else None
    finally:
        # The cursor is released whether or not the query succeeded
        if cursor is not None:
            cursor.close()

    # An empty result used to fail on `shape[1]`; return a well-formed empty frame
    if not filas:
        return pd.DataFrame(columns=names)

    # Two-dimensional array: one row per record, one column per field
    return pd.DataFrame(np.array(filas), columns=names)

def conectarSQL():
    """Open a connection through ``modulo_conn_sql`` and return its cursor."""
    return mcq.ConexionSQL().getCursor()

def historical_proportion( dataset, array_group_top, array_group_bottom, medida_target, group_target, name_proportion  ):
    """Average ratio between a fine-grained mean and its coarser-level mean.

    Args:
        dataset (pandas.DataFrame): historical data.
        array_group_top (list): column names of the HIGHEST hierarchy level used
            to group; also the keys used to join both levels.
        array_group_bottom (list): column names of the LOWEST hierarchy level
            used to group.
        medida_target (str): column holding the measure the proportion is
            computed on.
        group_target (list): column names the final proportion is reported by.
        name_proportion (str): label of the dimension; the output column is
            named ``'proportion_' + name_proportion``.

    Returns:
        pandas.DataFrame with the ``group_target`` columns plus the proportion
        column.
    """
    media_top = dataset.groupby(array_group_top)[medida_target].mean().reset_index()
    media_bottom = dataset.groupby(array_group_bottom)[medida_target].mean().reset_index()

    # After the join, `_x` is the bottom-level mean and `_y` the top-level mean
    combinado = media_bottom.merge(media_top, how='inner', on=array_group_top)
    combinado['proportion'] = combinado[medida_target + '_x'] / combinado[medida_target + '_y']

    resultado = combinado.groupby(group_target)['proportion'].mean().reset_index()
    return resultado.rename(columns={'proportion': 'proportion_' + name_proportion})




In [2]:
pais = 'Colombia'
inicioHistoria = '2014-01-01'
finHistoria = '2022-05-31'

# Daily dispatched-volume history per plant
despachosSQL = querySQL("{CALL SCAC_AP8_BaseForecast (?,?,?)}", (pais, inicioHistoria, finHistoria))
despachosSQL['year_month'] = despachosSQL.FechaEntrega.dt.to_period('M')
despachosSQL['totalEntregado'] = despachosSQL['totalEntregado'].astype(float)

# Geographic information per plant.
# `(pais,)` needs the trailing comma: `(pais)` is just the string itself, not a
# one-element parameter tuple.
nombre_cluster = querySQL( "SELECT Pais, Centro, Ciudad_Cluster as Ciudad, [Desc Cluster] as Cluster FROM SCAC_AT1_NombreCluster where Pais = ?" , (pais,) )

despachosSQL = pd.merge(despachosSQL, nombre_cluster, left_on='Planta', right_on='Centro', how='left')


### Fase 1. Cálculo de variables predictoras

In [3]:
def calculate_proportion(df_total, months, denominador, numerador, columna_target, grupo_final, name_column_return):
    """Historical proportions for every month in ``df_total``, from a look-back window.

    For each distinct ``year_month`` value, the rows whose ``FechaEntrega`` falls
    in the ``months`` calendar months before the first day of that month are
    passed to ``historical_proportion``. The result is tagged with that month's
    ``Año`` and ``Mes``.

    Args:
        df_total (pandas.DataFrame): history with at least ``year_month``,
            ``Año``, ``Mes`` and ``FechaEntrega`` plus the grouping columns.
        months (int): size of the look-back window, in months.
        denominador (list): top-level grouping columns.
        numerador (list): bottom-level grouping columns.
        columna_target (str): measure column.
        grupo_final (list): columns the proportion is reported by.
        name_column_return (str): suffix of the resulting ``proportion_*`` column.

    Returns:
        pandas.DataFrame with one block of rows per month that had history, NaN
        replaced by 0 and a fresh 0..n-1 index. Empty DataFrame when no month
        has history.
    """
    piezas = []

    for periodo in df_total['year_month'].unique():
        # Filter once per month instead of once per attribute
        filas_periodo = df_total[df_total['year_month'] == periodo]
        mes = filas_periodo['Mes'].unique()[0]
        año = filas_periodo['Año'].unique()[0]

        inicio_mes = datetime.datetime(año, mes, 1)
        en_ventana = (
            (df_total['FechaEntrega'] >= inicio_mes - MonthBegin(months))
            & (df_total['FechaEntrega'] < inicio_mes)
        )
        df_param = df_total[en_ventana]
        if len(df_param) == 0:
            continue

        detalle = historical_proportion(df_param, denominador, numerador, columna_target, grupo_final, name_column_return)
        detalle['Año'] = año
        detalle['Mes'] = mes

        # The original checked the length of `detalle` rather than the accumulator,
        # so an empty detail wiped out everything collected so far. Empty details
        # are now skipped and the accumulated months are kept.
        if len(detalle) > 0:
            piezas.append(detalle)

    if not piezas:
        return pd.DataFrame()

    # Single concat at the end instead of growing a frame inside the loop
    return pd.concat(piezas).fillna(0).reset_index(drop=True)
    
def media_diaria(df_total, months):
    """Average daily volume per plant over a look-back window, for every month.

    For each distinct ``year_month`` in ``df_total``, the rows dated within the
    ``months`` calendar months before that month are summarised per
    (``Año``, ``Mes``, ``Planta``) as ``sum(totalEntregado) / max(DiasOperativos)``.
    Those monthly values are then averaged per plant.

    Args:
        df_total (pandas.DataFrame): history with ``year_month``, ``Año``,
            ``Mes``, ``Planta``, ``FechaEntrega``, ``totalEntregado`` and
            ``DiasOperativos``.
        months (int): size of the look-back window, in months.

    Returns:
        pandas.DataFrame with columns ``Planta``, ``media<months>``, ``Año``,
        ``Mes`` and a fresh 0..n-1 index.
    """
    frames = []
    # Last frame produced; returned as-is when no month has any history
    ultimo = pd.DataFrame()

    for periodo in df_total['year_month'].unique():
        # Filter once per month instead of once per attribute
        filas_periodo = df_total[df_total['year_month'] == periodo]
        mes = filas_periodo['Mes'].unique()[0]
        año = filas_periodo['Año'].unique()[0]

        inicio_mes = datetime.datetime(año, mes, 1)
        df_param = df_total[
            (df_total['FechaEntrega'] >= inicio_mes - MonthBegin(months))
            & (df_total['FechaEntrega'] < inicio_mes)
        ]

        por_mes = df_param.groupby(
            ['Año', 'Mes', 'Planta']
        ).agg(
            {'totalEntregado': 'sum', 'DiasOperativos': 'max'}
        ).reset_index()

        # Daily mean of each historical month: volume over operating days
        por_mes['media'] = por_mes['totalEntregado'] / por_mes['DiasOperativos']

        media_planta = por_mes.groupby(['Planta'])['media'].mean().reset_index()
        media_planta['Año'] = año
        media_planta['Mes'] = mes
        media_planta = media_planta.rename(columns={'media': 'media' + str(months)})

        ultimo = media_planta
        if len(media_planta) > 0:
            frames.append(media_planta)

    if not frames:
        return ultimo.reset_index(drop=True)

    # Single concat at the end instead of growing a frame inside the loop
    return pd.concat(frames).reset_index(drop=True)


def volumen_mes(df_total):
    """Total delivered volume per year, month and city.

    Args:
        df_total (pandas.DataFrame): history with ``Año``, ``Mes``, ``Ciudad``
            and ``totalEntregado``.

    Returns:
        pandas.DataFrame with columns ``Año``, ``Mes``, ``Ciudad`` and
        ``volumen_ciudad`` (the summed ``totalEntregado``).
    """
    claves = ['Año', 'Mes', 'Ciudad']
    totales = df_total.groupby(claves)[['totalEntregado']].sum().reset_index()
    return totales.rename(columns={'totalEntregado': 'volumen_ciudad'})
    

### Fase 2. Construcción del dataset

In [4]:
# Look-back windows (in months) shared by every history-based feature
VENTANAS_MESES = (1, 2, 3, 6)

# Proportion of each relative week within its plant-month, one table per window
proporciones_semana = {
    _meses: calculate_proportion(despachosSQL,
                                 _meses,
                                 ['Año', 'Mes', 'Planta'],
                                 ['Año', 'Mes', 'Planta', 'Semana_Relativa'],
                                 'totalEntregado',
                                 ['Planta', 'Semana_Relativa'],
                                 'semana' + str(_meses))
    for _meses in VENTANAS_MESES
}

# Proportion of each weekday within its plant-month, one table per window
proporciones_dia_semana = {
    _meses: calculate_proportion(despachosSQL,
                                 _meses,
                                 ['Año', 'Mes', 'Planta'],
                                 ['Año', 'Mes', 'Planta', 'DiaSemana'],
                                 'totalEntregado',
                                 ['Planta', 'DiaSemana'],
                                 'dia_semana' + str(_meses))
    for _meses in VENTANAS_MESES
}

# Average daily volume per plant, one table per window
medias_diarias = {_meses: media_diaria(despachosSQL, _meses) for _meses in VENTANAS_MESES}

# The individual names are kept so that other cells can still refer to them
proportion_week_1, proportion_week_2, proportion_week_3, proportion_week_6 = (
    proporciones_semana[_meses] for _meses in VENTANAS_MESES)
proportion_weekday_1, proportion_weekday_2, proportion_weekday_3, proportion_weekday_6 = (
    proporciones_dia_semana[_meses] for _meses in VENTANAS_MESES)
media_diaria1, media_diaria2, media_diaria3, media_diaria6 = (
    medias_diarias[_meses] for _meses in VENTANAS_MESES)

volumen_ciudad_mes = volumen_mes(despachosSQL)

# Left-join every feature table onto the daily history, in window order
df = despachosSQL
for _meses in VENTANAS_MESES:
    df = pd.merge(df, proporciones_semana[_meses], on=['Año', 'Mes', 'Planta', 'Semana_Relativa'], how='left')

for _meses in VENTANAS_MESES:
    df = pd.merge(df, proporciones_dia_semana[_meses], on=['Año', 'Mes', 'Planta', 'DiaSemana'], how='left')

for _meses in VENTANAS_MESES:
    df = pd.merge(df, medias_diarias[_meses], on=['Año', 'Mes', 'Planta'], how='left')

df = pd.merge(df, volumen_ciudad_mes, on=['Año', 'Mes', 'Ciudad'], how='left')

# Binary flag: 1 for plants of type 'Central', 0 otherwise
df['PlantaCentral'] = np.where(df['TipoPlanta'] == 'Central', 1, 0)

# Rows without history (no match in the left joins) get 0 for the missing features
df = df.fillna(0)

columnas = ['Año', 'Mes', 'Planta','Semana_Relativa', 'DiaSemana',
            'proportion_semana1', 'proportion_semana2',
            'proportion_semana3', 'proportion_semana6',
            'proportion_dia_semana1','proportion_dia_semana2',
            'proportion_dia_semana3', 'proportion_dia_semana6',
            'media1', 'media2', 'media3', 'media6',
            'PlantaCentral', 'volumen_ciudad', 'totalEntregado']

df = df[columnas]

#### Exportación a CSV para tratamiento sin conexión a la base SQL

In [8]:
# Offline export of the training frame. The call sits inside a string literal, so
# running this cell does NOT write the file; remove the surrounding quotes to export.
"""

df.to_csv('../datos/df_datatraining.csv')

"""

In [5]:
df.tail()

Unnamed: 0,Año,Mes,Planta,Semana_Relativa,DiaSemana,proportion_semana1,proportion_semana2,proportion_semana3,proportion_semana6,proportion_dia_semana1,proportion_dia_semana2,proportion_dia_semana3,proportion_dia_semana6,media1,media2,media3,media6,PlantaCentral,volumen_ciudad,totalEntregado
83327,2016,4,F080,2,7,0.94688,1.042885,1.042885,1.042994,0.829913,0.761935,0.852817,0.828349,299.228261,313.75413,308.780531,309.868039,1,106061.75,347.25
83328,2018,8,F093,3,3,0.949919,0.899268,0.855099,0.870402,0.979179,0.973523,1.051423,1.256665,60.663043,60.380435,51.056653,46.061905,1,820.5,8.0
83329,2018,8,FA04,4,3,0.821223,0.870196,0.930866,0.975854,1.152078,0.996258,0.916625,0.928782,147.95,162.545,167.01,180.015456,1,1949.0,19.0
83330,2016,9,FA04,2,2,2.042726,1.370647,1.161077,0.876188,0.113937,0.573909,0.393359,0.566899,61.4375,66.50625,65.004167,68.462731,1,4649.25,6.0
83331,2017,9,F020,2,3,0.887036,1.024707,1.028428,1.062503,0.890281,0.88436,0.962764,1.021829,200.33,192.19625,194.339167,192.081967,1,8042.25,264.5


### Fase 3. Definición y entrenamiento del modelo de regresión

In [6]:
# Label encoder for the plant identifier
le_planta = preprocessing.LabelEncoder()

# Auxiliary frame used to fit the encoder. The explicit copy keeps the assignment
# below from writing into a slice of `df`.
df_encoding_1 = df[['Planta']].copy()

# Fit the encoder and encode the auxiliary column
df_encoding_1['Planta'] = le_planta.fit_transform(df_encoding_1['Planta'])

# Persist the fitted encoder; the `with` block closes the file handle
with open('../label_encoders_scalers/le_planta.pkl', 'wb') as _archivo:
    dump(le_planta, _archivo)

# Min-max scaling of the numeric variables
num_vars = ['Año', 'Mes','Semana_Relativa','DiaSemana', 'media1', 'media2', 'media3', 'media6', 'volumen_ciudad']
values_scaled = df[num_vars].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(values_scaled)

# Persist the fitted scaler
with open('../label_encoders_scalers/min_max_scaler.pkl', 'wb') as _archivo:
    dump(min_max_scaler, _archivo)

# Unscaled columns (proportions, flag, target) side by side with the scaled ones.
# Both parts carry a fresh 0..n-1 index so the column-wise concat aligns row by row.
dataTraining = pd.concat([
    df[['Planta', 'proportion_semana1', 'proportion_semana2', 'proportion_semana3',
       'proportion_semana6', 'proportion_dia_semana1',
       'proportion_dia_semana2', 'proportion_dia_semana3',
       'proportion_dia_semana6', 'PlantaCentral', 'totalEntregado']].reset_index(drop=True),
    pd.DataFrame(x_scaled, columns = num_vars)],
    axis = 1, join='inner')

# Replace the plant code by its integer label
dataTraining['Planta'] = le_planta.transform(dataTraining['Planta'])

In [7]:
dataTraining

Unnamed: 0,Planta,proportion_semana1,proportion_semana2,proportion_semana3,proportion_semana6,proportion_dia_semana1,proportion_dia_semana2,proportion_dia_semana3,proportion_dia_semana6,PlantaCentral,totalEntregado,Año,Mes,Semana_Relativa,DiaSemana,media1,media2,media3,media6,volumen_ciudad
0,0,1.052822,1.126972,1.117298,1.047685,0.911646,1.001112,1.003393,1.019005,1,73.25,0.625,0.000000,0.2,0.333333,0.422626,0.450070,0.436433,0.422681,0.564952
1,0,1.052822,1.126972,1.117298,1.047685,1.222536,1.218601,1.132924,1.039406,1,333.25,0.625,0.000000,0.2,0.500000,0.422626,0.450070,0.436433,0.422681,0.564952
2,0,1.052822,1.126972,1.117298,1.047685,1.287449,1.162513,1.149606,1.243977,1,323.75,0.625,0.000000,0.2,0.833333,0.422626,0.450070,0.436433,0.422681,0.564952
3,0,1.052822,1.126972,1.117298,1.047685,0.629538,0.722859,0.788938,0.794867,1,139.00,0.625,0.000000,0.2,1.000000,0.422626,0.450070,0.436433,0.422681,0.564952
4,0,1.018406,1.046051,1.046661,1.013609,0.900623,0.863519,0.889656,0.849416,1,324.75,0.625,0.000000,0.4,0.166667,0.422626,0.450070,0.436433,0.422681,0.564952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83327,56,0.946880,1.042885,1.042885,1.042994,0.829913,0.761935,0.852817,0.828349,1,347.25,0.250,0.272727,0.2,1.000000,0.309776,0.330711,0.325786,0.331503,0.821563
83328,60,0.949919,0.899268,0.855099,0.870402,0.979179,0.973523,1.051423,1.256665,1,8.00,0.500,0.636364,0.4,0.333333,0.062801,0.063644,0.053868,0.049278,0.006267
83329,64,0.821223,0.870196,0.930866,0.975854,1.152078,0.996258,0.916625,0.928782,1,19.00,0.500,0.636364,0.6,0.333333,0.153165,0.171330,0.176208,0.192584,0.015010
83330,64,2.042726,1.370647,1.161077,0.876188,0.113937,0.573909,0.393359,0.566899,1,6.00,0.250,0.727273,0.2,0.166667,0.063603,0.070101,0.068584,0.073243,0.035928


In [9]:
# Separate the predictor variables from the target variable
y = dataTraining['totalEntregado']
X = dataTraining.drop(columns=['totalEntregado'])

# Fixed seed: without it every run draws a different hold-out set, so the RMSE
# values reported below cannot be reproduced or compared between runs.
SEMILLA_SPLIT = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = SEMILLA_SPLIT)

# One {model name: RMSE} entry is appended per trained model
resultados = []

### Calibración RandomForest

In [13]:
# Candidate values for the hyperparameter being searched
n_estimators = [100, 120, 150, 180]  # number of trees in the random forest, default: 100

# The grid only holds hyperparameters that have a candidate list defined above.
# It used to reference max_features, max_depth, min_samples_split,
# min_samples_leaf and bootstrap, whose definitions were commented out, so the
# cell raised NameError on a fresh kernel. To widen the search, define the
# candidate list and add its entry here.
random_grid = {'n_estimators': n_estimators}

rf = RandomForestRegressor()

# Never ask for more iterations than there are combinations in the grid
n_candidatos = int(np.prod([len(valores) for valores in random_grid.values()]))

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
               n_iter = min(100, n_candidatos), cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(X_train, y_train)

# Best parameters found by the search
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:  7.8min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 10.1min finished


Best Parameters:  {'n_estimators': 180}  



In [20]:
# Random forest with hand-picked hyperparameters, fitted on the training split
modelo_rf = RandomForestRegressor(
    n_estimators=120,
    max_depth=None,
    max_features=1,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
).fit(X_train, y_train)

y_pred = modelo_rf.predict(X_test)

# RMSE on the hold-out split
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

68.0318302327594

In [12]:
# Random forest evaluation run: fixed hyperparameters, fit on the training split,
# RMSE on the hold-out split. No random_state is passed to the estimator here.
modelo_rf = RandomForestRegressor(
    bootstrap= True,
    max_depth= None,
    max_features= 1,
    min_samples_leaf= 1,
    min_samples_split= 2,
    n_estimators= 120)

# fit model
modelo_rf.fit(X_train, y_train)

# predictions for the hold-out split
y_pred = modelo_rf.predict(X_test)

# root mean squared error (displayed as the cell output)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

67.67662027030994

### Calibración XGBoost

In [21]:
# Search grid for XGBoost.
# The previous np.arange grids started at 0, which added two useless values:
# colsample_bytree=0 is outside the accepted range (0, 1], and learning_rate=0
# performs no learning. Only the meaningful values are kept.
param_test1 = {
    'learning_rate': [0.2, 0.4],
    'gamma': range(0, 2, 1),
    'n_estimators': range(300, 401, 100),
    'max_depth': range(2, 6, 2),
    'colsample_bytree': [0.5, 1.0],
}

gsearch1 = GridSearchCV(estimator = XGBRegressor(),
 param_grid = param_test1, scoring='neg_mean_squared_error', n_jobs=-1, cv=5 )

gsearch1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    val

In [22]:
gsearch1.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0,
 'learning_rate': 0.2,
 'max_depth': 4,
 'n_estimators': 400}

In [28]:
# XGBoost with the hyperparameters chosen by the grid search.
# The search tuned `colsample_bytree`; the model previously set `colsample_bylevel`,
# a different parameter, so the tuned value was never actually applied.
modeloXGB = XGBRegressor(
    n_estimators = 400,
    gamma = 0,
    learning_rate = 0.2,
    max_depth = 4,
    colsample_bytree = 1.0)

# fit model
modeloXGB.fit(X_train, y_train)
y_pred = modeloXGB.predict(X_test)

# RMSE on the hold-out split
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

66.8793796430266

In [29]:
# Baseline: XGBoost with the library defaults, fitted on the training split
modeloXGB = XGBRegressor().fit(X_train, y_train)
y_pred = modeloXGB.predict(X_test)

# RMSE on the hold-out split
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

67.21302535286755

## Aplicación de los modelos calibrados

In [10]:
## TRAIN RANDOM FOREST
modelo_rf = RandomForestRegressor(
    bootstrap= True,
    max_depth= None,
    max_features= 1,
    min_samples_leaf= 1,
    min_samples_split= 2,
    n_estimators= 120)

modelo_rf.fit(X_train, y_train)
y_pred = modelo_rf.predict(X_test)

resultados.append({'RandomForest': np.sqrt(metrics.mean_squared_error(y_test, y_pred))} )

# Save the random forest; the `with` block closes the file handle
with open('../models/modelo_rf.sav', 'wb') as _archivo:
    dump(modelo_rf, _archivo)

## TRAIN XGBOOST
# `colsample_bytree` is the parameter the grid search tuned; the model previously
# set `colsample_bylevel`, a different parameter.
modeloXGB = XGBRegressor(
    n_estimators = 400,
    gamma = 0,
    learning_rate = 0.2,
    max_depth = 4,
    colsample_bytree = 1.0)

# fit model
modeloXGB.fit(X_train, y_train)

y_pred = modeloXGB.predict(X_test)

resultados.append({'XGBoost': np.sqrt(metrics.mean_squared_error(y_test, y_pred)) } )

# Save XGBoost
with open('../models/modelo_xgb.sav', 'wb') as _archivo:
    dump(modeloXGB, _archivo)



In [None]:
# Plain arrays for Keras; the targets become column vectors of shape (n, 1)
X_train = np.array(X_train)
X_test = np.array(X_test)

y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

output_var = y_train.shape[1]
dims = X_train.shape[1]

K.clear_session()

# Feed-forward network: 500 -> dropout -> 1000 -> output_var, ReLU throughout
model = Sequential([
    Dense(500, input_shape=(dims,), activation='relu'),
    Dropout(0.3),
    Dense(1000, input_shape=(dims,), activation='relu'),
    Dense(output_var),
    Activation('relu'),
])

# Mean squared error as the loss, since this is a regression task
model.compile(optimizer='adam', loss='mean_squared_error')

# Train for 100 epochs; the hold-out split is used as validation data
model.fit(X_train, y_train,
          validation_data = (X_test, y_test),
          epochs=100,
          callbacks=[PlotLossesKeras()])

y_pred = model.predict(X_test)

# RMSE on the hold-out split
np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    


In [39]:
# Record the neural network's hold-out RMSE next to the other models
resultados.append(dict(RedNeuronal=np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

# Save the neural network
model.save('../models/modelo_rn.h5')

In [40]:
resultados

[{'RandomForest': 67.53712132056702},
 {'XGBoost': 66.59520094366806},
 {'RedNeuronal': 71.85506103067247}]

## Fase 4. Testing

In [4]:
def calculate_proportion_testing(df_total, months, año, mes, denominador, numerador, columna_target, grupo_final, name_column_return):
    """Historical proportions for a single target month.

    Rows whose ``FechaEntrega`` falls in the ``months`` calendar months before the
    first day of (``año``, ``mes``) are passed to ``historical_proportion``. The
    result is tagged with ``Año`` and ``Mes``.

    Args:
        df_total (pandas.DataFrame): history with ``FechaEntrega`` plus the
            grouping columns.
        months (int): size of the look-back window, in months.
        año (int): target year.
        mes (int): target month.
        denominador (list): top-level grouping columns.
        numerador (list): bottom-level grouping columns.
        columna_target (str): measure column.
        grupo_final (list): columns the proportion is reported by.
        name_column_return (str): suffix of the resulting ``proportion_*`` column.

    Returns:
        pandas.DataFrame with NaN replaced by 0 and a fresh index. A DataFrame
        with no rows and no columns when the window holds no history.
    """
    inicio_mes = datetime.datetime(año, mes, 1)
    fechas = df_total['FechaEntrega']
    historia = df_total[(fechas >= inicio_mes - MonthBegin(months)) & (fechas < inicio_mes)]

    if len(historia) == 0:
        resultado = pd.DataFrame()
    else:
        resultado = historical_proportion(historia, denominador, numerador, columna_target, grupo_final, name_column_return)
        resultado['Año'] = año
        resultado['Mes'] = mes

    return resultado.fillna(0).reset_index(drop=True)

def media_diaria_testing(df_total, months, año, mes):
    """Average daily volume per plant over the look-back window of one target month.

    Rows dated within the ``months`` calendar months before the first day of
    (``año``, ``mes``) are summarised per (``Año``, ``Mes``, ``Planta``) as
    ``sum(totalEntregado) / max(DiasOperativos)``. Those monthly values are then
    averaged per plant.

    Args:
        df_total (pandas.DataFrame): history with ``Año``, ``Mes``, ``Planta``,
            ``FechaEntrega``, ``totalEntregado`` and ``DiasOperativos``.
        months (int): size of the look-back window, in months.
        año (int): target year written to the ``Año`` column.
        mes (int): target month written to the ``Mes`` column.

    Returns:
        pandas.DataFrame with columns ``Planta``, ``media<months>``, ``Año``,
        ``Mes`` and a fresh index.
    """
    inicio_mes = datetime.datetime(año, mes, 1)
    fechas = df_total['FechaEntrega']
    historia = df_total[(fechas >= inicio_mes - MonthBegin(months)) & (fechas < inicio_mes)]

    por_mes_planta = (
        historia
        .groupby(['Año', 'Mes', 'Planta'])
        .agg({'totalEntregado': 'sum', 'DiasOperativos': 'max'})
        .reset_index()
    )
    # Daily mean of each historical month: volume over operating days
    por_mes_planta['media'] = por_mes_planta['totalEntregado'] / por_mes_planta['DiasOperativos']

    resultado = por_mes_planta.groupby(['Planta'])['media'].mean().reset_index()
    resultado['Año'] = año
    resultado['Mes'] = mes
    resultado = resultado.rename(columns={'media': 'media' + str(months)})

    return resultado.reset_index(drop=True)

def volumen_mes_testing(df_total, año, mes, fecha_ref):
    """Delivered volume per city over the 30 days leading up to ``fecha_ref``.

    Args:
        df_total (pandas.DataFrame): history with ``FechaEntrega``, ``Ciudad``
            and ``totalEntregado``.
        año (int): value written to the ``Año`` column of the result.
        mes (int): value written to the ``Mes`` column of the result.
        fecha_ref (datetime.datetime): reference date; rows dated on or after
            ``fecha_ref`` minus 30 days are included.

    Returns:
        pandas.DataFrame with columns ``Ciudad``, ``volumen_ciudad``, ``Año``
        and ``Mes``.
    """
    desde = fecha_ref - datetime.timedelta(30)
    recientes = df_total[df_total['FechaEntrega'] >= desde]

    totales = recientes.groupby(['Ciudad'])[['totalEntregado']].sum().reset_index()
    totales['Año'] = año
    totales['Mes'] = mes

    return totales.rename(columns={'totalEntregado': 'volumen_ciudad'})

In [5]:
# Country and month being forecast
pais = 'Colombia'
año_target = 2022
mes_target = 6

# History window: it starts seven month-starts before the first day of the target
# month and ends on a fixed date.
# Alternative end of history: datetime.datetime.today()
inicioHistoria = datetime.datetime(año_target, mes_target, 1) - MonthBegin(7)
finHistoria = datetime.datetime(year=2022, month=5, day=31)

In [6]:
#Volumen pre-establecido por ciudad
volumen_testing_ciudad = pd.read_excel('../DatosAbsorcionPlantas/ColombiaCiudad.xlsx')

df_calendario = querySQL( 
    "SELECT * FROM SCAC_AT3_DiasHabilesFuente WHERE pais = ? and [Fecha de entrega]  between ? and ? " , 
    (pais, datetime.datetime(año_target, mes_target, 1).strftime("%Y-%m-%d"), 
     (datetime.datetime(año_target, mes_target, 1) + MonthEnd(1)).strftime("%Y-%m-%d") )
)

df_calendario.rename(
    columns={'Fecha de entrega':'FechaEntrega'}, 
    inplace =True)

#cross join tabla DesagregacionPronostico y calendario
df_calendario['key'] = 1
volumen_testing_ciudad['key'] = 1
DesagregacionPronosticoPlantaDia = pd.merge(df_calendario, volumen_testing_ciudad, on = 'key').drop("key",1)
DesagregacionPronosticoPlantaDia.rename(columns={'ciudad_asignaciones':'Ciudad'}, inplace = True)
DesagregacionPronosticoPlantaDia = DesagregacionPronosticoPlantaDia.drop(['ciudad_ic'], axis = 1)

In [7]:
DesagregacionPronosticoPlantaDia

Unnamed: 0,ID,pais,Año,Mes,FechaEntrega,Días_Operativos,Días_Operativos_Acum,Dia_Semana,Semana_relativa,Semanas_mes,Total_Dias_Habiles_Mes,Ciudad,volumen
0,39915,Colombia,2022,6,2022-06-01,1,1,4,1,23,24,Bogotá,56013.950951
1,39915,Colombia,2022,6,2022-06-01,1,1,4,1,23,24,Ricaurte,7200.000000
2,39915,Colombia,2022,6,2022-06-01,1,1,4,1,23,24,Ibagué,7939.804003
3,39915,Colombia,2022,6,2022-06-01,1,1,4,1,23,24,Fusagasuga,1350.000000
4,39915,Colombia,2022,6,2022-06-01,1,1,4,1,23,24,Neiva,3427.161977
...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,39944,Colombia,2022,6,2022-06-30,1,24,5,5,27,24,Santa Marta,2468.729235
536,39944,Colombia,2022,6,2022-06-30,1,24,5,5,27,24,Bucaramanga,8741.949601
537,39944,Colombia,2022,6,2022-06-30,1,24,5,5,27,24,Cúcuta,1375.770455
538,39944,Colombia,2022,6,2022-06-30,1,24,5,5,27,24,Paraiso Central,987.000000


In [8]:
# Daily dispatched-volume history per plant for the testing window
despachosSQL2 = querySQL("{CALL SCAC_AP8_BaseForecast (?,?,?)}",
                         (pais,
                          inicioHistoria.strftime("%Y-%m-%d"),
                          finHistoria.strftime("%Y-%m-%d")
                         )
                        )

despachosSQL2['year_month'] = despachosSQL2.FechaEntrega.dt.to_period('M')
despachosSQL2['totalEntregado'] = despachosSQL2['totalEntregado'].astype(float)

# Geographic information per plant.
# `(pais,)` needs the trailing comma: `(pais)` is just the string itself, not a
# one-element parameter tuple.
nombre_cluster = querySQL(
    "SELECT Pais, Centro, Ciudad_Cluster as Ciudad, [Desc Cluster] as Cluster, Plantas_fijas FROM SCAC_AT1_NombreCluster where Pais = ?" ,
    (pais,) )

despachosSQL2 = pd.merge(
    despachosSQL2,
    nombre_cluster,
    left_on ='Planta',
    right_on='Centro',
    how = 'left'
)

# Cities and plants with dispatches since the cut-off date
# (end of history minus 30 days, rolled back by one month start)
DesagregacionPronosticoPlanta = despachosSQL2[
    despachosSQL2['FechaEntrega'] >= (finHistoria - datetime.timedelta(30)) - MonthBegin(1)
]

# One row per (Ciudad, Planta, Plantas_fijas) combination
DesagregacionPronosticoPlanta = DesagregacionPronosticoPlanta.groupby(
    ['Ciudad', 'Planta', 'Plantas_fijas']
).size().reset_index()

# The group-size column (named 0) is not needed
DesagregacionPronosticoPlanta = DesagregacionPronosticoPlanta.drop([0], axis = 1)

# Expand the city-day forecast frame to one row per plant of each city
DesagregacionPronosticoPlantaDia = pd.merge(DesagregacionPronosticoPlantaDia, DesagregacionPronosticoPlanta, on='Ciudad' )


# Proportion features. Every call shares the same dataset, target year/month
# and value column; only the window, the breakdown dimension and the label
# prefix change, so the common arguments live in one helper.
def _proporcion_testing(dimension, ventana, etiqueta):
    """Call calculate_proportion_testing for one breakdown dimension and window.

    dimension: column used for the lower grouping level
               ('Semana_Relativa' or 'DiaSemana').
    ventana:   second positional argument of calculate_proportion_testing
               (presumably the look-back window in months; confirm against
               that function's definition).
    etiqueta:  label prefix; the window is appended to it ('semana' -> 'semana1').

    Fresh list objects are built on every call, exactly as the original
    literal arguments were, so the callee never receives a shared list.
    """
    return calculate_proportion_testing(despachosSQL2,
                                        ventana, año_target, mes_target,
                                        ['Año', 'Mes', 'Planta'],
                                        ['Año', 'Mes', 'Planta', dimension],
                                        'totalEntregado',
                                        ['Planta', dimension],
                                        etiqueta + str(ventana))


# Share of each relative week within the plant's volume.
proportion_week_1 = _proporcion_testing('Semana_Relativa', 1, 'semana')
proportion_week_2 = _proporcion_testing('Semana_Relativa', 2, 'semana')
proportion_week_3 = _proporcion_testing('Semana_Relativa', 3, 'semana')
proportion_week_6 = _proporcion_testing('Semana_Relativa', 6, 'semana')

# Share of each weekday within the plant's volume.
proportion_weekday_1 = _proporcion_testing('DiaSemana', 1, 'dia_semana')
proportion_weekday_2 = _proporcion_testing('DiaSemana', 2, 'dia_semana')
proportion_weekday_3 = _proporcion_testing('DiaSemana', 3, 'dia_semana')
proportion_weekday_6 = _proporcion_testing('DiaSemana', 6, 'dia_semana')
# Daily means for the four windows (1, 2, 3 and 6), computed in that order.
media_diaria1, media_diaria2, media_diaria3, media_diaria6 = (
    media_diaria_testing(despachosSQL2, ventana, año_target, mes_target)
    for ventana in (1, 2, 3, 6)
)

# Volume per city for the target month.
volumen_ciudad_mes = volumen_mes_testing(despachosSQL2, año_target, mes_target, finHistoria)

# Align the column names of the day-level frame with the names used by the
# feature tables above (renamed in place).
nombres_normalizados = {
    'Dia_Semana': 'DiaSemana',
    'Semana_relativa': 'Semana_Relativa',
    'Plantas_fijas': 'PlantaCentral',
}
DesagregacionPronosticoPlantaDia.rename(columns=nombres_normalizados, inplace=True)

In [9]:
# Feature frame for the target month, one row per (plant, day).
# .copy() makes this an independent frame, so the column assignment below is
# not a chained assignment on a slice of DesagregacionPronosticoPlantaDia
# (the resulting SettingWithCopyWarning is hidden by the global warnings filter).
datatesting = DesagregacionPronosticoPlantaDia[[
    'Año',
    'Mes',
    'Ciudad',
    'Planta',
    'Semana_Relativa',
    'DiaSemana', 'PlantaCentral']].copy()

datatesting['DiaSemana'] = datatesting['DiaSemana'].astype(int)

# Left-join every feature table on its own key set, in the same order as the
# original one-merge-per-line version.
tablas_por_clave = [
    (['Año', 'Mes', 'Planta', 'Semana_Relativa'],
     [proportion_week_1, proportion_week_2, proportion_week_3, proportion_week_6]),
    (['Año', 'Mes', 'Planta', 'DiaSemana'],
     [proportion_weekday_1, proportion_weekday_2, proportion_weekday_3, proportion_weekday_6]),
    (['Año', 'Mes', 'Planta'],
     [media_diaria1, media_diaria2, media_diaria3, media_diaria6]),
    (['Año', 'Mes', 'Ciudad'],
     [volumen_ciudad_mes]),
]
for claves, tablas in tablas_por_clave:
    for tabla in tablas:
        datatesting = pd.merge(datatesting, tabla, on=claves, how='left')

# Binary flag: 1 when the plant is labelled 'Central', 0 otherwise.
datatesting['PlantaCentral'] = np.select(
    [datatesting['PlantaCentral'] == 'Central'],
    [1],
    default=0,
)

# Features with no match in the left joins are filled with 0.
datatesting = datatesting.fillna(0)

# Final column selection and order of the feature frame.
columnas = ['Año', 'Mes', 'Planta','Semana_Relativa', 'DiaSemana',
            'proportion_semana1', 'proportion_semana2',
            'proportion_semana3', 'proportion_semana6',
            'proportion_dia_semana1','proportion_dia_semana2',
            'proportion_dia_semana3', 'proportion_dia_semana6',
            'media1', 'media2', 'media3', 'media6',
            'PlantaCentral', 'volumen_ciudad']

datatesting = datatesting[columnas]


In [10]:
## Apply the persisted label encoding and feature scaling

# Load the fitted label encoder and scaler. The files are opened with a
# context manager so the handles are closed right after unpickling.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open('../label_encoders_scalers/le_planta.pkl', 'rb') as archivo:
    le_planta = load(archivo)
with open('../label_encoders_scalers/min_max_scaler.pkl', 'rb') as archivo:
    min_max_scaler = load(archivo)

# NOTE(review): 'Planta' is encoded in place, so this cell is not re-runnable
# without rebuilding `datatesting` first.
datatesting['Planta'] = le_planta.transform(datatesting['Planta'])
num_vars = ['Año', 'Mes','Semana_Relativa','DiaSemana', 'media1', 'media2', 'media3', 'media6', 'volumen_ciudad']

# Columns that are passed through without scaling.
columnas_sin_escalar = ['Planta', 'proportion_semana1', 'proportion_semana2', 'proportion_semana3',
                        'proportion_semana6', 'proportion_dia_semana1',
                        'proportion_dia_semana2', 'proportion_dia_semana3',
                        'proportion_dia_semana6', 'PlantaCentral']

# Unscaled columns first, then the scaled numeric variables, aligned on a
# fresh 0..n-1 index (reset_index(drop=True) replaces reset_index + drop('index')).
datatesting_transformed = pd.concat(
    [
        datatesting[columnas_sin_escalar].reset_index(drop=True),
        pd.DataFrame(min_max_scaler.transform(datatesting[num_vars].values), columns=num_vars),
    ],
    axis=1,
    join='inner',
)

In [15]:
# Persist the model-ready features (the row index is written as the first column).
datatesting_transformed.to_csv(path_or_buf='../datos/df_datatesting.csv')

In [None]:
# Load the regression models.
# The pickled models are read inside `with` blocks so the file handles are
# closed right away. NOTE: pickle can execute arbitrary code; only load
# model files you trust.
with open('../models/modelo_rf.sav', 'rb') as archivo:
    m_rf = load(archivo)
with open('../models/modelo_xgb.sav', 'rb') as archivo:
    m_xgb = load(archivo)
m_rn = load_model('../models/modelo_rn.h5')

## RANDOM FOREST MODEL
y_pred_RF = m_rf.predict(datatesting_transformed)
# The single prediction column (0) is given a descriptive name.
df_RF = pd.DataFrame(y_pred_RF).rename(columns={0: 'PrediccionRF'})

In [None]:


## XGBOOST MODEL
y_pred_XG = m_xgb.predict(datatesting_transformed)
df_XG = pd.DataFrame(y_pred_XG).rename(columns={0: 'PrediccionXG'})

## NEURAL NETWORK MODEL
y_pred_RN = m_rn.predict(datatesting_transformed)
df_RN = pd.DataFrame(y_pred_RN).rename(columns={0: 'PrediccionRNR'})

In [None]:
# Put the three model outputs next to the features and take the row-wise
# median of the predictions as the AI consensus.
columnas_modelos = ['PrediccionRNR', 'PrediccionRF', 'PrediccionXG']
result_consejo = pd.concat(
    [datatesting_transformed, df_RN, df_RF, df_XG],
    axis=1,
    join='inner',
)
result_consejo['consenso_ia'] = result_consejo[columnas_modelos].median(axis=1)

# Undo the label encoding of the plant code.
result_consejo['Planta'] = le_planta.inverse_transform(result_consejo['Planta'])

# Undo the min-max scaling of the numeric variables and rebuild the frame:
# pass-through columns first, then the numeric variables on their original scale.
columnas_sin_escalar = ['Planta', 'proportion_semana1', 'proportion_semana2', 'proportion_semana3',
                        'proportion_semana6', 'proportion_dia_semana1',
                        'proportion_dia_semana2', 'proportion_dia_semana3',
                        'proportion_dia_semana6', 'PlantaCentral',
                        'PrediccionRNR', 'PrediccionRF', 'PrediccionXG', 'consenso_ia']
pasa_directo = result_consejo[columnas_sin_escalar].reset_index(drop=True)
numericas_originales = pd.DataFrame(
    min_max_scaler.inverse_transform(result_consejo[num_vars].values),
    columns=num_vars,
)
result_consejo = pd.concat([pasa_directo, numericas_originales], axis=1, join='inner')


In [20]:
# Load a forecast from disk into result_consejo.
# NOTE(review): this replaces the result_consejo built by the consensus cell
# with the contents of ForecastV3.xlsx, so the cells below work on the file,
# not on the predictions computed in this session. Confirm this is intended
# and where the spreadsheet comes from.
result_consejo = pd.read_excel('../datos/ForecastV3.xlsx')

In [21]:
# Final adjustments: attach the delivery date and the geographic attributes
# to every forecast row.

# Working-day calendar rows for the target month (first to last day).
inicio_mes_target = datetime.datetime(año_target, mes_target, 1)
fin_mes_target = inicio_mes_target + MonthEnd(1)

df_calendario = querySQL(
    "SELECT * FROM SCAC_AT3_DiasHabilesFuente WHERE pais = ? and [Fecha de entrega]  between ? and ? ",
    (pais, inicio_mes_target.strftime("%Y-%m-%d"), fin_mes_target.strftime("%Y-%m-%d")),
)

df_calendario = df_calendario.rename(
    columns={'Fecha de entrega': 'FechaEntrega', 'Dia_Semana': 'DiaSemana', 'Semana_relativa': 'Semana_Relativa'}
)

# Geographic information per plant. The parameter is passed as a real
# one-element tuple: `(pais)` is just the string itself.
nombre_cluster = querySQL(
    "SELECT Pais, Centro, Ciudad_Cluster as Ciudad, [Desc Cluster] as Cluster, [Planta Unica] as PlantaUnica FROM SCAC_AT1_NombreCluster where Pais = ?",
    (pais,),
)

# The calendar join keys are compared as strings on both sides.
claves_calendario = ['Año', 'Mes', 'DiaSemana', 'Semana_Relativa']

for columna in claves_calendario:
    df_calendario[columna] = df_calendario[columna].astype(str)

# On the forecast side the keys are first rounded to whole numbers so that
# the strings match the calendar's integer values (e.g. '6', not '6.0').
for columna in claves_calendario:
    result_consejo[columna] = (
        result_consejo[columna].astype(float).round(0).astype(int).astype(str)
    )

# Left join keeps every forecast row even when the calendar has no matching day.
result_consejo = pd.merge(
    result_consejo,
    df_calendario[claves_calendario + ['FechaEntrega']],
    on=claves_calendario,
    how='left',
)
# Default (inner) join: rows whose plant is missing from nombre_cluster are dropped.
result_consejo = pd.merge(result_consejo, nombre_cluster, left_on=['Planta'], right_on=['Centro'])

# Columns of the published result. To keep the per-model predictions as well,
# add 'PrediccionRNR', 'PrediccionRF' and 'PrediccionXG' before 'consenso_ia'.
columnas_finales = ['Pais', 'Ciudad', 'Centro', 'PlantaUnica', 'FechaEntrega','consenso_ia']

In [22]:
# Keep only the columns of the published result.
result_consejo = result_consejo.loc[:, columnas_finales]

In [23]:
# Display the final per-plant, per-day consensus forecast.
result_consejo

Unnamed: 0,Pais,Ciudad,Centro,PlantaUnica,FechaEntrega,consenso_ia
0,Colombia,Bogotá,F001,CO-PLANTA 240,2022-06-01,360.700000
1,Colombia,Bogotá,F001,CO-PLANTA 240,2022-06-02,380.390000
2,Colombia,Bogotá,F001,CO-PLANTA 240,2022-06-03,414.954167
3,Colombia,Bogotá,F001,CO-PLANTA 240,2022-06-04,294.597083
4,Colombia,Bogotá,F001,CO-PLANTA 240,2022-06-05,95.747414
...,...,...,...,...,...,...
925,Colombia,Maceo,FA05,CO-PLANTA MACEO,2022-06-26,39.576118
926,Colombia,Maceo,FA05,CO-PLANTA MACEO,2022-06-27,25.425000
927,Colombia,Maceo,FA05,CO-PLANTA MACEO,2022-06-28,29.812500
928,Colombia,Maceo,FA05,CO-PLANTA MACEO,2022-06-29,31.582018


In [25]:
# Export the forecast to a timestamped workbook so earlier runs are not overwritten.
marca_tiempo = pd.to_datetime("now").strftime("%Y-%m-%d-%H-%M-%S")
result_consejo.to_excel(f"../datos/Desagregacion_{pais}_{marca_tiempo}.xlsx")

## Fase 5. Validación de resultados

In [28]:
# Validation window: actual dispatches to compare the forecasts against.
# NOTE(review): inicioHistoria / finHistoria are rebound to strings here,
# while the history cell above calls .strftime() on them; re-running that
# cell after this one will fail.
pais = 'Colombia'
inicioHistoria = '2022-06-01'
finHistoria = '2022-06-08'

# Daily dispatched-volume history per plant for the validation window.
despachosSQL2 = querySQL("{CALL SCAC_AP8_BaseForecast (?,?,?)}", (pais, inicioHistoria, finHistoria))
despachosSQL2['year_month'] = despachosSQL2.FechaEntrega.dt.to_period('M')
despachosSQL2['totalEntregado'] = despachosSQL2['totalEntregado'].astype(float)

# Geographic information per plant. The parameter is passed as a real
# one-element tuple: `(pais)` is just the string itself.
nombre_cluster = querySQL(
    "SELECT Pais, Centro, Ciudad_Cluster as Ciudad, [Desc Cluster] as Cluster, [Planta Unica] as PlantaUnica FROM SCAC_AT1_NombreCluster where Pais = ?",
    (pais,),
)

despachosSQL2 = pd.merge(despachosSQL2, nombre_cluster, left_on='Planta', right_on='Centro', how='left')



In [80]:
# Actual delivered volume per plant and delivery date (summed over the
# remaining rows), keeping the geographic attributes as grouping keys.
validacion = despachosSQL2.groupby([
    'Pais',
    'Cluster',
    'Ciudad',
    'PlantaUnica',
    'Planta',
    'FechaEntrega'
])['totalEntregado'].sum().reset_index()

# Inner join of the actuals with the forecast on plant and date.
# NOTE(review): in top-to-bottom order result_consejo has been reduced to
# columnas_finales, which contains 'Centro' but no 'Planta', so this merge
# relies on an earlier kernel state. Confirm which frame is meant here.
validacion = pd.merge(validacion, result_consejo, on=['Planta', 'FechaEntrega'] )
# Inner join with the previous model's forecast (column M3Forecast).
# NOTE(review): df_modelo_anterior is not assigned anywhere in the visible
# part of this notebook; it presumably comes from a deleted or later cell.
validacion = pd.merge(
    validacion, 
    df_modelo_anterior[['Centro', 'FechaEntrega', 'M3Forecast']], 
    left_on=['Planta', 'FechaEntrega'], 
    right_on=['Centro', 'FechaEntrega'] )

In [76]:
# Inspect the validation rows of plant F011.
validacion.loc[validacion['Planta'] == 'F011']

Unnamed: 0,Pais,Cluster,Ciudad,PlantaUnica,Planta,FechaEntrega,totalEntregado,PrediccionRNR,PrediccionRF,PrediccionXG,consenso_ia
25,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-02,36.0,160.184219,164.8325,155.571014,160.184219
26,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-03,160.5,172.752197,184.2025,185.46701,184.2025
27,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-04,142.25,182.019012,235.955,212.390854,212.390854
28,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-05,110.0,186.061676,234.71,203.651443,203.651443
29,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-06,229.5,188.348602,230.197,226.557892,226.557892
30,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-07,146.25,136.710709,152.055,158.66391,152.055
31,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-09,122.5,166.640152,158.4975,163.400345,163.400345
32,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-10,208.0,182.113098,181.1825,190.12886,182.113098
33,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-11,268.25,191.909256,224.2775,216.293045,216.293045
34,Colombia,CLUSTER ANTIOQUIA,Medellín,CO-PLANTA MEDELLIN,F011,2022-05-12,162.75,199.318878,221.4,207.728348,207.728348


In [72]:
# RMSE of the AI consensus against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['consenso_ia'],
    )
)

58.789948701890424

In [73]:
# RMSE of the neural-network prediction against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['PrediccionRNR'],
    )
)

65.29273209853775

In [74]:
# RMSE of the random-forest prediction against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['PrediccionRF'],
    )
)

58.50169547870461

In [75]:
# RMSE of the XGBoost prediction against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['PrediccionXG'],
    )
)

60.261731654411044

In [82]:
# RMSE of the M3Forecast column (merged in from df_modelo_anterior) against
# the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['M3Forecast'],
    )
)

62.50624807706623

In [26]:
# Stored plant/day disaggregation saved under version 'CONSENSO_JUN_2022'.
df_modelo_anterior_consenso = querySQL(
    query="SELECT * FROM SCAC_AV7_DesagregacionPronosticoCiudadPlantaDiaTabla WHERE pais = ? and Version = ? ",
    parametros=(pais, 'CONSENSO_JUN_2022'),
)



In [27]:
# Stored plant/day disaggregation saved under version 'OTRO_JUN_2022'.
df_modelo_anterior_otro = querySQL(
    query="SELECT * FROM SCAC_AV7_DesagregacionPronosticoCiudadPlantaDiaTabla WHERE pais = ? and Version = ? ",
    parametros=(pais, 'OTRO_JUN_2022'),
)


In [29]:
# Actual delivered volume per plant and delivery date.
claves_validacion = ['Pais', 'Cluster', 'Ciudad', 'PlantaUnica', 'Planta', 'FechaEntrega']
validacion = (
    despachosSQL2
    .groupby(claves_validacion)['totalEntregado']
    .sum()
    .reset_index()
)

# Inner-join both stored forecast versions on plant and date. Because the two
# right-hand frames share the 'Centro' and 'M3Forecast' column names, the
# second merge suffixes them: *_x comes from the CONSENSO version and *_y
# from the OTRO version.
validacion = validacion.merge(
    df_modelo_anterior_consenso[['Centro', 'FechaEntrega', 'M3Forecast']],
    left_on=['Planta', 'FechaEntrega'],
    right_on=['Centro', 'FechaEntrega'],
)
validacion = validacion.merge(
    df_modelo_anterior_otro[['Centro', 'FechaEntrega', 'M3Forecast']],
    left_on=['Planta', 'FechaEntrega'],
    right_on=['Centro', 'FechaEntrega'],
)

In [31]:
# RMSE of M3Forecast_y (the column merged from df_modelo_anterior_otro)
# against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['M3Forecast_y'],
    )
)

55.52778751421033

In [33]:
# RMSE of M3Forecast_x (the column merged from df_modelo_anterior_consenso)
# against the delivered volume.
np.sqrt(
    metrics.mean_squared_error(
        y_true=validacion['totalEntregado'],
        y_pred=validacion['M3Forecast_x'],
    )
)

78.88029108508495

In [35]:
# Re-query the dispatch history for the current window, without the
# geographic merge: add the month period and cast the volume to float.
despachosSQL2 = querySQL(
    "{CALL SCAC_AP8_BaseForecast (?,?,?)}",
    (pais, inicioHistoria, finHistoria),
).assign(
    year_month=lambda historico: historico.FechaEntrega.dt.to_period('M'),
    totalEntregado=lambda historico: historico['totalEntregado'].astype(float),
)

In [36]:
# Display the re-queried dispatch history for inspection.
despachosSQL2

Unnamed: 0,Año,Mes,Planta,TipoPlanta,totalEntregado,Semana_Relativa,DiaSemana,FechaEntrega,DiasOperativos,year_month
0,2022,6,F001,Central,390.00,1,4,2022-06-01,1,2022-06
1,2022,6,F001,Central,456.00,1,5,2022-06-02,2,2022-06
2,2022,6,F001,Central,443.75,1,6,2022-06-03,3,2022-06
3,2022,6,F001,Central,251.00,1,7,2022-06-04,4,2022-06
4,2022,6,F001,Central,386.25,2,2,2022-06-06,5,2022-06
...,...,...,...,...,...,...,...,...,...,...
176,2022,6,FB89,CXO,48.50,1,5,2022-06-02,2,2022-06
177,2022,6,FB89,CXO,47.50,1,6,2022-06-03,3,2022-06
178,2022,6,FB89,CXO,56.00,1,7,2022-06-04,4,2022-06
179,2022,6,FB89,CXO,59.25,2,2,2022-06-06,5,2022-06


In [67]:
# Delivered volume per city for May 2022, exported for testing.
pais = 'Colombia'
inicioHistoria = '2022-05-01'
finHistoria = '2022-05-31'

# Daily dispatched-volume history per plant for the window above.
despachosSQL2 = querySQL("{CALL SCAC_AP8_BaseForecast (?,?,?)}", (pais, inicioHistoria, finHistoria))
despachosSQL2['year_month'] = despachosSQL2.FechaEntrega.dt.to_period('M')
despachosSQL2['totalEntregado'] = despachosSQL2['totalEntregado'].astype(float)

# Geographic information per plant. The parameter is passed as a real
# one-element tuple: `(pais)` is just the string itself.
nombre_cluster = querySQL(
    "SELECT Pais, Centro, Ciudad_Cluster as Ciudad, [Desc Cluster] as Cluster FROM SCAC_AT1_NombreCluster where Pais = ?",
    (pais,),
)

despachosSQL2 = pd.merge(despachosSQL2, nombre_cluster, left_on='Planta', right_on='Centro', how='left')

# Total delivered volume per city over the window.
volumen_testing = despachosSQL2.groupby(['Ciudad'])['totalEntregado'].sum().reset_index()

volumen_testing.to_excel('../datos/testingv3.xlsx')