## Parte 1

**Problema:**

-O cliente precisa de atualizar o armazenamento de roupa em cada loja para Outubro de 2019.

**Objetivo:**

-Prever com precisão as vendas semanais considerando sazonalidade,tendência e variáveis explicativas. Para complementar, deverá ser feito uma análise dos dados que serão utilizados no modelo.

## Seleção de Modelos para cada Store_id

In [11]:
# carregar bibliotecas
library(dplyr)
library(forecast)
library(tidyverse)
library(xts)
library(ggplot2)
library(tidyverse)

In [2]:
# Carregar dados de metricas dos diversos modelos
path <- "C:/Users/Egomes/Desktop/PG_Analytics_Data_science_empresarial/Isla_gaia/14-ProjetoII/projetoII/dataStaging/accuracySimpleMethods.csv"
simpleModels <- read.csv(path)
rm(path)

path <- "C:/Users/Egomes/Desktop/PG_Analytics_Data_science_empresarial/Isla_gaia/14-ProjetoII/projetoII/dataStaging/accuracySimpleLineaRegressionModels.csv"
lrModels <- read.csv(path)
rm(path)

path <- "C:/Users/Egomes/Desktop/PG_Analytics_Data_science_empresarial/Isla_gaia/14-ProjetoII/projetoII/dataStaging/accuracyEtsModels.csv"
etsModels <- read.csv(path)
rm(path)

path <- "C:/Users/Egomes/Desktop/PG_Analytics_Data_science_empresarial/Isla_gaia/14-ProjetoII/projetoII/dataStaging/accuracyAutoArimaSarimaModels.csv"
arimaModels <- read.csv(path)
rm(path)

path <- "C:/Users/Egomes/Desktop/PG_Analytics_Data_science_empresarial/Isla_gaia/14-ProjetoII/projetoII/dataStaging/accuracyArimaXregModels.csv"
arimaxregModels <- read.csv(path)
rm(path)

In [3]:
# Agregar dataframes em um unico
result <- left_join(simpleModels, etsModels, by = "store_id")
result <- left_join(result, lrModels, by = "store_id")
result <- left_join(result, arimaModels, by = "store_id")
result <- left_join(result, arimaxregModels, by = "store_id")
# Analytics dataframe
result <- result %>%
  pivot_longer(cols = -c(store_id), names_to = "metrics_models", values_to = "values")
# Dividir a coluna em duas partes usando o delimitador "_"
result <- separate(result, metrics_models, into = c("metrics", "model"), sep = "_")
head(result,10)

store_id,metrics,model,values
<chr>,<chr>,<chr>,<dbl>
S0002,ME,avg,-1586.15
S0002,RMSE,avg,1949.76841
S0002,MAE,avg,1661.294
S0002,MPE,avg,-48.61882
S0002,MAPE,avg,50.92214
S0002,ME,naive,11435.06
S0002,RMSE,naive,11491.14103
S0002,MAE,naive,11435.06
S0002,MPE,naive,70.22427
S0002,MAPE,naive,70.22427


In [4]:
# Tendo o foco na precisão da previsão o uso da media  RMSE penaliza erros maiores de forma mais significativa
# O RMSE também é sensível a outliers, pois os valores elevados ao quadrado aumentam a influência desses pontos.
# Raiz Quadrada do Erro Médio Quadrático (RMSE): Calcula a raiz quadrada do MSE, Fornece uma métrica de erro médio em unidades originais. 
# O RMSE é sensível a grandes desvios e pode ser útil quando erros maiores são penalizados de forma desproporcional.

# Selecionar metrica == RMSE
result <- result[result$metrics == "RMSE", ]
dim(result)
result <- result[complete.cases(result),]
# Retirar NaN existentes
dim(result)

In [5]:
# Criar lista de dataframes em que cada elemento é uma store
# criar variável com lista das lojas unicas
stores <- unique(result$store_id)
# Lista para armazenar os dataframes
df_list <- list()
# Loop para criar os dataframes separados
for (store in stores) {
  # filtrar dataframe
  df_store <- result[result$store_id == store,]
  df_store <- df_store[, c("store_id", "model", "values")]
  # Adicionar dataframe à lista
  df_list[[store]] <- df_store
}
# Selecionar o modelo com RMSE mais baixo
# Criar dataframe vazio
df_select_models <- data.frame(store_id = character(), 
                        model = character(), 
                        stringsAsFactors = FALSE)

for (i in names(df_list)){
    data <- df_list[[i]]
    # Selecionando a linha com o menor RMSE
    row_rmse <- data[which.min(data$values), ] # menor rmse modelo
    row_rmse <- row_rmse[, c("store_id", "model", "values")]
    # Adicionar nova linha ao dataframe
    df_select_models <- rbind(df_select_models, row_rmse)
}
nrow(df_select_models)

In [6]:
# Ver numero de Lojas por modelo
df_select_models %>%
  group_by(model) %>%
  summarise(nr_stores = n())

model,nr_stores
<chr>,<int>
avg,10
ets,1
holtdamped,4
lm,26
snaive,4
stlf,2
tslm,16


## Final Predictions

In [7]:
# Carregar dados Semanais
path <- "https://raw.githubusercontent.com/E-man85/projectII/main/dataStaging/grainWeekMultipleVariables.csv"
df <- read.csv(path)
# Alterar tipo da coluna date
df$date <- as.Date(df$date)
# criar variável com lista das lojas unicas
stores <- unique(df$store_id)
# Lista para armazenar os dataframes
df_list <- list()
# Loop para criar os dataframes separados
for (store in stores) {
  # filtrar dataframe
  df_store <- df[df$store_id == store,]
  # Adicionar dataframe à lista
  df_list[[store]] <- df_store
}
# Converter em timeseries ts
ts_list <- list()
# Loop
for(t in names(df_list)){
    data <- df_list[[t]]
    ts <- ts(data = data$summed_revenue, frequency = 52)
    ts_list[[t]] <- ts
}
length(ts_list)

In [8]:
# model linear regression
forecast_values_lm <- list()
#loop
for (df in names(df_list)){
  if (df %in% c("S0002", "S0003", "S0010","S0012", "S0020", "S0023","S0030", "S0038", 
                "S0050","S0052","S0059","S0062","S0072","S0073","S0076","S0085","S0086",
                "S0089","S0091","S0095","S0102","S0107","S0109","S0122","S0126","S0132")) {
    data <- df_list[[df]]
    model <- lm(summed_revenue ~ date, data = data)
    novos_dados <- data.frame(date = seq(max(data$date) + 7, length.out = 5, by = 7))
    forecast_result <- forecast(model, newdata = novos_dados)
    previsoes <- round(forecast_result$mean,2)
    forecast_values_lm[[df]] <- previsoes
  }
}
length(forecast_values_lm)

# model tslm 
forecast_values_tslm <- list()
for (ts in names(ts_list)){
  if (ts %in% c("S0014", "S0016", "S0026","S0036","S0039","S0041","S0056","S0058","S0067","S0080","S0094","S0099","S0104","S0108","S0131","S0141")) {
    data <- ts_list[[ts]]
    model <- tslm(data ~ trend + season)
    forecast_result <- forecast(model, h = 5)
    forecast_values_tslm[[ts]] <- round(forecast_result$mean,2)
  } 
}
length(forecast_values_tslm)

# model ETS (Error, Trend, Seasonality) 
forecast_values_ets <- list()
for (ts in names(ts_list)){
  if (ts %in% c("S0055")) {
    data <- ts_list[[ts]]
    data <- tail(data, 24)
    forecast_ets <- forecast(ets(data), h = 5)
    forecast_values_ets[[ts]] <- round(forecast_ets$mean,2)
  }
}
length(forecast_values_ets)

# model STLF (Seasonal and Trend decomposition using Loess and Fourier) 
forecast_values_stls <- list()
# Loop
for (ts in names(ts_list)){
  if (ts %in% c("S0077", "S0120")) {
    data <- ts_list[[ts]]
    forecast_stlf <- stlf(data,5)
    forecast_values_stls[[ts]] <- round(forecast_stlf$mean,2)
  }
}
length(forecast_values_stls)

# snaive
forecast_values_snaive <- list()
# loop
for (ts in names(ts_list)) {
  if (ts %in% c("S0040", "S0045","S0142")) {
  data <- ts_list[[i]]
  forecast_snaive <- round(snaive(data, 5)$mean,2)
  forecast_values_snaive[[ts]] <- forecast_snaive
  }
}
length(forecast_values_snaive)

# Modelo loja S0136	arimaXreg
forecast_values_arimaxreg <- list()
for (ts in names(ts_list)) {
  if (ts %in% c("S0136")) {
    data <-df_list[[ts]]
  # Acrescentar variavel open_closed
    data$open_closed <- ifelse(data$number_month %in% c(1, 2, 3, 4, 9, 10, 11, 12), 0, 1)
    # selecionar colunas
    data <- data[, c('store_id','date', 'summed_revenue','open_closed')]
    ts_data <- as.matrix(data[, !(names(data) %in% c('store_id', 'date'))])
    exogenous_var <- ts_data[, "open_closed"]
    model <- auto.arima(ts_data[,"summed_revenue"], xreg = exogenous_var)
  # Realizar previsões
    ts_predict <- as.matrix(rep(0,5)) # variável exogena para a previsão
    exogenous_var <- ts_predict
    forecast <- forecast(model, xreg = exogenous_var , h = 5)
    forecast_values_arimaxreg[[ts]] <- forecast$mean
  }
}
length(forecast_values_arimaxreg)

# Average
forecast_values_avg <- list()
# loop
for (ts  in names(ts_list)){
  if (ts %in% c("S0005","S0015","S0022","S0032","S0046","S0061","S0083","S0088","S0092","S0097")){
    data <- ts_list[[ts]]
    avg_forecast <- round(meanf(data,5)$mean,2)
    forecast_values_avg[[ts]] <- avg_forecast
  }
}
length(forecast_values_avg)

# holtdamped
forecast_values_holtdamped <- list()
# Loop
for (ts in names(ts_list)){
  if (ts %in% c("S0007","S0068","S0071","S0143")){
    data <- ts_list[[ts]]
    model_holt_damped <- forecast(holt(data, damped=TRUE), h = 5)
    forecast_values_holtdamped[[ts]] <- round(model_holt_damped$mean,2)
  }
}
length(forecast_values_holtdamped)

In [9]:
# listas com as predicts
predict_list <-list(forecast_values_lm, forecast_values_tslm, forecast_values_ets, forecast_values_stls, forecast_values_snaive, forecast_values_arimaxreg, forecast_values_avg, forecast_values_holtdamped)
# Criar dataframes dos resultados
df_lm <- data.frame(lapply(forecast_values_lm, as.vector))
df_tslm <- data.frame(lapply(forecast_values_tslm, as.vector))
df_ets <- data.frame(lapply(forecast_values_ets, unname))
df_stls <- data.frame(lapply(forecast_values_stls, as.vector))
df_snaive <- data.frame(lapply(forecast_values_snaive, as.vector))
df_arimaxreg <- data.frame(lapply(forecast_values_arimaxreg, as.vector))
df_avg <- data.frame(lapply(forecast_values_avg, as.vector))
df_holtdamped <- data.frame(lapply(forecast_values_holtdamped, as.vector))
# Criar o dataframe previsões finais
# Criar um dataframe vazio 
final_results <- data.frame(dates = as.Date(character()),
                            stringsAsFactors = FALSE)
dates <- seq(as.Date("2019-09-30"), by = "7 days", length.out = 5)
final_results <- data.frame(date = dates)
# Acrescentar dataframes
final_results <- cbind(final_results, df_lm, df_tslm, df_ets, df_stls, df_snaive, df_arimaxreg, df_avg, df_holtdamped)
# Ordenar as colunas em ordem alfabética
final_results <- final_results[, order(names(final_results))]
head(final_results)

Unnamed: 0_level_0,date,S0002,S0003,S0005,S0007,S0010,S0012,S0014,S0015,S0016,⋯,S0109,S0120,S0122,S0126,S0131,S0132,S0136,S0141,S0142,S0143
Unnamed: 0_level_1,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2019-09-30,4706.12,1322.53,1171.85,917.79,2977.42,1916.68,1493.87,2457.84,901.93,⋯,1968.14,505.11,1773.47,3529.85,1991.38,3282.85,0,511.31,586.41,665.96
2,2019-10-07,4725.4,1328.57,1171.85,924.48,2985.07,1924.57,1352.55,2457.84,1141.5,⋯,1994.75,566.42,1781.59,3538.68,2489.09,3298.69,0,422.05,531.52,666.02
3,2019-10-14,4744.68,1334.62,1171.85,931.05,2992.73,1932.46,1560.39,2457.84,927.07,⋯,2021.37,535.95,1789.7,3547.51,2070.11,3314.53,0,332.02,497.22,666.07
4,2019-10-21,4763.96,1340.66,1171.85,937.48,3000.38,1940.35,1433.16,2457.84,744.68,⋯,2047.98,528.0,1797.82,3556.34,1703.53,3330.37,0,394.36,392.12,666.12
5,2019-10-28,4783.24,1346.71,1171.85,943.78,3008.04,1948.25,1218.01,2457.84,721.01,⋯,2074.59,642.16,1805.94,3565.18,1919.17,3346.22,0,358.94,380.14,666.17


In [10]:
# Exportar resultados
path <- file.path("C:", "Users", "Egomes", "Desktop", "PG_Analytics_Data_science_empresarial", "Isla_gaia", "14-ProjetoII", "projetoII", "dataStaging", "finalPredictionsModels.csv")
write.csv(final_results, file = path, row.names = FALSE)