<a href="https://colab.research.google.com/github/AndresMontesDeOca/Laboratorio3/blob/main/Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle Experiments

## Libraries

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.callbacks import Callback


import warnings
# warnings.filterwarnings('ignore', category=ValueWarning)
# NOTE(review): the call below silences every warning category for the rest of
# the session, including deprecation notices from pandas/TensorFlow. Consider
# narrowing it to a specific category once the noisy source is identified.
warnings.filterwarnings('ignore')

# Uncomment to display more rows
# pd.set_option('display.max_rows', None)

# Uncomment to also display more columns
# pd.set_option('display.max_columns', None)


# Suppress scientific notation: floats are rendered with two decimal places
pd.set_option("display.float_format", lambda x:"%.2f" %x)


## Carga Datos

In [3]:
# Code to read csv file into Colaboratory:
# !pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


def _download_tsv(drive_client, file_id, filename, date_column=None):
    """Download a Google Drive file and load it as a tab-separated DataFrame.

    Args:
        drive_client: Authenticated ``GoogleDrive`` client.
        file_id (str): Drive id of the file to fetch.
        filename (str): Local path the content is written to before parsing.
        date_column (str, optional): Column holding ``YYYYMM`` periods; when
            given it is converted to datetime.

    Returns:
        pd.DataFrame: Parsed file contents.
    """
    # Named ``file_id`` (not ``id``) so the builtin ``id`` is not shadowed.
    drive_client.CreateFile({'id': file_id}).GetContentFile(filename)
    frame = pd.read_csv(filename, sep="\t")
    if date_column is not None:
        frame[date_column] = pd.to_datetime(frame[date_column], format='%Y%m')
    return frame


################################# Datasets ###################################
# Sales
data_ventas = _download_tsv(drive, "158aOjqxaNO8l97yA6VWJkek_15YVLMhs", 'sell-in.txt', date_column='periodo')
# Working copy, so the raw sales frame stays untouched.
data = data_ventas.copy()

# Products
data_productos = _download_tsv(drive, "15JS_k86LS0sgJXma7BOVXWlyNcMwxdhE", 'tb_productos.txt')

# Stocks
data_stocks = _download_tsv(drive, "15EV-8f_U7onpA1AcTxxXeD-z8yVR4fQu", 'tb_stocks.txt', date_column='periodo')

# Products to predict
data_productos_a_predecir = _download_tsv(drive, "15LjADctFVwjzQFJvfJGFTEdgZx9xCoId", 'productos_a_predecir.txt')





## Filter Data

In [34]:
def filter_data(data_all, data_filter):
    """Keep only the rows of ``data_all`` whose ``product_id`` is listed in ``data_filter``.

    Args:
        data_all (pd.DataFrame): Frame with a ``product_id`` column.
        data_filter (pd.DataFrame): Frame whose ``product_id`` column lists the ids to keep.

    Returns:
        pd.DataFrame: The matching rows of ``data_all``, with their original index labels.
    """
    wanted_ids = data_filter['product_id']
    keep_mask = data_all['product_id'].isin(wanted_ids)
    return data_all[keep_mask]

## Group Data

In [33]:
# The pivoted values are always the tons sold ('tn').
def group_data(data, column):
    """Aggregate tons sold per period and pivot them into one column per group.

    Args:
        data (pd.DataFrame): Frame with at least ``column``, ``'periodo'`` and ``'tn'``.
        column (str): Name of the column to group by (e.g. ``'product_id'``).

    Returns:
        pd.DataFrame: Indexed by ``'periodo'``, one column per distinct value of
        ``column`` (labels converted to ``str``), holding the summed ``'tn'``.
        Period/group combinations with no rows are NaN.
    """
    # Sum only 'tn': it is the only column that gets pivoted, and summing every
    # column fails on columns that cannot be added (e.g. datetimes).
    grouped_data = data.groupby([column, 'periodo'])['tn'].sum().reset_index()

    # Rows are the periods, columns are the distinct values of `column`.
    pivot_data = grouped_data.pivot(index='periodo', columns=column, values='tn')

    # Column labels are exposed as strings.
    pivot_data.columns = pivot_data.columns.astype(str)

    # Drop the name of the columns axis so the frame has plain, unnamed columns.
    pivot_data.columns.name = None

    return pivot_data

## Fill Nulls

In [40]:
# Experimental: the original author was not sure this is the right strategy, tune as needed.
def fill_nulls(data):
    """Fill missing values by back-filling first and then writing zeros.

    Back-fill copies the next available later value into each gap (older
    periods take the value of more recent ones). Whatever is still missing
    afterwards, i.e. trailing gaps with no later value, becomes 0; the
    original author attributes those to products that stopped selling or
    were discontinued.

    Args:
        data (pd.DataFrame): Frame that may contain NaNs.

    Returns:
        pd.DataFrame: New frame without NaNs; the input is not modified.
    """
    return data.bfill().fillna(0)

In [5]:
# # Filtrar el DataFrame 'data' para que solo contenga los 'product_id' presentes en 'data_productos_a_predecir'
# data_filtered = data[data['product_id'].isin(data_productos_a_predecir['product_id'])]


# # Agrupa los datos por 'product_id' y 'periodo', y calcula la suma de 'tn'
# grouped_data = data_filtered.groupby(['product_id', 'periodo']).sum().reset_index()

# # Crea un DataFrame pivoteado donde las filas son las fechas y las columnas son los product_id
# pivot_data = grouped_data.pivot(index='periodo', columns='product_id', values='tn')

# # Rellena los NaN
# pivot_data = pivot_data.fillna(0)

# # Asegúrate de que los nombres de las columnas sean strings
# pivot_data.columns = pivot_data.columns.astype(str)

# # Restablece el índice para asegurarse de que 'product_id' no sea un índice compuesto
# pivot_data.columns.name = None

# data_2019 = pivot_data.loc['2019']

## Normalize Data

In [6]:
##########################################################################################
def normalize_data(df, normalization="MinMax"):
    """
    Normalize every time series (column) independently with MinMax or Z-score.

    Args:
        df (pd.DataFrame): One time series per column.
        normalization (str): "MinMax" or "ZScore" (case-insensitive, so the
            "Zscore" spelling is accepted too). Default is "MinMax".

    Returns:
        normalized_df (pd.DataFrame): Normalized series, same index and columns as ``df``.
        normalization_params (dict): Maps each column label to the values needed
            to undo the normalization:
            - "MinMax": ``{"min": ..., "max": ...}``
            - "ZScore": ``{"mean": ..., "std": ...}``

    Raises:
        ValueError: If ``normalization`` is not one of the supported methods.

    Notes:
        A constant column has a zero range / zero std. It is mapped to 0.0
        instead of NaN; the stored parameters keep the true values, so
        denormalizing gives back the original constant.
    """
    # Validate up front so a bad method is reported even for an empty frame.
    method = str(normalization).lower()
    if method not in ("minmax", "zscore"):
        raise ValueError("Invalid normalization method. Choose 'MinMax' or 'ZScore'.")

    if method == "minmax":
        low = df.min()
        high = df.max()
        offset, scale = low, high - low
        normalization_params = {
            column: {"min": low[column], "max": high[column]} for column in df.columns
        }
    else:
        mean = df.mean()
        std = df.std()
        offset, scale = mean, std
        normalization_params = {
            column: {"mean": mean[column], "std": std[column]} for column in df.columns
        }

    # Replace a zero scale by 1 to avoid 0/0 -> NaN on constant columns.
    safe_scale = scale.where(scale != 0, 1.0)

    # One vectorized operation instead of inserting columns one at a time.
    normalized_df = (df - offset) / safe_scale

    return normalized_df, normalization_params
##########################################################################################
def denormalize_series(normalized_series, normalization_params, normalization="MinMax"):
    """
    Undo the normalization of a series whose index labels identify the columns
    that were normalized.

    Args:
        normalized_series (pd.Series): Normalized values, one per index label.
        normalization_params (dict): Per-label parameters:
            - "MinMax": ``{"min": ..., "max": ...}``
            - "ZScore": ``{"mean": ..., "std": ...}``
            Each label is looked up as-is first and then as ``str(label)``.
        normalization (str): "MinMax" or "ZScore" (case-insensitive, so the
            "Zscore" spelling is accepted too). Default is "MinMax".

    Returns:
        pd.Series: float64 series with the denormalized values, same index as the input.

    Raises:
        ValueError: If ``normalization`` is not one of the supported methods.
        KeyError: If an index label has no entry in ``normalization_params``.
    """
    method = str(normalization).lower()
    if method not in ("minmax", "zscore"):
        raise ValueError("Invalid normalization method. Choose 'MinMax' or 'ZScore'.")

    restored = []
    for index, value in normalized_series.items():
        # Parameters may be keyed by the raw label or by its string form.
        if index in normalization_params:
            params = normalization_params[index]
        elif str(index) in normalization_params:
            params = normalization_params[str(index)]
        else:
            raise KeyError(f"Index {index} not found in normalization parameters.")

        if method == "minmax":
            min_value = params["min"]
            max_value = params["max"]
            restored.append(value * (max_value - min_value) + min_value)
        else:
            restored.append(value * params["std"] + params["mean"])

    # Explicit dtype: an index-only Series has no reliable default dtype across
    # pandas versions and can end up as object.
    return pd.Series(restored, index=normalized_series.index, dtype=float)
##########################################################################################

## Split Data

In [7]:
def split_data_all(data, train_start='2017-01', train_end='2018-12',
                   valid_start='2019-01', valid_end='2019-12'):
    """Split a frame into train and validation ranges by label slicing its index.

    Args:
        data (pd.DataFrame): Frame sliced with ``.loc[start:end]``; the default
            bounds are year-month strings, so a sorted datetime index is
            presumably expected.
        train_start, train_end (str): Inclusive bounds of the training range.
        valid_start, valid_end (str): Inclusive bounds of the validation range.

    Returns:
        tuple: ``(data_train, data_valid)``.
    """
    data_train = data.loc[train_start:train_end]
    data_valid = data.loc[valid_start:valid_end]
    return data_train, data_valid

## Window Data

In [8]:
def window_dataset(sequence, data_split, window_size, batch_size, n_future, shuffle_buffer=1000, seed=None):
    """Build a batched ``tf.data.Dataset`` of (past window, future window) pairs.

    Args:
      sequence (array-like): Time-series values; each element holds the feature values of one step.
      data_split (str): ``'train'`` shuffles the windows; any other value caches them instead.
      window_size (int): Number of time steps in each input window.
      batch_size (int): Number of windows per batch.
      n_future (int): Number of steps that follow the input window and form the label.
      shuffle_buffer (int): Buffer size handed to ``shuffle``.
      seed (int, optional): Seed handed to ``shuffle``.

    Returns:
      tf.data.Dataset: Batches of ``(features, labels)`` windows.
    """
    # Every window carries the inputs followed by the targets.
    span = window_size + n_future

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        # Sliding windows of exactly `span` steps; incomplete tails are dropped.
        .window(span, shift=1, drop_remainder=True)
        # Each window is itself a dataset: collapse it into a single tensor.
        .flat_map(lambda chunk: chunk.batch(span))
        # First `window_size` steps are the features, the rest are the labels.
        .map(lambda chunk: (chunk[:window_size], chunk[window_size:]))
    )

    if data_split == 'train':
        windows = windows.shuffle(shuffle_buffer, seed=seed)
    else:
        windows = windows.cache()

    return windows.batch(batch_size).prefetch(tf.data.AUTOTUNE)


## Callbacks

In [9]:
#############################################################################

class MAEThresholdCallback(Callback):
    """Stop training as soon as the validation MAE is at or below a threshold.

    Args:
        threshold (float): Value of the ``val_mae`` log entry at or below which
            training is stopped. Default is 0.15.
    """

    def __init__(self, threshold=0.15):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check ``val_mae`` after each epoch and request a stop when it is low enough."""
        # `logs` defaults to None; normalise it so `.get` cannot fail.
        logs = logs or {}
        val_mae = logs.get('val_mae')
        if val_mae is not None and val_mae <= self.threshold:
            print(f'\nEpoch {epoch+1}: Validation MAE has reached {val_mae:.4f}, stopping training.')
            self.model.stop_training = True

def MyCallbacks(model_name, patience, with_checkpoint=False, mae_threshold=None):
    """Build the list of Keras callbacks used for training.

    Args:
        model_name (str): Prefix of the checkpoint file names (only used when
            ``with_checkpoint`` is True).
        patience (int): ``patience`` of the early-stopping callback on ``val_loss``.
        with_checkpoint (bool): Also add a ``ModelCheckpoint`` monitoring
            ``val_loss``. Default False, which matches the previous return value.
        mae_threshold (float, optional): When given, also add a
            ``MAEThresholdCallback`` with this threshold. Default None (not added).

    Returns:
        list: Callbacks to pass to ``model.fit``. With the defaults this is
        only the early-stopping callback.
    """
    callbacks = [
        tf.keras.callbacks.EarlyStopping('val_loss', patience=patience, restore_best_weights=True)
    ]

    # The optional callbacks are only built on request; previously they were
    # always constructed and then discarded.
    if with_checkpoint:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(
                filepath=f'ckpts/{model_name}-' + '{epoch:02d}-{val_loss:.4f}.h5',
                monitor='val_loss',
            )
        )
    if mae_threshold is not None:
        callbacks.append(MAEThresholdCallback(threshold=mae_threshold))

    return callbacks

#############################################################################

## Model Design

In [10]:
#############################################################################
def compile_model(new_model, loss):
    """Compile a model with the Adam optimizer and MAE metric, then show its summary.

    Args:
        new_model: Object exposing ``compile`` and ``summary`` (a Keras model).
        loss: Loss passed to ``compile``.

    Returns:
        The same ``new_model`` instance, compiled.
    """
    new_model.compile(optimizer='adam', loss=loss, metrics=['mae'])
    # summary() prints on its own; wrapping it in print() adds a stray "None".
    new_model.summary()
    return new_model
#############################################################################
def MyModel(loss, window_size, n_future, n_features,
            lstm_units=(32, 16), dropout=0.4, output_activation='relu'):
    """Build and compile a stacked bidirectional-LSTM multi-step forecaster.

    The network maps an input of shape ``(window_size, n_features)`` to an
    output of shape ``(n_future, n_features)``.

    Args:
        loss: Loss passed to ``compile_model``.
        window_size (int): Number of past time steps in each input window.
        n_future (int): Number of future steps predicted.
        n_features (int): Number of series (features) per time step.
        lstm_units (tuple): Units of the first and second LSTM layers. Default (32, 16).
        dropout (float): Dropout rate applied before the output layer. Default 0.4.
        output_activation: Activation of the output Dense layer. Default
            ``'relu'``. ReLU cannot produce negative values, so use ``'linear'``
            when targets can be negative (e.g. Z-score-normalized data).

    Returns:
        The compiled model returned by ``compile_model``.
    """
    first_units, second_units = lstm_units
    new_model = tf.keras.Sequential([
        tf.keras.layers.InputLayer((window_size, n_features)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(first_units, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(second_units, return_sequences=False)),
        # tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.Dropout(dropout),
        # One unit per (future step, feature) pair, reshaped below into a 2-D forecast.
        tf.keras.layers.Dense(n_features * n_future, activation=output_activation),
        tf.keras.layers.Reshape((n_future, n_features)),
        ])
    return compile_model(new_model, loss)

# Pipeline

## Data Preprocessing

In [16]:
# data
# data_productos
# data_stocks
# data_productos_a_predecir
# Pipeline hyperparameters, consumed by the (currently commented-out) pipeline and training cells.
window_size = 6  # number of past time steps in each input window
n_future = 2  # number of future steps to predict
batch_size = 4  # windows per batch
normalization = 'MinMax'  # method name passed to normalize_data / denormalize_series

# #########################################################################
# Old Pipeline
# #########################################################################
# data_all = group_data(data, data_productos_a_predecir)
# data_all_norm, data_all_norm_params = normalize_data(data_all, normalization=normalization)
# data_all_norm['20001'].describe()
# data_train, data_valid = split_data_all(data_all_norm)
# print(data_train.shape)
# print(data_valid.shape)
# data_train = data_all_norm
# data_train_windowed = window_dataset(data_train, data_split='train', window_size=window_size, batch_size=batch_size, n_future=n_future)
# data_valid_windowed = window_dataset(data_valid, data_split='valid', window_size=window_size, batch_size=batch_size, n_future=n_future)



## Model Train

In [53]:
# # # # #########################################################################
# # # # # Neural Network Model
# # # #########################################################################
# model_name = 'TimeSeries'
# loss = 'mse'
# patience = 30
# epochs = 500
# n_features = data_all.shape[1]

# callbacks = MyCallbacks(model_name, patience)
# model = MyModel(loss, window_size, n_future, n_features)

# history = model.fit(
#     data_train_windowed,
#     # validation_data = data_valid_windowed,
#     # validation_split=0.2,
#     # callbacks = callbacks,
#     epochs=epochs)

# # plot_history(history, 4)
# # save_model(model, model_name, history, data_test_wrangled)
# # show_predictions(model, data_test_wrangled, data_test[n_past:])

In [54]:
# # Supongamos que `data_all` contiene tus datos históricos, incluyendo 2019.
# # data_all debe tener la forma (n_samples, n_features)

# # Convertir el DataFrame a un array de NumPy
# data_all_array = data_all.values

# # Extraer la última ventana de datos de 2019 para predecir enero de 2020
# column_names = data_all.columns  # Obtener los nombres de las columnas

# # Extraer los últimos `window_size` meses de 2019
# input_data = data_all_array[-window_size:].reshape((1, window_size, n_features))

# # Predecir enero de 2020
# pred_january = model.predict(input_data)

# # Asegurarse de que la predicción tenga la forma correcta
# pred_january = pred_january.reshape((1, n_future, n_features))

# # Crear un DataFrame para la predicción de enero de 2020
# pred_january_df = pd.DataFrame(pred_january[0], columns=column_names)
# pred_january_df.index = pd.date_range(start='2020-01-01', periods=n_future, freq='MS')

# # Actualizar la ventana de entrada para predecir febrero de 2020
# input_data = np.append(input_data[:, 1:, :], pred_january[:, 0, :].reshape(1, 1, n_features), axis=1)

# # Predecir febrero de 2020
# pred_february = model.predict(input_data)

# # Asegurarse de que la predicción tenga la forma correcta
# pred_february = pred_february.reshape((1, n_future, n_features))

# # Crear un DataFrame para la predicción de febrero de 2020
# pred_february_df = pd.DataFrame(pred_february[0], columns=column_names)
# pred_february_df.index = pd.date_range(start='2020-02-01', periods=n_future, freq='MS')

# # Predije dos veces, una volviendo a entrenar con los datos predichos de Enero 2020, y la otra no.
# pred_1 = pred_january_df.loc['2020-02-01']

# pred_1_denorm = denormalize_series(pred_1, data_all_norm_params, normalization=normalization)
# data_pred1_denorm = pred_1_denorm.reset_index()
# data_pred1_denorm.columns = ['product_id', 'tn']
# data_pred1_denorm.to_csv('pred_1.csv', index=False)

# EDA

In [75]:
# Apparently, when the instructor de-identified the customers, sequential IDs were
# assigned following the list sorted by total sales (tn).
tn_by_customer = group_data(data, 'customer_id').sum()
print('Listado de Clientes, ordenados por la sumatoria de ventas en tn:\n', tn_by_customer, '\n')

# Same for the de-identified products, except the numbering starts from 20000.
tn_by_product = group_data(data, 'product_id').sum()
print('Listado de Productos, ordenados por la sumatoria de ventas en tn:\n', tn_by_product)

Listado de Clientes, ordenados por la sumatoria de ventas en tn:
 10001   109203.60
10002    77333.17
10003    71375.92
10004    63065.94
10005    51467.05
           ...   
10633        0.14
10634        0.10
10635        0.10
10636        0.04
10637        0.00
Length: 597, dtype: float64 

Listado de Productos, ordenados por la sumatoria de ventas en tn:
 20001   50340.40
20002   36337.25
20003   32004.15
20004   24178.15
20005   23191.22
          ...   
21295       0.01
21296       0.01
21297       0.01
21298       0.01
21299       0.01
Length: 1233, dtype: float64


In [58]:
# Display the sales DataFrame (the notebook renders only its first and last rows).
data

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,2017-01-01,10234,20524,0,2,0.05,0.05
1,2017-01-01,10032,20524,0,1,0.14,0.14
2,2017-01-01,10217,20524,0,1,0.03,0.03
3,2017-01-01,10125,20524,0,1,0.02,0.02
4,2017-01-01,10012,20524,0,11,1.54,1.54
...,...,...,...,...,...,...,...
2945813,2019-12-01,10105,20853,0,1,0.02,0.02
2945814,2019-12-01,10092,20853,0,1,0.01,0.01
2945815,2019-12-01,10006,20853,0,7,0.03,0.03
2945816,2019-12-01,10018,20853,0,4,0.02,0.02
