In [1]:
!pip install joblib --break-system-packages

Defaulting to user installation because normal site-packages is not writeable


# Training an deep learning model 

The notebook's purpose is to build a model using tensorflow and LSTM layers to predict the future sales based on the data in the datawarehouse

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Bidirectional, LSTM, Dense

import sys
sys.path.append('../../libraries')
from database import get_results

2025-04-01 06:00:50.494637: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-01 06:00:51.525016: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-01 06:00:52.100179: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743501652.703530    6297 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743501652.956446    6297 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-01 06:00:54.247474: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [3]:
def load_data(skus, site_codes):
    """Loads the data from cloud storage"""
    formatted_skus = ', '.join(f"'{sku}'" for sku in skus)

    # Format site_codes for SQL IN clause
    formatted_site_codes = ', '.join(f"'{site_code}'" for site_code in site_codes)
    query = f"""
SELECT 
    s.sku, 
    SUM(s.quantity) as quantity, 
    s.date, 
    s.site_code, 
    p.category
FROM 
    sales s
LEFT JOIN 
    products p on p.product_code = s.sku
WHERE 
    s.sku IN ({formatted_skus}) AND s.site_code IN ({formatted_site_codes})
GROUP BY 
    s.date, s.sku, p.category, s.site_code
ORDER BY 
    s.site_code, s.sku, s.date;
    """
    return get_results(query)

In [4]:
def get_skus(): 
    return get_results('''
SELECT 
    product_code AS sku
FROM 
    products
    ''').sku.values
def get_site_codes(): 
    return get_results('''
SELECT 
    DISTINCT site_code
FROM 
    sales
    ''').site_code.values

In [5]:
def build_bidirectional_lstm_model(num_numerical_features, num_unique_site_codes, num_unique_skus, num_unique_categories, num_unique_seasons):
    numerical_input = Input(shape=(None, num_numerical_features), name='numerical_input')
    site_code_input = Input(shape=(None,), dtype='int32', name='site_code_input')
    sku_input = Input(shape=(None,), dtype='int32', name='sku_input')
    category_input = Input(shape=(None,), dtype='int32', name='category_input')
    season_input = Input(shape=(None,), dtype='int32', name='season_input')

    site_code_embedding = Embedding(input_dim=num_unique_site_codes, output_dim=10)(site_code_input)
    sku_embedding = Embedding(input_dim=num_unique_skus, output_dim=20)(sku_input)
    category_embedding = Embedding(input_dim=num_unique_categories, output_dim=8)(category_input)
    season_embedding = Embedding(input_dim=num_unique_seasons, output_dim=5)(season_input)

    concatenated_inputs = Concatenate(axis=-1)([numerical_input, site_code_embedding, sku_embedding, category_embedding, season_embedding])
    lstm = Bidirectional(LSTM(64, return_sequences=True))(concatenated_inputs)
    lstm = Bidirectional(LSTM(32))(lstm)
    output = Dense(1)(lstm)

    model = Model(inputs=[numerical_input, site_code_input, sku_input, category_input, season_input], outputs=output)
    model.compile(optimizer='adam', loss='mse')
    return model

In [6]:
def create_label_encoders():
    skus, site_codes = get_skus(), get_site_codes()
    all_site_codes = site_codes
    all_skus = skus
    all_categories = get_results('''
SELECT 
    DISTINCT category 
FROM 
    products
    ''').category.values.astype(str)
    all_seasons = [i for i in range(0, 4)]

    site_code_encoder = LabelEncoder()
    site_code_encoder.fit(all_site_codes)
    sku_encoder = LabelEncoder()
    sku_encoder.fit(all_skus)
    category_encoder = LabelEncoder()
    category_encoder.fit(all_categories)
    season_encoder = LabelEncoder()
    season_encoder.fit(all_seasons)

    encoders = {
        "site_code": site_code_encoder,
        "sku": sku_encoder,
        "category": category_encoder,
        "season": season_encoder,
    }

    return encoders

In [7]:
def preprocess_data(df_batch, encoders):
    df_batch['date'] = pd.to_datetime(df_batch['date'])
    df_batch['day_of_week'] = df_batch['date'].dt.dayofweek
    df_batch['month'] = df_batch['date'].dt.month

    df_batch['site_code'] = encoders['site_code'].transform(df_batch['site_code'])
    df_batch['sku'] = encoders['sku'].transform(df_batch['sku'])
    df_batch['category'] = encoders['category'].transform(df_batch['category'])
    df_batch['season'] = df_batch['date'].apply(lambda x: (x.month - 1) // 3)
    df_batch['season'] = encoders['season'].transform(df_batch['season'])

    numerical_features = ['quantity', 'day_of_week', 'month']
    scaler = StandardScaler()
    df_batch[numerical_features] = scaler.fit_transform(df_batch[numerical_features])

    return df_batch, scaler

In [8]:
def create_batch(df_batch, sequence_length):
    numerical_data = df_batch[['quantity', 'day_of_week', 'month']].values
    site_code_data = df_batch['site_code'].values
    sku_data = df_batch['sku'].values
    category_data = df_batch['category'].values
    season_data = df_batch['season'].values
    target_data = df_batch['quantity'].values

    numerical_batches = []
    site_code_batches = []
    sku_batches = []
    category_batches = []
    season_batches = []
    target_batches = []

    for i in range(0, len(df_batch) - sequence_length):
        numerical_batches.append(numerical_data[i:i + sequence_length])
        site_code_batches.append(site_code_data[i:i + sequence_length])
        sku_batches.append(sku_data[i:i + sequence_length])
        category_batches.append(category_data[i:i + sequence_length])
        season_batches.append(season_data[i:i + sequence_length])
        target_batches.append(target_data[i + sequence_length])

    numerical_batches = pad_sequences(numerical_batches, dtype='float32')
    site_code_batches = pad_sequences(site_code_batches, dtype='int32')
    sku_batches = pad_sequences(sku_batches, dtype='int32')
    category_batches = pad_sequences(category_batches, dtype='int32')
    season_batches = pad_sequences(season_batches, dtype='int32')
    target_batches = np.array(target_batches)

    return numerical_batches, site_code_batches, sku_batches, category_batches, season_batches, target_batches


In [15]:
def train_batches(sequence_length=10, batch_size=32, model_save_path="model_stmp.h5", encoders_save_path="encoders.pkl"):
    encoders = create_label_encoders()

    num_numerical_features = 3
    num_unique_site_codes = len(encoders['site_code'].classes_)
    num_unique_skus = len(encoders['sku'].classes_)
    num_unique_categories = len(encoders['category'].classes_)
    num_unique_seasons = len(encoders['season'].classes_)

    model = build_bidirectional_lstm_model(num_numerical_features, num_unique_site_codes, num_unique_skus, num_unique_categories, num_unique_seasons)

    skus, site_codes = get_skus(), get_site_codes()

    for site_code in site_codes:
        batch_df = load_data(skus, [site_code])
        if batch_df.shape[0] == 0: continue
        batch_df, scaler = preprocess_data(batch_df, encoders)
        numerical_batches, site_code_batches, sku_batches, category_batches, season_batches, target_batches = create_batch(batch_df, sequence_length)
        model.fit(
            [numerical_batches, site_code_batches, sku_batches, category_batches, season_batches],
            target_batches,
            epochs=5,
            batch_size=batch_size,
            verbose=1
        )
        print(f'Model trained with SITE CODE: {site_code}') 

    model.save(model_save_path)
    encoders["scaler"] = scaler
    joblib.dump(encoders, encoders_save_path)

    return model, encoders

In [16]:
model, mappings = train_batches()

Epoch 1/5
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - loss: 0.9967
Epoch 2/5
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 1.0049
Epoch 3/5
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.9997
Epoch 4/5
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 1.0161
Epoch 5/5
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.9899
Model trained with SITE CODE: AUS000
Epoch 1/5
[1m958/958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 1.0148
Epoch 2/5
[1m958/958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.9972
Epoch 3/5
[1m958/958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 0.9826
Epoch 4/5
[1m958/958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 0.9832 
Epoch 5/5
[1m958/958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Model trained with SITE CODE: USA004
