In [127]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import joblib

In [104]:
# --- 1. GLOBAL ENCODERS & SCALERS ---
le_marca = LabelEncoder()
le_plant = LabelEncoder()
feature_scaler = MinMaxScaler()

In [105]:
df = pd.read_csv("consolidated_ordenes.csv")
df = df.drop(columns=["description", "main_plant", "changed_on"])


In [106]:
# 1. GROUPING & PREPROCESSING
# Assuming 'df' is your original DataFrame
df = df.sort_values(by=['equipment', 'created_on'], ascending=[True, True])

In [107]:
# Grouping into a list of DataFrames (The "Pandas Way")
list_of_dfs = [group for name, group in df.groupby('equipment')]

In [108]:
def prepare_data_for_rnn(list_of_dfs, window_size=10):
    all_x, all_y = [], []
    
    # We use separate scalers: one for features, one for the target (days_diff)
    # This makes it easier to "un-scale" your prediction later
    target_scaler = MinMaxScaler()
    
    # Flattening for encoding purposes
    full_df = pd.concat(list_of_dfs)
    full_df['marca_enc'] = le_marca.fit_transform(full_df['marca'].astype(str))
    full_df['plant_enc'] = le_plant.fit_transform(full_df['plant'].astype(str))
    
    # Re-split after encoding
    processed_dfs = [group for name, group in full_df.groupby('equipment')]
    
    for sub_df in processed_dfs:
        sub_df = sub_df.copy()
        sub_df['created_on'] = pd.to_datetime(sub_df['created_on'])
        
        # Calculate days between orders (The "Sequence" logic)
        sub_df['days_diff'] = sub_df['created_on'].diff().dt.days.fillna(0)
        
        # Select Features: [days_diff, modelo, marca_enc, plant_enc]
        feature_cols = ['days_diff', 'modelo', 'marca_enc', 'plant_enc']
        
        if len(sub_df) <= window_size:
            continue # Skip equipment with too little history
            
        # Scaling
        scaled_features = feature_scaler.fit_transform(sub_df[feature_cols])
        
        # Sliding Window Generation
        for i in range(len(scaled_features) - window_size):
            # Input: The window of rows
            all_x.append(scaled_features[i : i + window_size])
            # Target: The 'days_diff' of the VERY NEXT row
            all_y.append(scaled_features[i + window_size, 0]) 
            
    return np.array(all_x), np.array(all_y), feature_scaler

In [109]:
# 2. GENERATE SEQUENCES
WINDOW_SIZE = 40 # Looking at 12 orders to predict the 13th
X, y, scaler = prepare_data_for_rnn(list_of_dfs, window_size=WINDOW_SIZE)

In [110]:
# Split data into Training (80%) and Testing (20%) sets
# shuffle=True is okay here because our "windows" already contain the temporal order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
# --- 3. BUILD THE RNN (LSTM) ---
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
    layers.LSTM(64, activation='tanh', return_sequences=True),
    layers.LSTM(64, activation='tanh', return_sequences=True),
    layers.LSTM(64, activation='tanh', return_sequences=False),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1)
])

In [112]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [113]:
# --- 4. TRAIN ---
print(f"Training on {X_train.shape[0]} samples, validating on {X_test.shape[0]} samples...")

history = model.fit(
    X_train, y_train, 
    epochs=25, 
    batch_size=16, 
    validation_data=(X_test, y_test), # Use the test set for validation during training
    verbose=1
)

Training on 19391 samples, validating on 4848 samples...
Epoch 1/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - loss: 0.0204 - mae: 0.0971 - val_loss: 0.0200 - val_mae: 0.0982
Epoch 2/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - loss: 0.0200 - mae: 0.0965 - val_loss: 0.0197 - val_mae: 0.0990
Epoch 3/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - loss: 0.0198 - mae: 0.0960 - val_loss: 0.0194 - val_mae: 0.0909
Epoch 4/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - loss: 0.0197 - mae: 0.0956 - val_loss: 0.0192 - val_mae: 0.0927
Epoch 5/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - loss: 0.0197 - mae: 0.0956 - val_loss: 0.0192 - val_mae: 0.0940
Epoch 6/25
[1m1212/1212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - loss: 0.0196 - mae: 0.0956 - val_loss: 0.0193 - val_mae: 0.0958
Epoch 7

In [114]:
# --- NEW: EVALUATION ---
print("\n--- Model Evaluation ---")
loss, mae = model.evaluate(X_test, y_test, verbose=0)


--- Model Evaluation ---


In [115]:
# To understand the MAE in real days, we must reverse the scaling
# Since days_diff was the first column (index 0) in our scaler:
days_range = scaler.data_range_[0]
real_mae = mae * days_range

print(f"Test Loss (MSE): {loss:.4f}")
print(f"Test Mean Absolute Error (Scaled): {mae:.4f}")
print(f"Average Prediction Error: {real_mae:.2f} days")

Test Loss (MSE): 0.0193
Test Mean Absolute Error (Scaled): 0.0917
Average Prediction Error: 2.57 days


In [116]:
def predict_next_order(equipment_df, model, scaler, window_size, le_marca, le_plant):
    # 1. Create a copy to avoid modifying your original data
    sample = equipment_df.copy()
    
    # 2. Feature Engineering: Dates to Deltas
    sample['created_on'] = pd.to_datetime(sample['created_on'])
    sample = sample.sort_values('created_on')
    sample['days_diff'] = sample['created_on'].diff().dt.days.fillna(0)
    
    # 3. Categorical Encoding
    # We use 'transform' (not fit_transform) to use the training labels
    sample['marca_enc'] = le_marca.transform(sample['marca'].astype(str))
    sample['plant_enc'] = le_plant.transform(sample['plant'].astype(str))
    
    # 4. Select only the columns the model was trained on
    feature_cols = ['days_diff', 'modelo', 'marca_enc', 'plant_enc']
    recent_history = sample[feature_cols].tail(window_size)
    
    if len(recent_history) < window_size:
        return f"Error: Equipment only has {len(recent_history)} orders. Need {window_size}."

    # 5. Scale and Reshape
    scaled_input = scaler.transform(recent_history)
    scaled_input = scaled_input.reshape(1, window_size, len(feature_cols))
    
    # 6. Predict and Inverse Scale
    prediction_scaled = model.predict(scaled_input, verbose=0)
    
    # To un-scale, we create a dummy array matching the scaler's width
    dummy = np.zeros((1, len(feature_cols)))
    dummy[0, 0] = prediction_scaled[0][0]
    prediction_final = scaler.inverse_transform(dummy)[0, 0]
    
    return max(0, prediction_final)

In [117]:
# 1. Calculate days_diff for every row within its own equipment group
# This ensures we don't calculate the gap between "Equipment A" and "Equipment B"
df['created_on'] = pd.to_datetime(df['created_on'])
df = df.sort_values(['equipment', 'created_on'])

# Group by equipment and calculate the difference between orders
df['days_diff'] = df.groupby('equipment')['created_on'].diff().dt.days

# 2. Get the mean for each equipment
avg_per_equipment = df.groupby('equipment')['days_diff'].mean()

# 3. Get the final global average
final_avg = avg_per_equipment.mean()

print(f"Global average order frequency: {final_avg:.2f} days")

Global average order frequency: 12.95 days


In [126]:
days_to_next = predict_next_order(list_of_dfs[21], model, scaler, WINDOW_SIZE, le_marca, le_plant)
print(f"Predicted next order in: {days_to_next:.1f} days")

Predicted next order in: 3.3 days


In [120]:
model.export("basic_model_directory")

INFO:tensorflow:Assets written to: basic_model_directory/assets


INFO:tensorflow:Assets written to: basic_model_directory/assets


Saved artifact at 'basic_model_directory'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 40, 4), dtype=tf.float32, name='keras_tensor_40')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  140399196633808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196633232: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196632464: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196632272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196632848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140398728809936: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196632080: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196631120: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196630544: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196632656: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140399196633

In [128]:
# Save the scalers and encoders
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(le_marca, 'le_marca.pkl')
joblib.dump(le_plant, 'le_plant.pkl')

['le_plant.pkl']