Obtain Training Data

In [1]:
import pandas as pd
orderData = pd.read_csv('./data/synthetic_orders.csv')
productData = pd.read_csv('./data/synthetic_blinkit_order_items.csv')


Join both df with required cols

In [2]:
required_order_data = orderData[['order_id', 'order_date', 'delivery_time','area']]
merged_df = required_order_data.merge(productData, on='order_id', how='left')
merged_df

Unnamed: 0,order_id,order_date,delivery_time,area,product_id,unit_price,quantity
0,100000,2023-09-04,2023-09-04 12:36:00,East,293073,980.63,2
1,100001,2023-09-24,2023-09-24 19:10:00,East,926868,813.33,4
2,100001,2023-09-24,2023-09-24 19:10:00,East,417009,854.38,2
3,100002,2023-06-07,2023-06-07 10:12:00,East,424718,933.48,2
4,100002,2023-06-07,2023-06-07 10:12:00,East,417009,96.02,1
...,...,...,...,...,...,...,...
10050,104998,2023-07-20,2023-07-20 22:54:00,West,177733,197.88,2
10051,104998,2023-07-20,2023-07-20 22:54:00,West,463028,57.60,1
10052,104999,2023-01-21,2023-01-21 17:34:00,Central,131542,145.23,4
10053,104999,2023-01-21,2023-01-21 17:34:00,Central,602517,226.96,1


Feature Engineering

In [3]:
# Use later to estimate revenue
priceData = merged_df[['product_id', 'unit_price']]

# Convert delivery_time to datetime
merged_df['delivery_time'] = pd.to_datetime(
    merged_df['delivery_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)

# Extract hour (24-hour format)
merged_df['hour'] = merged_df['delivery_time'].dt.hour
print("Hour values:", sorted(merged_df['hour'].unique()))

# Convert order_date to datetime and get day of week
merged_df['order_date'] = pd.to_datetime(merged_df['order_date'])
merged_df['day_of_week'] = merged_df['order_date'].dt.day_name()

# Drop columns to avoid confusion
merged_df.drop(['order_id', 'unit_price', 'delivery_time', 'order_date'], axis=1, inplace=True)

Hour values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [4]:
merged_df.to_csv('mergedData.csv', index=False)

Preprocess

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Encode day_of_week
day_encoder = LabelEncoder()
merged_df['day_of_week'] = day_encoder.fit_transform(merged_df['day_of_week'])

# Features and targets
area_encoder = LabelEncoder()
merged_df['area_encoded'] = area_encoder.fit_transform(merged_df['area'])

X = merged_df[['hour', 'day_of_week', 'area_encoded']]

product_encoder = LabelEncoder()
merged_df['product_label'] = product_encoder.fit_transform(merged_df['product_id'])  # ✅ New
y1 = merged_df['product_label']
num_classes = len(product_encoder.classes_) 
y2 = merged_df['quantity']                                   # regression
y3 = merged_df['area_encoded']

merged_df['quantity_class'] = merged_df['quantity'] - 1  # 0-indexed for softmax

# Targets
y_quantity_class = merged_df['quantity_class']

# Train-test split
X_train_full, X_test_full, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(
    X, y1, y_quantity_class, y3, test_size=0.2
)

X_train = X_train_full[['hour', 'day_of_week']].values  # Final Keras inputs
X_test = X_test_full[['hour', 'day_of_week']].values


Models

In [6]:
import lightgbm as lgb

X_class = merged_df[['hour', 'day_of_week']]
y_class = merged_df['product_label']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_class, y_class, test_size=0.2)
lgbm_clf = lgb.LGBMClassifier()
lgbm_clf.fit(Xc_train, yc_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 8044, number of used features: 2
[LightGBM] [Info] Start training from score -5.437334
[LightGBM] [Info] Start training from score -5.526946
[LightGBM] [Info] Start training from score -5.381764
[LightGBM] [Info] Start training from score -5.355096
[LightGBM] [Info] Start training from score -5.186019
[LightGBM] [Info] Start training from score -5.409163
[LightGBM] [Info] Start training from score -5.437334
[LightGBM] [Info] Start training from score -5.355096
[LightGBM] [Info] Start training from score -5.329120
[LightGBM] [Info] Start training from score -5.526946
[LightGBM] [Info] Start training from score -5.164040
[LightGBM] [Info] Start training from score -5.409163
[LightGBM] [Info] Start training from score -5.466321
[LightGBM] [I

In [7]:
import tensorflow as tf
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

# Generate sample weights for 'quantity'
sample_weights_quantity = compute_sample_weight(
    class_weight='balanced',
    y=y_quantity_class.iloc[y1_train.index]
)

# Quantity-only model
quantity_model = tf.keras.Sequential([
    tf.keras.Input(shape=(2,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')  # quantity classes: 1–5
])
quantity_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Fit with class weights
quantity_model.fit(
    X_train,
    y_quantity_class.iloc[y1_train.index],
    sample_weight=sample_weights_quantity,
    epochs=20,
    batch_size=32
)


Epoch 1/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 604us/step - accuracy: 0.1984 - loss: 1.6450
Epoch 2/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 571us/step - accuracy: 0.1693 - loss: 1.6184
Epoch 3/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 568us/step - accuracy: 0.1733 - loss: 1.6305
Epoch 4/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 568us/step - accuracy: 0.1801 - loss: 1.6294
Epoch 5/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 568us/step - accuracy: 0.1560 - loss: 1.6327
Epoch 6/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 568us/step - accuracy: 0.2343 - loss: 1.6357
Epoch 7/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 571us/step - accuracy: 0.1847 - loss: 1.6212
Epoch 8/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 582us/step - accuracy: 0.2187 - loss: 1.6141
Epoch 9/20
[1m252/252[

<keras.src.callbacks.history.History at 0x32928b1a0>

In [8]:
# Product + Area model
inputs = tf.keras.Input(shape=(2,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)

# Outputs
product_output = tf.keras.layers.Dense(num_classes, activation='softmax', name='product')(x)
area_output = tf.keras.layers.Dense(len(area_encoder.classes_), activation='softmax', name='area')(x)

multi_model = tf.keras.Model(inputs=inputs, outputs=[product_output, area_output])
multi_model.compile(
    optimizer='adam',
    loss={
        'product': 'sparse_categorical_crossentropy',
        'area': 'sparse_categorical_crossentropy'
    },
    metrics={
        'product': 'accuracy',
        'area': 'accuracy'
    }
)

# Train (no sample weights)
multi_model.fit(
    X_train,
    {
        'product': y1_train,
        'area': y3_train
    },
    epochs=20,
    batch_size=32
)


Epoch 1/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 912us/step - area_accuracy: 0.1881 - area_loss: 1.6516 - loss: 7.0478 - product_accuracy: 0.0057 - product_loss: 5.3962  
Epoch 2/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 947us/step - area_accuracy: 0.2125 - area_loss: 1.6130 - loss: 6.9004 - product_accuracy: 0.0078 - product_loss: 5.2874
Epoch 3/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 926us/step - area_accuracy: 0.2125 - area_loss: 1.6111 - loss: 6.8892 - product_accuracy: 0.0074 - product_loss: 5.2780
Epoch 4/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step - area_accuracy: 0.2107 - area_loss: 1.6113 - loss: 6.8776 - product_accuracy: 0.0084 - product_loss: 5.2662
Epoch 5/20
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step - area_accuracy: 0.2256 - area_loss: 1.6085 - loss: 6.8669 - product_accuracy: 0.0097 - product_loss: 5.2584
Epoch 6/20
[

<keras.src.callbacks.history.History at 0x134a02060>

Prediction Function

In [9]:
def predict_top5_with_quantity_and_area(hour, day_name):
    encoded_day = day_encoder.transform([day_name])[0]
    input_data = np.array([[hour, encoded_day]])

    # Product Prediction (LightGBM)
    product_probs = lgbm_clf.predict_proba(input_data)[0]
    top5_indices = product_probs.argsort()[-5:][::-1]
    top5_product_ids = product_encoder.inverse_transform(top5_indices)
    top5_confidences = product_probs[top5_indices]

    # Quantity Prediction (Keras model)
    quantity_probs = quantity_model.predict(input_data, verbose=0)
    quantity = np.argmax(quantity_probs[0]) + 1

    # Area Prediction (from multi-output model)
    _, area_probs = multi_model.predict(input_data, verbose=0)
    area_idx = np.argmax(area_probs[0])
    predicted_area = area_encoder.inverse_transform([area_idx])[0]

    # Output
    return [
        {
            'product_id': pid,
            'confidence': round(float(conf), 4),
            'predicted_quantity': quantity,
            'predicted_area': predicted_area
        }
        for pid, conf in zip(top5_product_ids, top5_confidences)
    ]


Making Predictions

In [12]:
results = predict_top5_with_quantity_and_area(19, 'Monday')
for i, r in enumerate(results, 1):
    print(f"{i}. Product ID: {r['product_id']} | Confidence: {r['confidence']:.4f} | Quantity: {r['predicted_quantity']} | Area: {r['predicted_area']}")


1. Product ID: 127005 | Confidence: 0.0601 | Quantity: 2 | Area: South
2. Product ID: 458263 | Confidence: 0.0392 | Quantity: 2 | Area: South
3. Product ID: 263679 | Confidence: 0.0391 | Quantity: 2 | Area: South
4. Product ID: 115348 | Confidence: 0.0201 | Quantity: 2 | Area: South
5. Product ID: 712158 | Confidence: 0.0200 | Quantity: 2 | Area: South


