In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.stats.mstats import mquantiles
from statsmodels.tsa.statespace.sarimax import SARIMAX


In [2]:
# Set a seed for reproducibility
np.random.seed(42)

# Number of rows in the random DataFrame
num_rows = 10000

# Define the date range: start at 2009-01-01, end at 2011-12-31
start_date = datetime(2009, 1, 1)
end_date = datetime(2011, 12, 31)
delta_days = (end_date - start_date).days

# Introduce a seasonal pattern in order volume
season_weights = {
    1: 1.5, 2: 1.5, 12: 1.5,  # Winter months have a higher weight
    3: 1.2, 4: 1.2, 5: 1.2,   # Spring medium
    6: 0.8, 7: 0.8, 8: 0.8,   # Summer lower
    9: 1.3, 10:1.3, 11:1.3    # Autumn medium-high
}

all_days = pd.date_range(start_date, end_date, freq='D')
day_months = all_days.month

weights = np.array([season_weights[m] for m in day_months])
weights = weights / weights.sum()  # normalize to create probabilities

order_dates = np.random.choice(all_days, size=num_rows, p=weights)

# Use pd.Timedelta instead of datetime.timedelta
requested_delivery_dates = [
    od + pd.Timedelta(days=np.random.randint(1,30)) for od in order_dates
]

# Country Codes (example)
country_codes = ['US', 'DE', 'FR', 'UK', 'IT', 'ES', 'CN', 'JP', 'IN', 'BR']
customer_country_codes = np.random.choice(country_codes, size=num_rows)

# Generate fewer unique product codes
def random_product_code(length=6):
    return ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), length))

unique_product_codes = [random_product_code() for _ in range(5)]

order_dates = pd.to_datetime(order_dates)

# Introduce seasonal bias in product selection
def product_for_date(d):
    m = d.month
    if m in [12,1,2]:    # Winter: favor product_codes[0] & product_codes[1]
        probs = [0.4,0.4,0.05,0.05,0.1]
    elif m in [6,7,8]:   # Summer: favor product_codes[2] & product_codes[3]
        probs = [0.05,0.05,0.4,0.4,0.1]
    else:                # Other months: more even distribution
        probs = [0.2,0.2,0.2,0.2,0.2]
    return np.random.choice(unique_product_codes, p=probs)

product_codes = [product_for_date(d) for d in order_dates]


# Descriptions
descriptions_list = ["Widget A", "Widget B", "Gadget C", "Gizmo D", "Tool E", "Part F"]
descriptions = np.random.choice(descriptions_list, size=num_rows)

# Order type always "VO"
order_types = ["VO"] * num_rows

# Customer Order Code
unique_order_count = 2000
unique_order_codes = [f"ORD{str(i).zfill(5)}" for i in range(1, unique_order_count+1)]
customer_order_codes = np.random.choice(unique_order_codes, size=num_rows)

# Trend in Value and Items
days_from_start = (pd.to_datetime(order_dates) - start_date).days
base_values = np.random.uniform(10, 1000, size=num_rows)
value_trend = 1 + (days_from_start / days_from_start.max()) * 0.2
values = np.round(base_values * value_trend, 2)

items = []
for od in order_dates:
    if od.month in [12,1,2]:    # winter: larger orders
        items.append(np.random.randint(50, 100))
    elif od.month in [6,7,8]:   # summer: smaller orders
        items.append(np.random.randint(1, 50))
    else:
        items.append(np.random.randint(20, 80))
items = np.array(items)

# Currency (example)
currencies = ["USD", "EUR", "GBP", "JPY", "CNY"]
currency_codes = np.random.choice(currencies, size=num_rows)

# Route
routes = [f"{cc}-{np.random.randint(1,10)}" for cc in customer_country_codes]

# Create the DataFrame
data = pd.DataFrame({
    "Order Date": [d.strftime("%d.%m.%Y") for d in order_dates],
    "Requested Delivery Date": [d.strftime("%d.%m.%Y") for d in requested_delivery_dates],
    "Customer Country Code": customer_country_codes,
    "Product Code": product_codes,
    "Description": descriptions,
    "Order type": order_types,
    "Customer Order Code": customer_order_codes,
    "Value": values,
    "Currency": currency_codes,
    "Items": items,
    "Route": routes
})

print(data.head())
print("Number of unique product codes:", data["Product Code"].nunique())

   Order Date Requested Delivery Date Customer Country Code Product Code  \
0  06.02.2010              01.03.2010                    FR       85L8C7   
1  17.11.2011              21.11.2011                    US       H5MOF6   
2  27.02.2011              17.03.2011                    UK       J94VBW   
3  29.10.2010              03.11.2010                    IT       J94VBW   
4  08.06.2009              24.06.2009                    DE       XQAHOY   

  Description Order type Customer Order Code    Value Currency  Items Route  
0     Gizmo D         VO            ORD00693   441.44      GBP     78  FR-7  
1      Part F         VO            ORD01883   743.82      EUR     22  US-5  
2    Widget A         VO            ORD00250  1036.22      GBP     58  UK-2  
3    Widget A         VO            ORD01261    34.01      JPY     54  IT-8  
4     Gizmo D         VO            ORD00904   682.16      EUR      6  DE-3  
Number of unique product codes: 5


1. Distinct Monthly Orders (Time Series Forecasting)

In [3]:
# Import libraries
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Group data to calculate monthly distinct orders
data['Order Date'] = pd.to_datetime(data['Order Date'], format='%d.%m.%Y')
data['Month'] = data['Order Date'].dt.to_period('M')
monthly_orders = data.groupby('Month')['Customer Order Code'].nunique().reset_index()

# Prepare data for SARIMA
monthly_orders.set_index('Month', inplace=True)
sarima_model = SARIMAX(monthly_orders, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
sarima_fit = sarima_model.fit(disp=False)
forecast = sarima_fit.get_forecast(steps=5).predicted_mean


  warn('Too few observations to estimate starting parameters%s.'


2. Classification Model (Choice Model)

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Encode categorical features
data['Season'] = data['Order Date'].dt.month % 12 // 3 + 1  # Map months to seasons
encoder = OneHotEncoder(sparse_output=False)
season_encoded = encoder.fit_transform(data[['Season']])

# Prepare features and target
X = np.hstack((season_encoded, pd.get_dummies(data[['Route', 'Customer Country Code']])))
y = data['Product Code']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
classifier.fit(X_train, y_train)

# Evaluate model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      85L8C7       0.33      0.43      0.38       488
      G625CT       0.18      0.09      0.12       319
      H5MOF6       0.30      0.29      0.30       399
      J94VBW       0.33      0.32      0.33       437
      XQAHOY       0.27      0.31      0.28       357

    accuracy                           0.30      2000
   macro avg       0.28      0.29      0.28      2000
weighted avg       0.29      0.30      0.29      2000





3. Quantity Demanded (Regression)

In [6]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error
from scipy.stats import mstats

# Empirical quantiles for quantity
quantiles = mstats.mquantiles(data['Items'], prob=[0.25, 0.5, 0.75])

# Features and target
X = pd.get_dummies(data[['Customer Country Code', 'Route', 'Value']])
y = data['Items']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Lasso model
lasso = LassoCV(cv=5).fit(X_train, y_train)

# Evaluate model
y_pred = lasso.predict(X_test)
print(mean_absolute_error(y_test, y_pred))


19.172532625


4. Demand Lead Time

In [8]:
# Calculate lead time
data['Requested Delivery Date'] = pd.to_datetime(data['Requested Delivery Date'], format='%d.%m.%Y')
data['Lead Time'] = (data['Requested Delivery Date'] - data['Order Date']).dt.days

# Empirical quantiles for lead time
lead_time_quantiles = mstats.mquantiles(data['Lead Time'], prob=[0.25, 0.5, 0.75])

print(lead_time_quantiles)


[ 8. 15. 22.]


5. Monte Carlo Simulation

In [10]:
# Monte Carlo Simulation
simulated_demand = []
for _ in range(1000):  # Run 1000 simulations
    orders = np.random.choice(monthly_orders['Customer Order Code'], size=12, replace=True)
    quantities = np.random.normal(loc=quantiles[1], scale=quantiles[2]-quantiles[0], size=12)
    lead_times = np.random.normal(loc=lead_time_quantiles[1], scale=lead_time_quantiles[2]-lead_time_quantiles[0], size=12)
    simulated_demand.append((orders, quantities, lead_times))

print(simulated_demand)

[(array([260, 332, 291, 181, 256, 301, 303, 204, 291, 285, 309, 325],
      dtype=int64), array([ 72.30393307,  58.75598024, -11.32542675,  31.91003399,
       114.86407763,  86.75915305,  26.94696925,  47.81571851,
        45.19276081, -32.22446335, 124.16475881,  60.76453867]), array([ 24.11932342,  48.25038071,  39.13364177,  21.56262684,
        20.71015079,  36.10480779,   1.84463967,   9.67224051,
       -11.3269124 ,  25.4261481 ,  15.45849765,  34.42377251])), (array([249, 190, 167, 291, 276, 183, 270, 176, 313, 176, 269, 321],
      dtype=int64), array([ 43.68491824, 108.10011339,  20.72484175,  94.22480928,
       126.77107229,  68.18289367,  86.70066192,  11.2412454 ,
        65.40652875, 100.42147518,  73.89979834,  37.03331755]), array([20.68535623, 10.88475794, 34.23018928,  1.32144543, 14.60664458,
       39.66701157,  9.27049275, 29.85258779, 18.1175746 , 30.76501663,
       14.14699915,  7.04216894])), (array([281, 313, 265, 342, 190, 181, 303, 226, 303, 183, 176, 284]

6. Advance vs. Urgent Demand

In [None]:
# Distinguish advance vs. urgent demand
actual_advance_demand = test_data[test_data['Lead Time'] >= 40]  # Assuming threshold
simulated_advance_demand = [d for d in simulated_demand if d[2] >= 40]
