In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.stats.mstats import mquantiles
from statsmodels.tsa.statespace.sarimax import SARIMAX


# Generate Test Dataset

In [12]:
# Set a seed for reproducibility
np.random.seed(42)

# Number of rows in the random DataFrame
num_rows = 10000

# Define the date range: start at 2009-01-01, end at 2011-12-31
start_date = datetime(2009, 1, 1)
end_date = datetime(2011, 12, 31)
delta_days = (end_date - start_date).days

# Introduce a seasonal pattern in order volume
season_weights = {
    1: 1.5, 2: 1.5, 12: 1.5,  # Winter months have a higher weight
    3: 1.2, 4: 1.2, 5: 1.2,   # Spring medium
    6: 0.8, 7: 0.8, 8: 0.8,   # Summer lower
    9: 1.3, 10:1.3, 11:1.3    # Autumn medium-high
}

all_days = pd.date_range(start_date, end_date, freq='D')
day_months = all_days.month

weights = np.array([season_weights[m] for m in day_months])
weights = weights / weights.sum()  # normalize to create probabilities

order_dates = np.random.choice(all_days, size=num_rows, p=weights)

# Use pd.Timedelta instead of datetime.timedelta
requested_delivery_dates = [
    od + pd.Timedelta(days=np.random.randint(1,30)) for od in order_dates
]

# Country Codes (example)
country_codes = ['US', 'DE', 'FR', 'UK', 'IT', 'ES', 'CN', 'JP', 'IN', 'BR']
customer_country_codes = np.random.choice(country_codes, size=num_rows)

# Generate fewer unique product codes
def random_product_code(length=6):
    return ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), length))

unique_product_codes = [random_product_code() for _ in range(5)]

order_dates = pd.to_datetime(order_dates)

# Introduce seasonal bias in product selection
def product_for_date(d):
    m = d.month
    if m in [12,1,2]:    # Winter: favor product_codes[0] & product_codes[1]
        probs = [0.4,0.4,0.05,0.05,0.1]
    elif m in [6,7,8]:   # Summer: favor product_codes[2] & product_codes[3]
        probs = [0.05,0.05,0.4,0.4,0.1]
    else:                # Other months: more even distribution
        probs = [0.2,0.2,0.2,0.2,0.2]
    return np.random.choice(unique_product_codes, p=probs)

product_codes = [product_for_date(d) for d in order_dates]


# Descriptions
descriptions_list = ["Widget A", "Widget B", "Gadget C", "Gizmo D", "Tool E", "Part F"]
descriptions = np.random.choice(descriptions_list, size=num_rows)

# Order type always "VO"
order_types = ["VO"] * num_rows

# Customer Order Code
unique_order_count = 2000
unique_order_codes = [f"ORD{str(i).zfill(5)}" for i in range(1, unique_order_count+1)]
customer_order_codes = np.random.choice(unique_order_codes, size=num_rows)

# Trend in Value and Items
days_from_start = (pd.to_datetime(order_dates) - start_date).days
base_values = np.random.uniform(10, 1000, size=num_rows)
value_trend = 1 + (days_from_start / days_from_start.max()) * 0.2
values = np.round(base_values * value_trend, 2)

items = []
for od in order_dates:
    if od.month in [12,1,2]:    # winter: larger orders
        items.append(np.random.randint(50, 100))
    elif od.month in [6,7,8]:   # summer: smaller orders
        items.append(np.random.randint(1, 50))
    else:
        items.append(np.random.randint(20, 80))
items = np.array(items)

# Currency (example)
currencies = ["USD", "EUR", "GBP", "JPY", "CNY"]
currency_codes = np.random.choice(currencies, size=num_rows)

# Route
routes = [f"{cc}-{np.random.randint(1,10)}" for cc in customer_country_codes]

# Create the DataFrame
df = pd.DataFrame({
    "Order Date": [d.strftime("%d.%m.%Y") for d in order_dates],
    "Requested Delivery Date": [d.strftime("%d.%m.%Y") for d in requested_delivery_dates],
    "Customer Country Code": customer_country_codes,
    "Product Code": product_codes,
    "Description": descriptions,
    "Order type": order_types,
    "Customer Order Code": customer_order_codes,
    "Value": values,
    "Currency": currency_codes,
    "Items": items,
    "Route": routes
})

print(df.head())
print("Number of unique product codes:", df["Product Code"].nunique())

   Order Date Requested Delivery Date Customer Country Code Product Code  \
0  06.02.2010              01.03.2010                    FR       85L8C7   
1  17.11.2011              21.11.2011                    US       H5MOF6   
2  27.02.2011              17.03.2011                    UK       J94VBW   
3  29.10.2010              03.11.2010                    IT       J94VBW   
4  08.06.2009              24.06.2009                    DE       XQAHOY   

  Description Order type Customer Order Code    Value Currency  Items Route  
0     Gizmo D         VO            ORD00693   441.44      GBP     78  FR-7  
1      Part F         VO            ORD01883   743.82      EUR     22  US-5  
2    Widget A         VO            ORD00250  1036.22      GBP     58  UK-2  
3    Widget A         VO            ORD01261    34.01      JPY     54  IT-8  
4     Gizmo D         VO            ORD00904   682.16      EUR      6  DE-3  
Number of unique product codes: 5


In [18]:
# Convert "Order Date" to datetime if not already
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d.%m.%Y', errors='coerce')

# Create copies to avoid SettingWithCopyWarning
train_df = df.loc[(df['Order Date'] >= '2009-01-01') & (df['Order Date'] < '2011-01-01')].copy()
test_df = df.loc[(df['Order Date'] >= '2011-01-01') & (df['Order Date'] <= '2011-12-31')].copy()

############################################
# Step A: Forecast Monthly Distinct Orders using the Training Set
############################################

train_df['Year'] = train_df['Order Date'].dt.year
train_df['Month'] = train_df['Order Date'].dt.month

monthly_train = (train_df.groupby(['Year','Month'])['Customer Order Code']
                 .nunique()
                 .reset_index(name='Distinct_Order_Count'))

# Create a Year-Month index
monthly_train['Year-Month'] = pd.to_datetime(monthly_train['Year'].astype(str) + '-' + monthly_train['Month'].astype(str) + '-01')
monthly_train.set_index('Year-Month', inplace=True)
monthly_train.sort_index(inplace=True)
ts_train = monthly_train['Distinct_Order_Count'].asfreq('MS')

# If you want to capture a 4-season cycle in monthly data:
model = SARIMAX(ts_train.dropna(),
                order=(1,1,1),          # non-seasonal parameters
                seasonal_order=(1,1,1,4), # seasonal parameters with s=4
                enforce_stationarity=False,
                enforce_invertibility=False)
res = model.fit()

steps_ahead = 12  # forecasting all months of 2011
forecast_res = res.get_forecast(steps=steps_ahead)
forecast_mean = forecast_res.predicted_mean

monthly_forecasts = pd.DataFrame({
    'Year-Month': forecast_mean.index,
    'Predicted_Order_Count': forecast_mean.values
}).set_index('Year-Month')

############################################
# Step B: Classification Model (Train on 2009â€“2010 data)
############################################

train_df['Month'] = train_df['Order Date'].dt.month

def month_to_season(m):
    if m in [12, 1, 2]:
        return 'Winter'
    elif m in [3,4,5]:
        return 'Spring'
    elif m in [6,7,8]:
        return 'Summer'
    else:
        return 'Autumn'

train_df['Season'] = train_df['Month'].apply(month_to_season)

categorical_features = ['Season','Customer Country Code','Currency','Route','Order type']
numeric_features = ['Items','Value']

train_df[categorical_features] = train_df[categorical_features].fillna('Unknown')
train_df[numeric_features] = train_df[numeric_features].fillna(0)

train_encoded = pd.get_dummies(train_df[categorical_features], drop_first=True)
X_train = pd.concat([train_df[numeric_features], train_encoded], axis=1)
y_train = train_df['Product Code']

# Increase max_iter to ensure convergence
model_classification = LogisticRegression(random_state=0, solver='saga', max_iter=5000)
model_classification.fit(X_train, y_train)

product_classes = model_classification.classes_
known_feature_cols = X_train.columns

############################################
# Compute Product Quantiles for Quantity Distribution from Training Data
############################################
product_quantiles = {}
quantile_levels = [0.1, 0.25, 0.5, 0.75, 0.9]
for product, group in train_df.groupby('Product Code'):
    product_items = group['Items'].values
    q_values = mquantiles(product_items, prob=quantile_levels)
    product_quantiles[product] = {
        'quantile_levels': quantile_levels,
        'quantile_values': q_values
    }

############################################
# Apply Models to the Test Period (2011)
############################################

historical_feature_distributions = {
    'Customer Country Code': train_df['Customer Country Code'].value_counts(normalize=True).to_dict(),
    'Currency': train_df['Currency'].value_counts(normalize=True).to_dict(),
    'Route': train_df['Route'].value_counts(normalize=True).to_dict(),
    'Order type': train_df['Order type'].value_counts(normalize=True).to_dict()
}

def simulate_future_orders_for_month(year_month, n_orders, distributions, known_feature_cols):
    month = year_month.month
    season = month_to_season(month)

    simulated_orders = pd.DataFrame()
    simulated_orders['Month'] = [month]*n_orders
    simulated_orders['Season'] = [season]*n_orders

    for cat_feat, dist in distributions.items():
        cats = list(dist.keys())
        probs = list(dist.values())
        simulated_orders[cat_feat] = np.random.choice(cats, size=n_orders, p=probs)

    simulated_orders['Items'] = np.random.randint(1,50,size=n_orders)
    simulated_orders['Value'] = np.round(np.random.uniform(10,300,n_orders),2)

    simulated_encoded = pd.get_dummies(simulated_orders, drop_first=True)
    for col in known_feature_cols:
        if col not in simulated_encoded.columns:
            simulated_encoded[col] = 0
    simulated_encoded = simulated_encoded[known_feature_cols]
    return simulated_encoded

final_forecast = []
for future_month, row in monthly_forecasts.iterrows():
    predicted_order_count = int(round(row['Predicted_Order_Count']))
    if predicted_order_count <= 0:
        continue
    simulated_orders_for_month = simulate_future_orders_for_month(future_month, predicted_order_count, historical_feature_distributions, known_feature_cols)
    product_prob = model_classification.predict_proba(simulated_orders_for_month)
    expected_counts = product_prob.sum(axis=0)
    monthly_product_forecast = pd.DataFrame({
        'Year-Month': future_month,
        'Product': product_classes,
        'Expected_Count': expected_counts
    })
    final_forecast.append(monthly_product_forecast)

final_forecast = pd.concat(final_forecast, ignore_index=True)

############################################
# Simulate Quantities for the Forecasted Products
############################################

final_forecast_with_quantities = []
for idx, row in final_forecast.iterrows():
    product = row['Product']
    month = row['Year-Month']
    expected_count = int(round(row['Expected_Count']))

    if expected_count <= 0:
        continue

    if product in product_quantiles:
        q_data = product_quantiles[product]
        q_levels = q_data['quantile_levels']
        q_values = q_data['quantile_values']

        simulated_quantities = []
        for _ in range(expected_count):
            u = np.random.rand()
            for i, ql in enumerate(q_levels):
                if u < ql:
                    simulated_quantities.append(int(np.round(q_values[i])))
                    break
            else:
                simulated_quantities.append(int(np.round(q_values[-1])))

        total_quantity = sum(simulated_quantities)
        final_forecast_with_quantities.append({
            'Year-Month': month,
            'Product': product,
            'Expected_Count': expected_count,
            'Total_Quantity_Demanded': total_quantity
        })
    else:
        avg_items = train_df.loc[train_df['Product Code']==product,'Items'].mean()
        total_quantity = avg_items*expected_count
        final_forecast_with_quantities.append({
            'Year-Month': month,
            'Product': product,
            'Expected_Count': expected_count,
            'Total_Quantity_Demanded': total_quantity
        })

final_forecast_with_quantities = pd.DataFrame(final_forecast_with_quantities)

############################################
# Compute Actuals for 2011 and MAPE
############################################

test_df['Year'] = test_df['Order Date'].dt.year
test_df['Month'] = test_df['Order Date'].dt.month
test_monthly = (test_df.groupby([test_df['Order Date'].dt.to_period('M'), 'Product Code'])['Items']
                .sum()
                .reset_index())
test_monthly['Year-Month'] = test_monthly['Order Date'].dt.to_timestamp('D')
test_monthly.rename(columns={'Product Code':'Product','Items':'Actual_Total_Quantity'},inplace=True)
test_monthly.drop(columns='Order Date',inplace=True)
test_monthly.set_index(['Year-Month','Product'],inplace=True)

final_forecast_with_quantities.set_index(['Year-Month','Product'], inplace=True)

comparison_df = final_forecast_with_quantities.join(test_monthly, how='inner')
comparison_df.reset_index(inplace=True)

comparison_df = comparison_df[comparison_df['Actual_Total_Quantity'] > 0]

comparison_df['APE'] = np.abs((comparison_df['Actual_Total_Quantity'] - comparison_df['Total_Quantity_Demanded'])
                              / comparison_df['Actual_Total_Quantity'])*100
mape = comparison_df['APE'].mean()
print("MAPE on Test Data (2011):", mape, "%")

MAPE on Test Data (2011): 223.98171861680066 %
