In [3]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Settings for synthetic data
num_rows = 2421
start_date = datetime(2009, 1, 1)
end_date = datetime(2011, 12, 31)
num_countries = 10
num_products = 15
num_routes = 5

# Generate synthetic data
np.random.seed(0)  # For reproducibility
dates = pd.date_range(start_date, end_date).to_series()
df = pd.DataFrame({
    'Order Date': np.random.choice(dates, num_rows),
    'Customer Country Code': np.random.choice(range(1, num_countries + 1), num_rows),
    'Product Code': np.random.choice(['A', 'B', 'C'], num_rows),
    'Description': ['Parka' for _ in range(num_rows)],
    'Order Type': ['VO' for _ in range(num_rows)],
    'Customer Order Code': np.random.randint(1000, 9999, num_rows),
    'Value': np.random.uniform(50, 500, num_rows),
    'Currency': ['EUR' for _ in range(num_rows)],
    'Items': np.random.randint(1, 10, num_rows),
    'Route': np.random.choice(range(1, num_routes + 1), num_rows)
})


In [4]:
df

Unnamed: 0,Order Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route
0,2010-11-16,3,A,Parka,VO,6135,292.083301,EUR,7,4
1,2010-07-14,9,B,Parka,VO,7010,298.687003,EUR,6,4
2,2011-04-16,8,A,Parka,VO,1609,140.831803,EUR,5,4
3,2011-02-03,9,C,Parka,VO,3125,387.690901,EUR,2,3
4,2011-10-31,7,B,Parka,VO,7228,453.279567,EUR,3,4
...,...,...,...,...,...,...,...,...,...,...
2416,2011-11-27,1,A,Parka,VO,8435,290.111246,EUR,3,5
2417,2010-02-09,10,B,Parka,VO,1180,431.230312,EUR,7,2
2418,2010-07-05,5,C,Parka,VO,5164,199.100608,EUR,2,2
2419,2010-01-24,6,C,Parka,VO,5895,54.029092,EUR,5,1


In [5]:
# Add 'Requested Delivery Date' which is some days after 'Order Date'
delivery_lead_times = np.random.randint(1, 60, num_rows)
df['Requested Delivery Date'] = df['Order Date'] + pd.to_timedelta(delivery_lead_times, unit='d')

# Introduce binary variable for advance demand
df['IsAdvanceOrder'] = (df['Requested Delivery Date'] - df['Order Date']).dt.days > 30
df['IsAdvanceOrder'] = df['IsAdvanceOrder'].astype(int)

# Introduce 'Season' variable (for illustration purposes)
df['Season'] = df['Order Date'].dt.month // 4

# Group demand by 'Delivery Month' and include a binary variable for advance and urgent demand
df['Delivery Month'] = df['Requested Delivery Date'].dt.to_period('M')
df['IsAdvanceUrgent'] = (df['IsAdvanceOrder']) & (df['Requested Delivery Date'] < df['Order Date'])
df['IsAdvanceUrgent'] = df['IsAdvanceUrgent'].astype(int)

# Define predictor variables (features) and the target variable ('Product Code')
features = ['Value', 'Season', 'IsAdvanceOrder', 'IsAdvanceUrgent']
target_variable = 'Product Code'

In [6]:
df

Unnamed: 0,Order Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route,Requested Delivery Date,IsAdvanceOrder,Season,Delivery Month,IsAdvanceUrgent
0,2010-11-16,3,A,Parka,VO,6135,292.083301,EUR,7,4,2011-01-10,1,2,2011-01,0
1,2010-07-14,9,B,Parka,VO,7010,298.687003,EUR,6,4,2010-07-18,0,1,2010-07,0
2,2011-04-16,8,A,Parka,VO,1609,140.831803,EUR,5,4,2011-06-14,1,1,2011-06,0
3,2011-02-03,9,C,Parka,VO,3125,387.690901,EUR,2,3,2011-02-27,0,0,2011-02,0
4,2011-10-31,7,B,Parka,VO,7228,453.279567,EUR,3,4,2011-11-06,0,2,2011-11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2416,2011-11-27,1,A,Parka,VO,8435,290.111246,EUR,3,5,2012-01-16,1,2,2012-01,0
2417,2010-02-09,10,B,Parka,VO,1180,431.230312,EUR,7,2,2010-03-10,0,0,2010-03,0
2418,2010-07-05,5,C,Parka,VO,5164,199.100608,EUR,2,2,2010-07-21,0,1,2010-07,0
2419,2010-01-24,6,C,Parka,VO,5895,54.029092,EUR,5,1,2010-02-12,0,0,2010-02,0


In [7]:
df_monthly = df.groupby([df['Order Date'].dt.to_period('M')])[['Value', 'Season', 'IsAdvanceOrder', 'IsAdvanceUrgent', 'Product Code']].mean().reset_index()

# Split the data into training and testing sets
X = df_monthly[features]
y = df_monthly[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

TypeError: Could not convert AACBBCCBCCCCACBABAAABCAABCCAAACBBBBCBCBCACCAABCBABABBAACCCA to numeric

In [12]:
df

Unnamed: 0,Order Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route,Requested Delivery Date,IsAdvanceOrder,Season,Delivery Month,IsAdvanceUrgent
0,2010-11-16,3,A,Parka,VO,6135,292.083301,EUR,7,4,2011-01-10,1,2,2011-01,0
1,2010-07-14,9,B,Parka,VO,7010,298.687003,EUR,6,4,2010-07-18,0,1,2010-07,0
2,2011-04-16,8,A,Parka,VO,1609,140.831803,EUR,5,4,2011-06-14,1,1,2011-06,0
3,2011-02-03,9,C,Parka,VO,3125,387.690901,EUR,2,3,2011-02-27,0,0,2011-02,0
4,2011-10-31,7,B,Parka,VO,7228,453.279567,EUR,3,4,2011-11-06,0,2,2011-11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2416,2011-11-27,1,A,Parka,VO,8435,290.111246,EUR,3,5,2012-01-16,1,2,2012-01,0
2417,2010-02-09,10,B,Parka,VO,1180,431.230312,EUR,7,2,2010-03-10,0,0,2010-03,0
2418,2010-07-05,5,C,Parka,VO,5164,199.100608,EUR,2,2,2010-07-21,0,1,2010-07,0
2419,2010-01-24,6,C,Parka,VO,5895,54.029092,EUR,5,1,2010-02-12,0,0,2010-02,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2421 entries, 0 to 2420
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Order Date               2421 non-null   datetime64[ns]
 1   Customer Country Code    2421 non-null   int64         
 2   Product Code             2421 non-null   object        
 3   Description              2421 non-null   object        
 4   Order Type               2421 non-null   object        
 5   Customer Order Code      2421 non-null   int64         
 6   Value                    2421 non-null   float64       
 7   Currency                 2421 non-null   object        
 8   Items                    2421 non-null   int64         
 9   Route                    2421 non-null   int64         
 10  Requested Delivery Date  2421 non-null   datetime64[ns]
 11  IsAdvanceOrder           2421 non-null   int64         
 12  Season                   2421 non-