In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib as mp
import statsmodels.api as sm
from statsmodels.stats.api import het_breuschpagan
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin

In [3]:
# Location of the hackathon workbook. Set the HACKATHON_DATA_PATH environment
# variable to point at a local copy of the file; when it is unset, the path
# originally hard-coded in this notebook is used, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    'HACKATHON_DATA_PATH',
    '/Users/lucasginevro/Downloads/data_set_hackathon.xlsx',
)

# Load the raw order data into the notebook-wide DataFrame.
df = pd.read_excel(DATA_PATH)

In [7]:
# Parse 'order_date' using the day.month.year format and derive a monthly
# period column from it (both are written back onto df).
df['order_date'] = pd.to_datetime(df['order_date'], format='%d.%m.%Y')
df['Year-Month'] = df['order_date'].dt.to_period('M')

# Number of distinct 'Customer Order Code' values per month. Only months that
# occur in the data produce a row.
order_codes_by_month = df.groupby('Year-Month')['Customer Order Code']
monthly_distinct_orders = order_codes_by_month.nunique().reset_index(name='Distinct Orders')

print(monthly_distinct_orders)

   Year-Month  Distinct Orders
0     2009-07               38
1     2009-08                9
2     2009-09               12
3     2009-10                4
4     2009-11                2
5     2009-12               21
6     2010-01               29
7     2010-02               36
8     2010-03               43
9     2010-04               11
10    2010-05                1
11    2010-06                1
12    2010-12               34
13    2011-01              124
14    2011-02               53
15    2011-03               29
16    2011-04                6
17    2011-05                4
18    2011-06                6
19    2011-07              238
20    2011-08              186
21    2011-09               89
22    2011-10               13
23    2011-11                3
24    2012-01                1
25    2012-04                1
26    2012-11                1


In [15]:
# Mean of the per-month distinct-order counts. The mean is taken over the rows
# of monthly_distinct_orders, i.e. only over months that appear in that table.
distinct_order_counts = monthly_distinct_orders['Distinct Orders']
average = distinct_order_counts.mean()
print("Average:", average)


Average: 36.851851851851855


In [18]:
# Sum of the per-month distinct-order counts.
# NOTE(review): an order code that appears in more than one month would be
# counted once per month here — confirm this is the intended total.
monthly_counts = monthly_distinct_orders['Distinct Orders']
total = monthly_counts.sum()
print("Total Distinct Orders:", total)

Total Distinct Orders: 995


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

# Same 'order_date' conversion as in the monthly-orders cell above, repeated
# before the date parts are extracted.
df['order_date'] = pd.to_datetime(df['order_date'], format='%d.%m.%Y')

# Calendar month and year of each order, used later as model features.
order_date_parts = df['order_date'].dt
df['month'] = order_date_parts.month
df['year'] = order_date_parts.year

In [49]:
# Make 'items' an integer column.
# Values that cannot be parsed as numbers become NaN (errors='coerce') and are
# then replaced by 0, so no rows are removed by this step.
# The previous follow-up `dropna(subset=['items'])` and second `astype(int)`
# were dead code: after `fillna(0).astype(int)` the column has no NaN left to
# drop and is already integer-typed.
# NOTE(review): treating unparseable quantities as 0 affects the median used
# for the demand label further down — confirm that 0 (rather than dropping the
# row) is the intended handling.
df['items'] = pd.to_numeric(df['items'], errors='coerce').fillna(0).astype(int)

In [51]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder per column in label_encoders.
# Caveat: the columns are overwritten in place, so running this cell a second
# time would encode the string form of the previous codes, not the raw values.
categorical_columns = ['Product Code', 'Customer Country Code', 'Route']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Values are cast to str before fitting, as in the original cell.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [53]:
# Feature matrix: the encoded categoricals plus the month/year of the order.
feature_columns = ['Product Code', 'Customer Country Code', 'month', 'year', 'Route']
X = df[feature_columns]

# Binary target: True when an order's 'items' is strictly above the median of
# the whole column (the median is computed before the train/test split).
items_median = df['items'].median()
y = df['items'] > items_median

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Fit a random forest with default hyperparameters and a fixed seed on the
# training split (fit returns the estimator itself).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = clf.predict(X_test)

In [55]:
# Per-class precision/recall/F1 on the held-out rows.
# NOTE(review): target_names are given in the order False, True of the boolean
# target, i.e. "Low Demand" = at or below the median — confirm against sklearn's label ordering.
class_names = ["Low Demand", "High Demand"]
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Feature importances from the fitted forest, largest first. The table is only
# assigned in this cell, not displayed.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': clf.feature_importances_,
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

              precision    recall  f1-score   support

  Low Demand       0.79      0.78      0.78       242
 High Demand       0.69      0.71      0.70       173

    accuracy                           0.75       415
   macro avg       0.74      0.74      0.74       415
weighted avg       0.75      0.75      0.75       415

