In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

### Charger les données et retraiter ORDERDATE

In [35]:
# Read the raw sales export (decoded as latin1).
df = pd.read_csv("../data/sales_data_sample.csv", encoding='latin1')

# Parse the order date once, then derive the calendar features from it.
# `assign` replaces ORDERDATE in place and appends YEAR, MONTH and QUARTER
# at the end, in that order.
order_dates = pd.to_datetime(df['ORDERDATE'])
df = df.assign(
    ORDERDATE=order_dates,
    YEAR=order_dates.dt.year,
    MONTH=order_dates.dt.month,
    QUARTER=order_dates.dt.quarter,
)

### Sélection des colonnes utiles
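On supprime les colonnes inutilisables pour la modélisation : identifiants (ORDERNUMBER, PRODUCTCODE), noms et coordonnées des clients (texte libre), ainsi que ORDERDATE, déjà décomposée en YEAR, MONTH et QUARTER.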

In [36]:
# Columns removed before modelling: order/product identifiers, customer
# name and contact details, and the raw order date.
columns_to_drop = [
    'ORDERNUMBER',
    'ORDERDATE',
    'CUSTOMERNAME',
    'CONTACTFIRSTNAME',
    'CONTACTLASTNAME',
    'PHONE',
    'ADDRESSLINE1',
    'ADDRESSLINE2',
    'POSTALCODE',
    'PRODUCTCODE',
]

# NOTE(review): `df` is rebound here, so running this cell a second time
# raises KeyError (the columns are already gone).
df = df.drop(columns_to_drop, axis=1)

### Séparer la cible des variables explicatives et repérer les colonnes catégorielles

In [37]:
# Explanatory variables: every remaining column except the SALES target.
X = df.drop(columns=['SALES'])

# Regression target.
y = df['SALES']

# Names of the object-dtype feature columns (the categorical ones).
cat_cols = X.select_dtypes(include=['object']).columns

In [38]:
# Persist the categorical column names as a plain Python list.
joblib.dump(value=list(cat_cols), filename="../models/categorical_columns.pkl")

['../models/categorical_columns.pkl']

### Encodage des variables catégorielles (OneHot)

In [39]:
# Dense one-hot encoder that ignores categories unseen at fit time.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to
# the legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook; confirm this is intended.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
)

# Non-categorical columns first, then the one-hot columns. Both parts get a
# fresh 0..n-1 index so the column-wise concat lines up row by row.
X_final = pd.concat(
    [X.drop(columns=cat_cols).reset_index(drop=True), X_encoded.reset_index(drop=True)],
    axis=1,
)

# Persist the final feature column order alongside the model artefacts.
joblib.dump(X_final.columns.tolist(), "../models/feature_columns.pkl")



['../models/feature_columns.pkl']

### Split train / test

In [40]:
# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    random_state=42,
    test_size=0.2,
)

### Modèle : Random Forest Regressor

In [41]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

### Évaluation du modèle

In [42]:
# Predict on the held-out split and report the regression metrics.
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
# RMSE is computed as the square root of the MSE: the `squared=False`
# shortcut was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# this form works on every version.
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)
print("R²:", r2_score(y_test, y_pred))

MAE: 281.3965100884955
RMSE: 597.6191869074158
R²: 0.9181769347014656


### Sauvegarder le modèle et l’encodeur

In [43]:
# Persist the fitted random forest.
joblib.dump(value=model, filename="../models/sales_model.pkl")

['../models/sales_model.pkl']

In [44]:
# Persist the fitted one-hot encoder.
joblib.dump(value=encoder, filename="../models/encoder.pkl")

['../models/encoder.pkl']