# 🤖 Rossmann Sales Prediction - Modèles Comparés (XGBoost, Random Forest, LinReg)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [None]:
# Load the daily sales records and the per-store metadata.
# StateHoliday is forced to str so the column holds a single dtype.
train_df = pd.read_csv("/content/train.csv", parse_dates=["Date"], dtype={"StateHoliday": str})
store_df = pd.read_csv("/content/store.csv")

# Merge and basic cleaning.
# The left join keeps every sales row; columns coming from store_df are NaN
# for any row without a matching store.
df = pd.merge(train_df, store_df, on="Store", how="left")

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained form warns on pandas 2.x and has no effect
# once Copy-on-Write is enabled.
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())
df['StateHoliday'] = df['StateHoliday'].astype(str)
df['PromoInterval'] = df['PromoInterval'].fillna("None")

# Calendar features derived from the date (unparseable dates become NaT).
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
# NOTE(review): if the CSV already ships a DayOfWeek column, this overwrites
# it with the pandas convention (Monday=0 ... Sunday=6) - confirm intended.
df['DayOfWeek'] = df['Date'].dt.dayofweek

# Keep only the days on which the store was open. The explicit copy makes df
# an independent frame so the assignments below do not trigger
# SettingWithCopyWarning.
df = df[df['Open'] == 1].copy()

# Cap extreme sales values at the 99th percentile to limit outliers.
df['Sales'] = df['Sales'].clip(upper=df['Sales'].quantile(0.99))
df.drop(['Customers', 'Date'], axis=1, inplace=True)

# Any numeric column that still contains NaN (e.g. from the left join) would
# make LinearRegression.fit raise, so fill the remaining gaps.
# NOTE(review): 0 is used as a neutral "unknown" marker - confirm it is an
# acceptable sentinel for every affected column.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0)


In [None]:
# Label-encode every object-dtype column: each distinct value is replaced by
# an integer code, with a fresh encoder fitted per column.
categorical_cols = df.select_dtypes(include='object').columns
for name in categorical_cols:
    df[name] = LabelEncoder().fit_transform(df[name])


In [None]:
# Separate the target ("Sales") from the feature columns, then hold out 20%
# of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the split is random rather than chronological - confirm this
# is the intended evaluation protocol for dated sales data.
y = df["Sales"]
X = df.drop(columns=["Sales"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline model: linear regression fitted on the training split and scored
# on the held-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression - RMSE: {rmse_lr:.2f} | MAE: {mae_lr:.2f}")


In [None]:
# Random forest with 100 trees, fitted on the training split and scored on
# the held-out split. n_jobs=-1 builds the trees on all available cores; the
# fixed random_state keeps the run reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest - RMSE: {rmse_rf:.2f} | MAE: {mae_rf:.2f}")


In [None]:
# Gradient-boosted trees (XGBoost) with a squared-error objective, fitted on
# the training split and scored on the held-out split.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"XGBoost - RMSE: {rmse_xgb:.2f} | MAE: {mae_xgb:.2f}")


In [None]:
# Collect the two error metrics of each model in one table (one row per
# model), then draw them as grouped bars: one group per model, one bar per
# metric.
results = pd.DataFrame(
    [
        {'Model': 'Linear Regression', 'RMSE': rmse_lr, 'MAE': mae_lr},
        {'Model': 'Random Forest', 'RMSE': rmse_rf, 'MAE': mae_rf},
        {'Model': 'XGBoost', 'RMSE': rmse_xgb, 'MAE': mae_xgb},
    ],
    columns=['Model', 'RMSE', 'MAE'],
)
# Long format: one row per (model, metric) pair, in columns
# 'variable' (metric name) and 'value' (metric value).
long_results = results.melt(id_vars='Model')
ax = sns.barplot(data=long_results, x='Model', y='value', hue='variable')
ax.set_title("Comparaison des Modèles")
ax.set_ylabel("Erreur")
plt.show()
