In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data_brief_2.csv")

# A notebook cell only renders its LAST expression automatically, so a bare
# `df.head()` in the middle of the cell shows nothing. display() (injected into
# the namespace by the Jupyter/IPython kernel) renders the preview explicitly.
display(df.head())
# info() prints its own report (dtypes, non-null counts) to stdout.
df.info()
# Last expression of the cell: summary statistics for every column,
# numeric and non-numeric alike.
df.describe(include='all')


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()


In [None]:
# Build the per-column replacement values first, then fill in one pass:
# the most frequent value for the three categorical columns...
fill_values = {
    column: df[column].mode()[0]
    for column in ('Weather', 'Traffic_Level', 'Time_of_Day')
}
# ...and the median for the courier experience column.
fill_values['Courier_Experience_yrs'] = df['Courier_Experience_yrs'].median()

# Columns not listed in the mapping are left untouched; df is modified in place.
df.fillna(value=fill_values, inplace=True)

In [None]:
# Re-count missing values per column to check the result of the imputation.
df.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row
# (the first occurrence of each row is not counted as a duplicate).
df.duplicated(keep='first').sum()


In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Pairwise correlation between the numeric columns (target included),
# rendered as an annotated heatmap.
numerical_cols = [
    'Distance_km',
    'Preparation_Time_min',
    'Courier_Experience_yrs',
    'Delivery_Time_min',
]
df_numeric = df.loc[:, numerical_cols]
corr_matrix = df_numeric.corr()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Heatmap de corrélation des variables numériques")
plt.show()


In [None]:
# Names of the columns stored with the object dtype.
df.select_dtypes(include='object').columns


In [None]:
# One count plot per categorical column, each on its own figure.
categorical_cols = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type']
for clm in categorical_cols:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x=clm, ax=ax)
    ax.set_title(f"Distribution de {clm}")
    plt.show()

In [None]:
# Distribution of the delivery time for each level of every categorical
# column, one box plot per column.
categorical_cols = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type']
for clm in categorical_cols:
    sns.boxplot(x=clm, y='Delivery_Time_min', data=df)
    # The plotted variable is Delivery_Time_min (a duration), not a speed,
    # so the title names the delivery time.
    plt.title(f"Temps de livraison selon {clm}")
    plt.show()

In [None]:
# Print the column names as a plain Python list.
print(df.columns.to_list())


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and replace them in df with the
# resulting indicator columns.
# NOTE: this cell is not re-runnable on its own. After it succeeds, c_cols no
# longer exist in df, so a second run raises KeyError on df[c_cols].
c_cols = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[c_cols])

# pd.concat(axis=1) aligns on the index. Reusing df.index keeps every encoded
# row attached to its source row even when df's index is not a clean 0..n-1
# range (e.g. after rows have been filtered out); a default RangeIndex would
# misalign the rows and introduce NaNs in that case.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(c_cols),
    index=df.index,
)

df = pd.concat([df, encoded_df], axis=1)
df = df.drop(columns=c_cols)

# Preview the encoded columns rather than printing the whole frame.
encoded_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the three numeric predictors; the target column is left as is.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed in a later cell, so test rows contribute to the scaling statistics.
# Confirm whether that is acceptable for this analysis.
f_s = [
    'Distance_km',
    'Preparation_Time_min',
    'Courier_Experience_yrs',
]
scaler = StandardScaler()
scaler.fit(df[f_s])
df[f_s] = scaler.transform(df[f_s])
print(df)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
# Split df into the target (y) and every other column (X), then keep the
# 5 columns that score highest under f_regression.
y = df['Delivery_Time_min']
X = df.drop(columns='Delivery_Time_min')

selector = SelectKBest(score_func=f_regression, k=5)
selector.fit(X, y)
X_new = selector.transform(X)

# get_support() is a boolean mask over X's columns marking the retained ones.
selected_features = X.columns[selector.get_support()]
# NOTE(review): in the cells visible here, the later split and models use the
# full X, not X_new / selected_features. Confirm whether the selection was
# meant to feed the models.
print(selected_features.to_list())

In [None]:
# List any columns that still have the object dtype at this point.
df.select_dtypes(include='object').columns

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Shapes of the full feature matrix and target (before splitting).
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score


# Base estimators. The forest is seeded so repeated runs build the same trees.
rf = RandomForestRegressor(random_state=42)
svr = SVR()

# Hyper-parameter grid searched for the random forest.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
}

# SVR's gamma only applies to the rbf/poly/sigmoid kernels. Crossing it with
# the linear kernel would refit identical models under different labels, so
# gamma is searched for rbf only. Candidate order within each kernel is
# unchanged, so the selected model is the same as with a single crossed grid.
param_grid_svr = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]


def _tune_and_score(estimator, param_grid):
    """Grid-search `estimator` on the training split and score it on the test split.

    Runs a 5-fold cross-validated grid search that minimizes mean absolute
    error on (X_train, y_train), then evaluates the best estimator found on
    (X_test, y_test).

    Parameters
    ----------
    estimator : scikit-learn regressor to tune.
    param_grid : dict or list of dicts accepted by GridSearchCV.

    Returns
    -------
    tuple
        (fitted GridSearchCV, test-set predictions, test MAE, test R2).
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=1,
    )
    search.fit(X_train, y_train)
    predictions = search.best_estimator_.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return search, predictions, mae, r2


# Same result names as before, so later cells can keep using them.
grid_rf, y_pred_rf, mae_rf, r2_rf = _tune_and_score(rf, param_grid_rf)
grid_svr, y_pred_svr, mae_svr, r2_svr = _tune_and_score(svr, param_grid_svr)

print("Random Forest - MAE:", mae_rf, "R2:", r2_rf)
print("SVR - MAE:", mae_svr, "R2:", r2_svr)
