In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor


In [3]:
incidents = pd.read_csv(r"LFB_Incident_data_Last_3_years.csv")
mobilisation = pd.read_csv(r"LFB_Mobilisation_data_Last_3_years.csv")

## Dataset "Incidents"

In [4]:
incid_cols = ['CalYear', 'HourOfCall', 'SpecialServiceType', 'AddressQualifier', 'Postcode_full',
             'UPRN', 'USRN','IncGeo_BoroughName', 'IncGeo_WardName', 'IncGeo_WardNameNew', 'ProperCase', 'Easting_m', 'Northing_m', 'Latitude',
               'Longitude', 'FRS','SecondPumpArriving_AttendanceTime',
                 'SecondPumpArriving_DeployedFromStation', 'Notional Cost (£)']

In [5]:
df_incid = incidents.drop(columns=incid_cols, axis=1)
df_incid.dropna(subset=['FirstPumpArriving_AttendanceTime', 'IncGeo_WardCode', 'IncidentStationGround', 'NumCalls'], inplace=True)

In [6]:
# Remplacer les 11 NaN de DeployedFromStation par les valeurs respectives de IncidentStationGround :
df_incid['FirstPumpArriving_DeployedFromStation'] = df_incid['FirstPumpArriving_DeployedFromStation'].fillna(df_incid['IncidentStationGround'])

In [7]:
df_incid['TimeOfCall'] = df_incid['DateOfCall'] + ' ' + df_incid['TimeOfCall']
df_incid.drop(['DateOfCall'], axis=1, inplace=True)
df_incid['TimeOfCall'] = pd.to_datetime(df_incid['TimeOfCall'], format='%d %b %Y %H:%M:%S')

## Dataset "Mobilisation"

In [8]:
df_mobil = mobilisation[['IncidentNumber', 'DateAndTimeMobilised','AttendanceTimeSeconds',
                     'DeployedFromStation_Name','DeployedFromLocation', 'PumpOrder', 'DelayCodeId']]

In [9]:
# Remplacer les DelaiCodeId manquant par 1, supposant que le code 1 = pas de ralentissement.
df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].fillna(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].fillna(1)


In [10]:
df_mobil = df_mobil.dropna()

In [11]:
df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].astype('int')

## Merge des deux Dataset

In [12]:
df = pd.merge(left=df_mobil, right=df_incid, on='IncidentNumber', how='left')

In [13]:
df['Year'] = df['TimeOfCall'].dt.year
df['DayOfWeek'] = df['TimeOfCall'].dt.dayofweek
df['Hour'] = df['TimeOfCall'].dt.hour

In [14]:
df.drop(columns=['IncidentNumber','TimeOfCall', 'DateAndTimeMobilised', 'DeployedFromStation_Name',
 'Postcode_district', 'PropertyType', 'NumCalls',
                  'FirstPumpArriving_AttendanceTime', 'FirstPumpArriving_DeployedFromStation'], axis = 1, inplace=True)

In [15]:
df = df.dropna()

## Get Dummies

In [16]:
dummies_cols = ['IncidentGroup','StopCodeDescription', 'PropertyCategory', 'DeployedFromLocation']

In [17]:
df = pd.get_dummies(df, columns=dummies_cols)

In [18]:
num_cols = ['NumStationsWithPumpsAttending', 'NumPumpsAttending', 'DelayCodeId']
df[num_cols] = df[num_cols].astype(int)

In [19]:
string_cols = ['IncGeo_BoroughCode', 'IncGeo_WardCode', 'IncidentStationGround']

df[string_cols] = df[string_cols].astype(str)

## Label Encoder

In [20]:
label_encoder = LabelEncoder()
df_encoded = df[string_cols].apply(label_encoder.fit_transform)

In [21]:
df.drop(string_cols, axis=1, inplace=True)

In [22]:
df = df.join(df_encoded)

# **Entrainement**

In [23]:
X = df.drop('AttendanceTimeSeconds', axis=1)
y = df['AttendanceTimeSeconds']

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Standardisation

In [25]:
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Random Forest

In [26]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)  # n_estimators est le nombre d'arbres dans la forêt
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)

KeyboardInterrupt: 

Feature Selection

In [28]:
from sklearn.feature_selection import RFECV #Recursive Feature Elimination with Cross-Validation

# Initialisation du modèle Random Forest
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialisation du RFECV
rfecv = RFECV(estimator=model_rf, step=1, cv=3)  # cv est le nombre de folds de la validation croisée

# Entraînement du RFECV
rfecv.fit(X_train, y_train)

# Afficher les résultats
print("Nombre optimal de fonctionnalités : %d" % rfecv.n_features_)
print("Fonctionnalités sélectionnées : %s" % X_train.columns[rfecv.support_])
print("Rang des fonctionnalités : %s" % rfecv.ranking_)
print("-"*20)

# Entraînement du modèle final sur les fonctionnalités sélectionnées
selected_features_X_train = X_train[X_train.columns[rfecv.support_]]
selected_features_X_test = X_test[X_test.columns[rfecv.support_]]
model_rf.fit(selected_features_X_train, y_train)


# Prédiction sur l'ensemble de test
y_pred = model_rf.predict(selected_features_X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)
