# Projet MLOps Pompiers

In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor


In [60]:
incidents = pd.read_csv(r"LFB_Incident_data_Last_3_years.csv")
mobilisation = pd.read_csv(r"LFB_Mobilisation_data_Last_3_years.csv")

## Exploration & Pre-processing

Dataset "Incidents"

In [61]:
incidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392806 entries, 0 to 392805
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   IncidentNumber                          392806 non-null  object 
 1   DateOfCall                              392806 non-null  object 
 2   CalYear                                 392806 non-null  int64  
 3   TimeOfCall                              392806 non-null  object 
 4   HourOfCall                              392806 non-null  int64  
 5   IncidentGroup                           392806 non-null  object 
 6   StopCodeDescription                     392806 non-null  object 
 7   SpecialServiceType                      140955 non-null  object 
 8   PropertyCategory                        392806 non-null  object 
 9   PropertyType                            392806 non-null  object 
 10  AddressQualifier                        3928

In [62]:
incidents.isna().sum()

IncidentNumber                                 0
DateOfCall                                     0
CalYear                                        0
TimeOfCall                                     0
HourOfCall                                     0
IncidentGroup                                  0
StopCodeDescription                            0
SpecialServiceType                        251851
PropertyCategory                               0
PropertyType                                   0
AddressQualifier                               0
Postcode_full                             221750
Postcode_district                              0
UPRN                                           0
USRN                                           0
IncGeo_BoroughCode                             0
IncGeo_BoroughName                             0
ProperCase                                     0
IncGeo_WardCode                               29
IncGeo_WardName                               29
IncGeo_WardNameNew  

In [63]:
incidents.duplicated().sum()

0

In [64]:
#Liste des colonnes à supprimer du dataframe incidents :
incid_cols = ['CalYear', 'HourOfCall', 'SpecialServiceType', 'AddressQualifier', 'Postcode_full',
             'UPRN', 'USRN','IncGeo_BoroughName', 'IncGeo_WardName', 'IncGeo_WardNameNew', 'ProperCase', 'Easting_m', 'Northing_m', 'Latitude',
               'Longitude', 'FRS','SecondPumpArriving_AttendanceTime',
                 'SecondPumpArriving_DeployedFromStation', 'Notional Cost (£)']

In [65]:
df_incid = incidents.drop(columns=incid_cols, axis=1)

In [66]:
df_incid.dropna(subset=['FirstPumpArriving_AttendanceTime', 'IncGeo_WardCode', 'IncidentStationGround', 'NumCalls'], inplace=True)

In [67]:
df_incid.isna().sum()

IncidentNumber                            0
DateOfCall                                0
TimeOfCall                                0
IncidentGroup                             0
StopCodeDescription                       0
PropertyCategory                          0
PropertyType                              0
Postcode_district                         0
IncGeo_BoroughCode                        0
IncGeo_WardCode                           0
Easting_rounded                           0
Northing_rounded                          0
IncidentStationGround                     0
FirstPumpArriving_AttendanceTime          0
FirstPumpArriving_DeployedFromStation     5
NumStationsWithPumpsAttending             0
NumPumpsAttending                         0
PumpCount                                 0
PumpHoursRoundUp                         90
NumCalls                                  0
dtype: int64

In [68]:
# Remplacer les 11 NaN de DeployedFromStation par les valeurs respectives de IncidentStationGround :
df_incid['FirstPumpArriving_DeployedFromStation'] = df_incid['FirstPumpArriving_DeployedFromStation'].fillna(df_incid['IncidentStationGround'])
df_incid.isna().sum()

IncidentNumber                            0
DateOfCall                                0
TimeOfCall                                0
IncidentGroup                             0
StopCodeDescription                       0
PropertyCategory                          0
PropertyType                              0
Postcode_district                         0
IncGeo_BoroughCode                        0
IncGeo_WardCode                           0
Easting_rounded                           0
Northing_rounded                          0
IncidentStationGround                     0
FirstPumpArriving_AttendanceTime          0
FirstPumpArriving_DeployedFromStation     0
NumStationsWithPumpsAttending             0
NumPumpsAttending                         0
PumpCount                                 0
PumpHoursRoundUp                         90
NumCalls                                  0
dtype: int64

In [69]:
df_incid.shape

(369802, 20)

In [70]:
df_incid['TimeOfCall'] = df_incid['DateOfCall'] + ' ' + df_incid['TimeOfCall']
df_incid.drop(['DateOfCall'], axis=1, inplace=True)
df_incid['TimeOfCall'] = pd.to_datetime(df_incid['TimeOfCall'], format='%d %b %Y %H:%M:%S')

In [71]:
df_incid['TimeOfCall'].head()

1   2020-01-01 00:06:30
3   2020-01-01 00:11:05
4   2020-01-01 00:11:16
5   2020-01-01 00:12:02
6   2020-01-01 00:12:52
Name: TimeOfCall, dtype: datetime64[ns]

In [72]:
df_incid.dtypes

IncidentNumber                                   object
TimeOfCall                               datetime64[ns]
IncidentGroup                                    object
StopCodeDescription                              object
PropertyCategory                                 object
PropertyType                                     object
Postcode_district                                object
IncGeo_BoroughCode                               object
IncGeo_WardCode                                  object
Easting_rounded                                   int64
Northing_rounded                                  int64
IncidentStationGround                            object
FirstPumpArriving_AttendanceTime                float64
FirstPumpArriving_DeployedFromStation            object
NumStationsWithPumpsAttending                   float64
NumPumpsAttending                               float64
PumpCount                                       float64
PumpHoursRoundUp                                

## Dataset "Mobilisation"

In [73]:
mobilisation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576892 entries, 0 to 576891
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   IncidentNumber            576892 non-null  object 
 1   CalYear                   576892 non-null  int64  
 2   HourOfCall                576892 non-null  int64  
 3   ResourceMobilisationId    576892 non-null  int64  
 4   Resource_Code             576892 non-null  object 
 5   PerformanceReporting      576892 non-null  object 
 6   DateAndTimeMobilised      576892 non-null  object 
 7   DateAndTimeMobile         574442 non-null  object 
 8   DateAndTimeArrived        576892 non-null  object 
 9   TurnoutTimeSeconds        574430 non-null  float64
 10  TravelTimeSeconds         574426 non-null  float64
 11  AttendanceTimeSeconds     576892 non-null  int64  
 12  DateAndTimeLeft           576645 non-null  object 
 13  DateAndTimeReturned       0 non-null       f

In [74]:
mobilisation.isna().sum()

IncidentNumber                   0
CalYear                          0
HourOfCall                       0
ResourceMobilisationId           0
Resource_Code                    0
PerformanceReporting             0
DateAndTimeMobilised             0
DateAndTimeMobile             2450
DateAndTimeArrived               0
TurnoutTimeSeconds            2462
TravelTimeSeconds             2466
AttendanceTimeSeconds            0
DateAndTimeLeft                247
DateAndTimeReturned         576892
DeployedFromStation_Code         9
DeployedFromStation_Name         9
DeployedFromLocation           405
PumpOrder                        0
PlusCode_Code                    0
PlusCode_Description             0
DelayCodeId                 444255
DelayCode_Description       444255
dtype: int64

In [75]:
mobilisation.duplicated().sum()

0

Conserver uniquement les 1ères forces arrivées sur site

In [77]:
df_mobil = mobilisation[['IncidentNumber', 'DateAndTimeMobilised','AttendanceTimeSeconds',
                     'DeployedFromStation_Name','DeployedFromLocation', 'PumpOrder', 'DelayCodeId']]

In [78]:
df_mobil['DelayCodeId'].value_counts()

DelayCodeId
12.0    81936
9.0     26318
8.0      9895
5.0      6732
7.0      2024
11.0     1790
13.0     1406
10.0     1334
6.0       715
3.0       487
Name: count, dtype: int64

In [79]:
# Remplacer les DelaiCodeId manquant par 1, supposant que le code 1 = pas de ralentissement.
df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].fillna(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].fillna(1)


In [80]:
df_mobil.isna().sum()

IncidentNumber                0
DateAndTimeMobilised          0
AttendanceTimeSeconds         0
DeployedFromStation_Name      9
DeployedFromLocation        405
PumpOrder                     0
DelayCodeId                   0
dtype: int64

In [81]:
df_mobil = df_mobil.dropna()

In [82]:
df_mobil.dtypes

IncidentNumber               object
DateAndTimeMobilised         object
AttendanceTimeSeconds         int64
DeployedFromStation_Name     object
DeployedFromLocation         object
PumpOrder                     int64
DelayCodeId                 float64
dtype: object

In [83]:
df_mobil['DeployedFromLocation'].value_counts()

DeployedFromLocation
Home Station     556835
Other Station     19650
Name: count, dtype: int64

In [84]:
df_mobil['DelayCodeId'] = df_mobil['DelayCodeId'].astype('int')

In [85]:
df_mobil.shape

(576485, 7)

## Merge des deux Dataset

In [86]:
df = pd.merge(left=df_mobil, right=df_incid, on='IncidentNumber', how='left')

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576485 entries, 0 to 576484
Data columns (total 25 columns):
 #   Column                                 Non-Null Count   Dtype         
---  ------                                 --------------   -----         
 0   IncidentNumber                         576485 non-null  object        
 1   DateAndTimeMobilised                   576485 non-null  object        
 2   AttendanceTimeSeconds                  576485 non-null  int64         
 3   DeployedFromStation_Name               576485 non-null  object        
 4   DeployedFromLocation                   576485 non-null  object        
 5   PumpOrder                              576485 non-null  int64         
 6   DelayCodeId                            576485 non-null  int32         
 7   TimeOfCall                             574039 non-null  datetime64[ns]
 8   IncidentGroup                          574039 non-null  object        
 9   StopCodeDescription                    574039 no

In [88]:
df['Year'] = df['TimeOfCall'].dt.year
df['DayOfWeek'] = df['TimeOfCall'].dt.dayofweek
df['Hour'] = df['TimeOfCall'].dt.hour

In [89]:
df.drop(columns=['IncidentNumber','TimeOfCall', 'DateAndTimeMobilised', 'DeployedFromStation_Name',
 'Postcode_district', 'PropertyType', 'NumCalls',
                  'FirstPumpArriving_AttendanceTime', 'FirstPumpArriving_DeployedFromStation'], axis = 1, inplace=True)

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576485 entries, 0 to 576484
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   AttendanceTimeSeconds          576485 non-null  int64  
 1   DeployedFromLocation           576485 non-null  object 
 2   PumpOrder                      576485 non-null  int64  
 3   DelayCodeId                    576485 non-null  int32  
 4   IncidentGroup                  574039 non-null  object 
 5   StopCodeDescription            574039 non-null  object 
 6   PropertyCategory               574039 non-null  object 
 7   IncGeo_BoroughCode             574039 non-null  object 
 8   IncGeo_WardCode                574039 non-null  object 
 9   Easting_rounded                574039 non-null  float64
 10  Northing_rounded               574039 non-null  float64
 11  IncidentStationGround          574039 non-null  object 
 12  NumStationsWithPumpsAttending 

In [91]:
df.head()

Unnamed: 0,AttendanceTimeSeconds,DeployedFromLocation,PumpOrder,DelayCodeId,IncidentGroup,StopCodeDescription,PropertyCategory,IncGeo_BoroughCode,IncGeo_WardCode,Easting_rounded,Northing_rounded,IncidentStationGround,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Year,DayOfWeek,Hour
0,324,Home Station,1,1,Fire,Primary Fire,Non Residential,E09000017,E05013566,506750.0,181550.0,Hillingdon,1.0,2.0,2.0,1.0,2020.0,2.0,0.0
1,195,Home Station,1,1,False Alarm,AFA,Non Residential,E09000027,E05013786,518250.0,174850.0,Richmond,1.0,1.0,1.0,1.0,2020.0,2.0,0.0
2,426,Home Station,1,9,Fire,Secondary Fire,Outdoor Structure,E09000010,E05013685,535350.0,194250.0,Edmonton,1.0,1.0,1.0,1.0,2020.0,2.0,0.0
3,292,Home Station,2,1,Fire,Primary Fire,Non Residential,E09000017,E05013566,506750.0,181550.0,Hillingdon,1.0,2.0,2.0,1.0,2020.0,2.0,0.0
4,249,Home Station,1,1,False Alarm,AFA,Other Residential,E09000032,E05014024,527750.0,171150.0,Tooting,1.0,2.0,2.0,2.0,2020.0,2.0,0.0


In [92]:
df.isna().sum()

AttendanceTimeSeconds               0
DeployedFromLocation                0
PumpOrder                           0
DelayCodeId                         0
IncidentGroup                    2446
StopCodeDescription              2446
PropertyCategory                 2446
IncGeo_BoroughCode               2446
IncGeo_WardCode                  2446
Easting_rounded                  2446
Northing_rounded                 2446
IncidentStationGround            2446
NumStationsWithPumpsAttending    2446
NumPumpsAttending                2446
PumpCount                        2446
PumpHoursRoundUp                 2568
Year                             2446
DayOfWeek                        2446
Hour                             2446
dtype: int64

In [93]:
df = df.dropna()

## Get Dummies

In [94]:
dummies_cols = ['IncidentGroup','StopCodeDescription', 'PropertyCategory', 'DeployedFromLocation']

In [95]:
df = pd.get_dummies(df, columns=dummies_cols)

In [96]:
num_cols = ['NumStationsWithPumpsAttending', 'NumPumpsAttending', 'DelayCodeId']
df[num_cols] = df[num_cols].astype(int)

In [97]:
string_cols = ['IncGeo_BoroughCode', 'IncGeo_WardCode', 'IncidentStationGround']

df[string_cols] = df[string_cols].astype(str)

## Label Encoder

In [98]:
label_encoder = LabelEncoder()
df_encoded = df[string_cols].apply(label_encoder.fit_transform)

In [99]:
df.drop(string_cols, axis=1, inplace=True)

In [100]:
df = df.join(df_encoded)

In [101]:
df.dtypes

AttendanceTimeSeconds                                   int64
PumpOrder                                               int64
DelayCodeId                                             int32
Easting_rounded                                       float64
Northing_rounded                                      float64
NumStationsWithPumpsAttending                           int32
NumPumpsAttending                                       int32
PumpCount                                             float64
PumpHoursRoundUp                                      float64
Year                                                  float64
DayOfWeek                                             float64
Hour                                                  float64
IncidentGroup_False Alarm                                bool
IncidentGroup_Fire                                       bool
IncidentGroup_Special Service                            bool
StopCodeDescription_AFA                                  bool
StopCode

## Correlation

In [102]:
correlation_matrix = df.corr()

In [103]:
correlation_matrix

Unnamed: 0,AttendanceTimeSeconds,PumpOrder,DelayCodeId,Easting_rounded,Northing_rounded,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Year,...,PropertyCategory_Other Residential,PropertyCategory_Outdoor,PropertyCategory_Outdoor Structure,PropertyCategory_Rail Vehicle,PropertyCategory_Road Vehicle,DeployedFromLocation_Home Station,DeployedFromLocation_Other Station,IncGeo_BoroughCode,IncGeo_WardCode,IncidentStationGround
AttendanceTimeSeconds,1.0,0.393634,0.523697,-0.023204,0.032766,0.246801,0.221587,0.172868,0.055324,0.039363,...,-0.018228,0.02981,-0.013609,0.001115,0.024847,-0.023103,0.023103,-0.068138,0.007188,0.005842
PumpOrder,0.393634,1.0,-0.181946,0.013335,-0.000748,0.650103,0.701534,0.510404,0.085351,-0.010161,...,0.033726,-0.093989,-0.066301,0.004479,-0.078222,0.001549,-0.001549,0.05101,-0.029112,-0.006982
DelayCodeId,0.523697,-0.181946,1.0,-0.030679,0.022052,-0.147173,-0.185899,-0.127564,-0.004945,0.046565,...,-0.032411,0.071445,0.01915,-0.005435,0.050198,-0.031577,0.031577,-0.09261,0.018546,0.011309
Easting_rounded,-0.023204,0.013335,-0.030679,1.0,0.041095,0.038411,0.017946,0.018092,0.00978,-0.005235,...,-0.005716,0.02711,0.023467,-0.00258,0.016298,-0.003585,0.003585,-0.007559,0.162743,-0.113412
Northing_rounded,0.032766,-0.000748,0.022052,0.041095,1.0,-0.009297,0.000992,0.004115,0.005592,-0.006719,...,-0.010227,0.001648,0.00915,0.003389,0.006097,0.001073,-0.001073,-0.143431,-0.157687,0.010332
NumStationsWithPumpsAttending,0.246801,0.650103,-0.147173,0.038411,-0.009297,1.0,0.926751,0.681908,0.126315,0.010732,...,0.03807,-0.11984,-0.083454,0.007498,-0.096781,-0.021916,0.021916,0.10816,-0.075388,-0.023485
NumPumpsAttending,0.221587,0.701534,-0.185899,0.017946,0.000992,0.926751,1.0,0.734701,0.134663,-0.016813,...,0.04607,-0.132102,-0.093375,0.007366,-0.104801,-0.001599,0.001599,0.069667,-0.040787,-0.010597
PumpCount,0.172868,0.510404,-0.127564,0.018092,0.004115,0.681908,0.734701,1.0,0.752101,-0.012228,...,0.023789,-0.08748,-0.065876,0.004963,-0.078161,-0.003259,0.003259,0.042526,-0.026562,-0.007703
PumpHoursRoundUp,0.055324,0.085351,-0.004945,0.00978,0.005592,0.126315,0.134663,0.752101,1.0,-0.003424,...,-0.015427,-0.001051,-0.005271,0.005651,-0.005452,-0.003224,0.003224,-0.007033,5e-06,-0.001138
Year,0.039363,-0.010161,0.046565,-0.005235,-0.006719,0.010732,-0.016813,-0.012228,-0.003424,1.0,...,-0.014931,-0.018474,-0.00684,0.002476,-0.003774,-0.018906,0.018906,-0.004035,-0.001365,-0.002072


# **Entrainement**

In [104]:
X = df.drop('AttendanceTimeSeconds', axis=1)
y = df['AttendanceTimeSeconds']

In [105]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Standardisation

In [106]:
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Linear Regression

In [107]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [108]:
y_pred = lr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)
print("Mean Percentage Error (MPE): ", mpe)

print("\nScore :")
print('score train :', lr.score(X_train, y_train))
print('score test :', lr.score(X_test, y_test))


Mean Squared Error (MSE):  10188.463890577113
Mean Absolute Error (MAE):  71.62543580496846
R-squared (R²):  0.5352101668983593
Root Mean Squared Error (RMSE):  100.93792097411712
Mean Percentage Error (MPE):  -45.76427241325681

Score :
score train : 0.5360154750504511
score test : 0.5352101668983593


LightGBM

In [109]:
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train, y_train)

y_pred = model_lgb.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in the train set: 459133, number of used features: 35
[LightGBM] [Info] Start training from score 348.932305
Mean Squared Error (MSE):  7865.960856999302
Mean Absolute Error (MAE):  63.450155661738094
R-squared (R²):  0.6411609568258818
Root Mean Squared Error (RMSE):  88.69025232233417


Random Forest

In [110]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)  # n_estimators est le nombre d'arbres dans la forêt
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)


Mean Squared Error (MSE):  7462.234178561525
Mean Absolute Error (MAE):  58.33417916327049
R-squared (R²):  0.6595786552645369
Root Mean Squared Error (RMSE):  86.38422413011259


Le Random Forest est le modèle le plus performant, mais il est aussi plus long à entrainer (11min30sec). 

## Test prédictions

In [117]:
predictions = y_pred
y_true = y

# Générer un échantillon aléatoire pour l'affichage
indices = np.random.choice(len(predictions), size=3, replace=False)

# Afficher l'échantillon
for index in indices:
    print(f"Exemple {index + 1}:")
    print(f"Valeur prédite : {predictions[index]}")
    print(f"Valeur réelle : {y_true[index]}")
    print("-" * 20)

Exemple 77397:
Valeur prédite : 258.003229877379
Valeur réelle : 321
--------------------
Exemple 75759:
Valeur prédite : 341.4935891841936
Valeur réelle : 453
--------------------
Exemple 25698:
Valeur prédite : 436.9254036734475
Valeur réelle : 486
--------------------


## **Features Selection**

In [112]:
# Importance des fonctionnalités
feature_importances = model_lgb.feature_importances_

# Associer les importances aux noms de fonctionnalités
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Trier les importances des fonctionnalités par ordre croissant
sorted_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse = True)

# Afficher les importances des fonctionnalités
for feature, importance in sorted_importances:
    print(f'Feature: {feature}, Importance: {importance}')

Feature: Northing_rounded, Importance: 567
Feature: Easting_rounded, Importance: 551
Feature: PumpOrder, Importance: 248
Feature: DelayCodeId, Importance: 222
Feature: NumStationsWithPumpsAttending, Importance: 218
Feature: IncGeo_WardCode, Importance: 202
Feature: Hour, Importance: 180
Feature: NumPumpsAttending, Importance: 146
Feature: PumpHoursRoundUp, Importance: 109
Feature: PumpCount, Importance: 86
Feature: IncidentStationGround, Importance: 85
Feature: IncGeo_BoroughCode, Importance: 77
Feature: DeployedFromLocation_Home Station, Importance: 48
Feature: DayOfWeek, Importance: 37
Feature: PropertyCategory_Road Vehicle, Importance: 37
Feature: Year, Importance: 32
Feature: IncidentGroup_Special Service, Importance: 26
Feature: PropertyCategory_Dwelling, Importance: 23
Feature: PropertyCategory_Outdoor, Importance: 21
Feature: PropertyCategory_Non Residential, Importance: 18
Feature: StopCodeDescription_AFA, Importance: 15
Feature: StopCodeDescription_False alarm - Good intent, I

In [113]:
important_features = [feature for feature, importance in feature_importance_dict.items() if importance > 50]
important_features

['PumpOrder',
 'DelayCodeId',
 'Easting_rounded',
 'Northing_rounded',
 'NumStationsWithPumpsAttending',
 'NumPumpsAttending',
 'PumpCount',
 'PumpHoursRoundUp',
 'Hour',
 'IncGeo_BoroughCode',
 'IncGeo_WardCode',
 'IncidentStationGround']

In [114]:
# Je réduit le nombre de colonnes en ne conservant que les features qui ont une importance supérieur à 50 :
df_reduced = df[['AttendanceTimeSeconds','PumpOrder',
 'DelayCodeId',
 'Easting_rounded',
 'Northing_rounded',
 'NumStationsWithPumpsAttending',
 'NumPumpsAttending',
 'PumpCount',
 'PumpHoursRoundUp',
 'Hour',
 'DeployedFromLocation_Home Station',
 'IncGeo_BoroughCode',
 'IncGeo_WardCode',
 'IncidentStationGround']]

In [115]:
X = df_reduced.drop('AttendanceTimeSeconds', axis=1)
y = df_reduced['AttendanceTimeSeconds']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

model2_lgb = lgb.LGBMRegressor()
model2_lgb.fit(X_train, y_train)

y_pred = model2_lgb.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1106
[LightGBM] [Info] Number of data points in the train set: 459133, number of used features: 13
[LightGBM] [Info] Start training from score 349.010740
Mean Squared Error (MSE):  7788.755388303297
Mean Absolute Error (MAE):  63.15024187427161
R-squared (R²):  0.6415251614899142
Root Mean Squared Error (RMSE):  88.25392562545474


### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],  # Nombre d'arbres dans la forêt
    'max_depth': [None, 10, 20, 30],  # Profondeur maximale des arbres
}

In [None]:
model_rf = RandomForestRegressor(random_state=42, n_jobs = -1)  # n_estimators est le nombre d'arbres dans la forêt

grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mpe = np.mean((y_test - y_pred) / y_test) * 100

# Affichage des métriques
print("Mean Squared Error (MSE): ", mse)
print("Mean Absolute Error (MAE): ", mae)
print("R-squared (R²): ", r2)
print("Root Mean Squared Error (RMSE): ", rmse)

In [None]:
best_params