## 00 - importation librairies et dataframe

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from utils import dataframe_info, racine_projet

In [2]:
# Display the project root returned by the local `utils.racine_projet` helper;
# the data paths used in this notebook are built by appending to this value.
racine_projet()

'/Users/kfranceschi/GitHub/mai24_bds_pompiers'

In [4]:
# Load the raw merged dataset located under <project root>/data/raw.
raw_csv_path = f"{racine_projet()}/data/raw/merged_data.csv"
df = pd.read_csv(raw_csv_path, low_memory=False)

## 01 - data cleaning pre-viz

#### Création de la Target : ResponseDuration

In [5]:
# Parse the two timestamps the target is derived from.
for timestamp_col in ('DateAndTimeArrived', 'DateAndTimeMobilised'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

# Target: absolute time between mobilisation and arrival, in whole seconds.
elapsed = (df['DateAndTimeArrived'] - df['DateAndTimeMobilised']).abs()
df['ResponseDuration'] = elapsed.dt.total_seconds().astype(int)

#### Création de la colonne DateAndTimeCalled (concaténation de DateOfCall et TimeOfCall), ce qui permet de supprimer ensuite les colonnes redondantes :
DateOfCall / CalYear / TimeOfCall / HourOfCall

In [6]:
# Build a single "<date> <time>" string from the two call columns.
# The separator is a plain scalar string: a one-element list such as [' '] only
# works through array broadcasting and is not a supported way to concatenate text.
# NOTE: the result is a string column; it is not parsed to datetime here.
df['DateAndTimeCalled'] = df['DateOfCall'] + ' ' + df['TimeOfCall']

#### Création de la colonne IncidentType


In [7]:
# 'IncidentType' starts as a copy of 'StopCodeDescription'; rows whose stop code
# is 'Special Service' take the value of 'SpecialServiceType' instead.
is_special_service = df['StopCodeDescription'].eq('Special Service')

df['IncidentType'] = df['StopCodeDescription']
df.loc[is_special_service, 'IncidentType'] = df.loc[is_special_service, 'SpecialServiceType']

#### Suppression de variables

In [8]:
# Dropped because they are useless for the analysis (administrative variables).
variables_drop0 = [
    'IncidentNumber',
    'ResourceMobilisationId',
    'Resource_Code',
    'FRS',
]
df_dropped = df.drop(columns=variables_drop0)

In [9]:
# Dropped because they duplicate other columns or are redundant codified versions.
# Each column is listed exactly once ('ProperCase' previously appeared twice).
variables_drop1 = ['ProperCase',
                   'DeployedFromStation_Code',
                   'PlusCode_Code',
                   'DelayCodeId',
                   'PerformanceReporting',
                   'IncGeo_WardCode',
                   'IncGeo_BoroughCode']
df_dropped = df_dropped.drop(labels=variables_drop1, axis=1)

In [10]:
# Dropped because their share of NaN values is too high.
variables_drop2 = [
    'DelayCode_Description',
    'DateAndTimeReturned',
    'Postcode_full',
    'Easting_m',
    'Northing_m',
    'Latitude',
    'Longitude',
]
df_dropped = df_dropped.drop(columns=variables_drop2)

In [11]:
#Suppresion pour cause de colinéarité (valeurs identique par le calcul)
# Dropped for collinearity: the same values can be obtained by computation.
variables_drop3 = [
    'DateOfCall',
    'TimeOfCall',
    'TurnoutTimeSeconds',
    'TravelTimeSeconds',
    'CalYear',
    'HourOfCall',
]
df_dropped = df_dropped.drop(columns=variables_drop3)

In [12]:
# Rebind `df` to an independent copy of the reduced frame, so the following
# sections work on `df` without modifying `df_dropped`.
df = df_dropped.copy()

## 02 - ajout des data météo

In [13]:
# Load the external weather data located under <project root>/data/external.
weather_csv_path = f"{racine_projet()}/data/external/weather.csv"
weather = pd.read_csv(weather_csv_path, low_memory=False)

In [14]:
# Per-column summary of the weather data (non-null count, NaN count and
# percentage, dtype, example value) from the local `utils.dataframe_info` helper.
dataframe_info(weather)

Unnamed: 0,Column,Non-Null Count,NaN Count,NaN Percentage,Dtype,Example Value
0,date,135096,0,0.0,object,2009-01-01 00:00:00+00:00
1,temperature_2m,135096,0,0.0,float64,-0.3
2,relative_humidity_2m,135096,0,0.0,float64,96.07175
3,rain,135096,0,0.0,float64,0.0
4,weather_code,135096,0,0.0,float64,2.0
5,wind_speed_10m,135096,0,0.0,float64,6.489992
6,wind_gusts_10m,135096,0,0.0,float64,12.959999


In [15]:
# Make sure both timestamp columns are real datetimes before using the .dt accessor.
df['DateAndTimeMobilised'] = pd.to_datetime(df['DateAndTimeMobilised'])
weather['date'] = pd.to_datetime(weather['date'])

# Build the join key: timestamps truncated to the hour, with any timezone removed
# so that both keys are tz-naive and comparable.
# The lowercase alias 'h' is used because the uppercase 'H' is deprecated and
# emits a FutureWarning on recent pandas versions.
# NOTE(review): tz_localize(None) keeps the UTC wall-clock time of the weather
# timestamps (they carry a +00:00 offset). If the mobilisation timestamps are
# local London time, the keys are shifted by one hour during summer time — confirm.
df['datetime_hour'] = df['DateAndTimeMobilised'].dt.floor('h').dt.tz_localize(None)
weather['datetime_hour'] = weather['date'].dt.floor('h').dt.tz_localize(None)

# Left join: every mobilisation row is kept and receives the weather of its hour.
# validate='many_to_one' raises if an hour appears more than once in `weather`,
# which would otherwise silently duplicate mobilisation rows.
# The join key is dropped from the result because it is only a technical column.
merged_df = pd.merge(
    df,
    weather,
    on='datetime_hour',
    how='left',
    suffixes=('_df1', '_df2'),
    validate='many_to_one',
).drop(columns=['datetime_hour'])

  df['datetime_hour'] = df['DateAndTimeMobilised'].dt.floor('H')
  weather['datetime_hour'] = weather['date'].dt.floor('H')


In [16]:
# Per-column summary of the merged frame (incident data plus weather columns),
# used to check NaN counts and dtypes after the join.
dataframe_info(merged_df)

Unnamed: 0,Column,Non-Null Count,NaN Count,NaN Percentage,Dtype,Example Value
0,DateAndTimeMobilised,2365940,0,0.0,datetime64[ns],2009-01-01 00:02:27
1,DateAndTimeMobile,2338168,27772,1.17,object,2009-01-01 00:06:40
2,DateAndTimeArrived,2365940,0,0.0,datetime64[ns],2009-01-01 00:07:46
3,AttendanceTimeSeconds,2365940,0,0.0,int64,319
4,DateAndTimeLeft,2319613,46327,1.96,object,2009-01-01 00:16:46
5,DeployedFromStation_Name,2365917,23,0.0,object,Battersea
6,DeployedFromLocation,2364778,1162,0.05,object,Home Station
7,PumpOrder,2365940,0,0.0,int64,1
8,PlusCode_Description,2365940,0,0.0,object,Initial Mobilisation
9,IncidentGroup,2365940,0,0.0,object,Special Service


In [17]:
# Rebind `df` to the merged frame. This is an alias, not a copy: columns added
# to `df` afterwards are also added to `merged_df`.
df = merged_df

## 02 bis - enregistrement dataviz (optionnel)

In [18]:
# Export the merged table as CSV (without the index) for the data-visualisation step.
dataviz_csv_path = f"{racine_projet()}/data/processed/dataviz.csv"
df.to_csv(dataviz_csv_path, index=False)

## 03 - feature Engineering (après le dataviz)

In [19]:
# Ensure the mobilisation timestamp is a datetime, then derive calendar features from it.
mobilised_at = pd.to_datetime(df['DateAndTimeMobilised'])
df['DateAndTimeMobilised'] = mobilised_at

# 'weekday' follows the pandas convention: Monday=0, Sunday=6.
for calendar_part in ('year', 'month', 'weekday', 'hour'):
    df[calendar_part] = getattr(mobilised_at.dt, calendar_part)

In [20]:
# Columns judged unsuitable as features after the data-visualisation step.
variables_dropfeats = [
    'IncidentStationGround',
    'FirstPumpArriving_DeployedFromStation',
    'SecondPumpArriving_DeployedFromStation',
    'DateAndTimeCalled',
    'DateAndTimeMobilised',
    'DateAndTimeMobile',
    'DateAndTimeArrived',
    'DateAndTimeLeft',
    'AttendanceTimeSeconds',
    'FirstPumpArriving_AttendanceTime',
    'SecondPumpArriving_AttendanceTime',
    'Postcode_district',
    'IncGeo_WardName',
    'IncGeo_WardNameNew',
    'IncGeo_BoroughName',
    'UPRN',
    'USRN',
    'IncidentGroup',
    'StopCodeDescription',
    'SpecialServiceType',
    'PropertyType',
    'PumpCount',
    'PumpMinutesRounded',
    'Notional Cost (£)',
    'date',
]
df_feats = df.drop(columns=variables_dropfeats)

In [21]:
# Per-column summary of the feature frame, to see which columns still contain NaN.
dataframe_info(df_feats)

Unnamed: 0,Column,Non-Null Count,NaN Count,NaN Percentage,Dtype,Example Value
0,DeployedFromStation_Name,2365917,23,0.0,object,Battersea
1,DeployedFromLocation,2364778,1162,0.05,object,Home Station
2,PumpOrder,2365940,0,0.0,int64,1
3,PlusCode_Description,2365940,0,0.0,object,Initial Mobilisation
4,PropertyCategory,2365940,0,0.0,object,Road Vehicle
5,AddressQualifier,2365940,0,0.0,object,In street close to gazetteer location
6,Easting_rounded,2365940,0,0.0,int64,528650
7,Northing_rounded,2365940,0,0.0,int64,176850
8,NumStationsWithPumpsAttending,2365940,0,0.0,float64,2.0
9,NumPumpsAttending,2365940,0,0.0,float64,2.0


## 04 - traitement des NaN

In [22]:
# Work on an independent copy so that `df_feats` is left untouched by the NaN handling.
df_cleaned = df_feats.copy()

In [23]:
# Drop every row that has a NaN in ANY remaining column.
# An earlier variant restricted the drop to 'NumCalls', 'DeployedFromLocation'
# and 'DeployedFromStation_Name' via dropna(subset=[...]); the blanket dropna()
# below replaces it.
df_cleaned = df_cleaned.dropna()

In [24]:
# Per-column summary after the NaN removal; every NaN count is expected to be 0.
dataframe_info(df_cleaned)

Unnamed: 0,Column,Non-Null Count,NaN Count,NaN Percentage,Dtype,Example Value
0,DeployedFromStation_Name,2363411,0,0.0,object,Battersea
1,DeployedFromLocation,2363411,0,0.0,object,Home Station
2,PumpOrder,2363411,0,0.0,int64,1
3,PlusCode_Description,2363411,0,0.0,object,Initial Mobilisation
4,PropertyCategory,2363411,0,0.0,object,Road Vehicle
5,AddressQualifier,2363411,0,0.0,object,In street close to gazetteer location
6,Easting_rounded,2363411,0,0.0,int64,528650
7,Northing_rounded,2363411,0,0.0,int64,176850
8,NumStationsWithPumpsAttending,2363411,0,0.0,float64,2.0
9,NumPumpsAttending,2363411,0,0.0,float64,2.0


## 05 - encodage cyclique des variables temporelles (month/weekday/hour)

In [25]:
# Cyclical encoding of the periodic calendar variables.
# Each variable is mapped to an angle on a circle of the given period and stored
# as a (sin, cos) pair.
df_norm = df_cleaned.copy()

for cyclic_name, period in (('hour', 24), ('weekday', 7), ('month', 12)):
    angle = 2 * np.pi * df_norm[cyclic_name] / period
    df_norm[f'{cyclic_name}_sin'] = np.sin(angle)
    df_norm[f'{cyclic_name}_cos'] = np.cos(angle)

In [26]:
# Remove the raw calendar columns now that their sin/cos encodings exist.
# NOTE(review): this cell is not re-runnable on its own — a second run fails
# because the columns are already gone; re-run the encoding cell first.
df_norm = df_norm.drop(columns = ['month','weekday','hour'])

In [28]:
# Compute the per-column summary once, save it next to the ML dataset, and show it.
# (Calling dataframe_info twice recomputed the summary, and the first result was
# never displayed because it was not the last expression of the cell.)
ml_data_info = dataframe_info(df_norm)
ml_data_info.to_csv(racine_projet()+'/data/processed/ML_data_info.csv', index=False)
ml_data_info

## 06 - export

In [37]:
# Export the final machine-learning table as CSV (without the index).
ml_csv_path = f"{racine_projet()}/data/processed/ML_data.csv"
df_norm.to_csv(ml_csv_path, index=False)