# Import librairies et modules

In [1]:

import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
dbname = os.getenv("DB_NAME")

# Fail fast: os.getenv returns None for an unset variable, which would otherwise be
# formatted into the URL as the literal text "None" and only fail later at connect time.
_db_settings = {
    "DB_USER": user,
    "DB_PASSWORD": password,
    "DB_HOST": host,
    "DB_PORT": port,
    "DB_NAME": dbname,
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: "
        + ", ".join(_missing_settings)
    )

# NOTE(review): credentials are interpolated verbatim. If the user or password can contain
# characters such as '@', ':' or '/', they must be URL-encoded first — confirm how they are stored.
DATABASE_URL = f"postgresql://{user}:{password}@{host}:{port}/{dbname}?sslmode=require"

def fetch_table_from_trafic_db(table_name) :
    '''
        Fetch every row of one table from the traffic database.

        The table name is quoted through the SQL dialect before being placed in the
        query, so it is always treated as an identifier and never as SQL text.
        A schema-qualified name ("schema.table") is quoted part by part.

        Args:
            table_name (str): Name of the table to read.

        Returns:
            pd.DataFrame: A DataFrame holding all rows and columns of the table.
    '''
    engine = create_engine(DATABASE_URL)
    try:
        # Conditional quoting: plain lowercase names are left untouched, anything
        # unusual (spaces, quotes, semicolons, reserved words) gets wrapped in quotes.
        quote = engine.dialect.identifier_preparer.quote
        safe_name = ".".join(quote(part) for part in str(table_name).split("."))
        query = f"SELECT * FROM {safe_name};"
        return pd.read_sql(query, engine)
    finally:
        # Each call builds its own engine; release its connection pool when done.
        engine.dispose()


# Import df

In [2]:
# Load the six source tables; the names on the left line up, in order,
# with the table names on the right.
(
    df_events,
    df_matching_troncon,
    df_pollution_data,
    df_trafic,
    df_weather_data,
    df_holidays,
) = [
    fetch_table_from_trafic_db(table)
    for table in (
        'events_data',
        'matching_event_troncon',
        'pollution_data',
        'trafic_routier',
        'weather_data',
        'weekday_holidays',
    )
]

# Jointures

## Trafic + holidays

### Explo & clean up df_holidays

In [269]:
df_holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   dates        685 non-null    datetime64[ns]
 1   description  685 non-null    object        
 2   is_vacances  685 non-null    int64         
 3   is_ferie     685 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 21.5+ KB


In [270]:
df_holidays.head(2)

Unnamed: 0,dates,description,is_vacances,is_ferie
0,2025-02-15,Saturday,1,0
1,2025-02-16,Sunday,1,0


In [271]:
# Make sure the join key is a datetime column before merging on it.
df_holidays['dates'] = df_holidays['dates'].pipe(pd.to_datetime)

In [272]:
df_holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   dates        685 non-null    datetime64[ns]
 1   description  685 non-null    object        
 2   is_vacances  685 non-null    int64         
 3   is_ferie     685 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 21.5+ KB


### Explo & clean_up df_trafic

In [273]:
df_trafic.head(2)

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,geo_point_2d,geometrie,shape_geo,horodatage,type_geo,coordinates_geo,horodatage_date
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,"{47.18871912163152,-1.5197157847601135}","{{-1.518929725628985,47.189704020892194},{-1.5...",LineString,2025-03-14 05:52:00,Point,"{-1.5197157847601135,47.18871912163152}",2025-03-14
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,"{47.18931582924613,-1.514539402955191}","{{-1.517706427104703,47.18953239803709},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.514539402955191,47.18931582924613}",2025-03-14


In [274]:
df_trafic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554761 entries, 0 to 1554760
Data columns (total 17 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id_technique       1554761 non-null  object        
 1   id                 1554761 non-null  int64         
 2   debit              1554761 non-null  int64         
 3   longueur           1554761 non-null  int64         
 4   taux_occupation    1554761 non-null  float64       
 5   code_couleur       1554761 non-null  int64         
 6   nom_du_troncon     1554761 non-null  object        
 7   etat_du_trafic     1554761 non-null  object        
 8   temps_de_parcours  1554761 non-null  int64         
 9   vitesse            1554761 non-null  int64         
 10  geo_point_2d       1554761 non-null  object        
 11  geometrie          1554761 non-null  object        
 12  shape_geo          1554761 non-null  object        
 13  horodatage         1554761 

In [275]:
# Cast the day-level key to datetime so it is comparable with df_holidays['dates'].
df_trafic['horodatage_date'] = df_trafic['horodatage_date'].pipe(pd.to_datetime)

### Merge

In [276]:
# Attach the calendar attributes (weekday, school holidays, public holidays) to every
# traffic record. The key is given as a column label, not as a Series.
df_trafic_holidays = pd.merge(df_trafic, df_holidays,
                              how = 'left',
                              left_on = 'horodatage_date',
                              right_on = 'dates')

# A left join on a per-day table must not add rows; a different count means
# df_holidays contains the same date more than once.
assert len(df_trafic_holidays) == len(df_trafic), "duplicate dates in df_holidays multiplied traffic rows"

#### Check & Clean

In [277]:
df_trafic_holidays.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,geometrie,shape_geo,horodatage,type_geo,coordinates_geo,horodatage_date,dates,description,is_vacances,is_ferie
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,"{{-1.518929725628985,47.189704020892194},{-1.5...",LineString,2025-03-14 05:52:00,Point,"{-1.5197157847601135,47.18871912163152}",2025-03-14,2025-03-14,Friday,0,0
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,"{{-1.517706427104703,47.18953239803709},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.514539402955191,47.18931582924613}",2025-03-14,2025-03-14,Friday,0,0
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,"{{-1.517779299812194,47.18974561725744},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.515856745416331,47.189627278847524}",2025-03-14,2025-03-14,Friday,0,0
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,"{{-1.51193768710722,47.185509135525734},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.5098737581232031,47.18408144682667}",2025-03-14,2025-03-14,Friday,0,0
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,"{{-1.511785863458118,47.18567104989718},{-1.50...",LineString,2025-03-14 05:52:00,Point,"{-1.5095437456423284,47.18407167545127}",2025-03-14,2025-03-14,Friday,0,0


# 'dates' was only the join key from df_holidays; remove it from the merged frame.
df_trafic_holidays = df_trafic_holidays.drop(columns=['dates'])

In [278]:
# 'description' holds the weekday name (e.g. "Friday"), so give it an explicit label.
df_trafic_holidays = df_trafic_holidays.rename(columns={'description': 'jour'})

In [279]:
df_trafic_holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554761 entries, 0 to 1554760
Data columns (total 21 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id_technique       1554761 non-null  object        
 1   id                 1554761 non-null  int64         
 2   debit              1554761 non-null  int64         
 3   longueur           1554761 non-null  int64         
 4   taux_occupation    1554761 non-null  float64       
 5   code_couleur       1554761 non-null  int64         
 6   nom_du_troncon     1554761 non-null  object        
 7   etat_du_trafic     1554761 non-null  object        
 8   temps_de_parcours  1554761 non-null  int64         
 9   vitesse            1554761 non-null  int64         
 10  geo_point_2d       1554761 non-null  object        
 11  geometrie          1554761 non-null  object        
 12  shape_geo          1554761 non-null  object        
 13  horodatage         1554761 

## Trafic + holidays + weather

#### Explo & clean_up weather table

In [280]:
df_weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            1344 non-null   datetime64[ns]
 1   temperature_2m  1344 non-null   float64       
 2   visibility      1344 non-null   float64       
 3   precipitation   1344 non-null   float64       
 4   wind_speed_10m  1344 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 52.6 KB


In [281]:
df_weather_data.head(5)

Unnamed: 0,date,temperature_2m,visibility,precipitation,wind_speed_10m
0,2025-02-15 22:00:00,3.9805,24140.0,0.0,10.144082
1,2025-02-15 23:00:00,3.2305,24140.0,0.0,11.019764
2,2025-02-16 00:00:00,2.8305,24140.0,0.0,11.885453
3,2025-02-16 01:00:00,2.2805,24140.0,0.0,11.019764
4,2025-02-16 02:00:00,1.8805,24140.0,0.0,10.495713


#### Explo & clean_up df_trafic_holidays

In [282]:
df_trafic_holidays.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,geometrie,shape_geo,horodatage,type_geo,coordinates_geo,horodatage_date,dates,jour,is_vacances,is_ferie
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,"{{-1.518929725628985,47.189704020892194},{-1.5...",LineString,2025-03-14 05:52:00,Point,"{-1.5197157847601135,47.18871912163152}",2025-03-14,2025-03-14,Friday,0,0
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,"{{-1.517706427104703,47.18953239803709},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.514539402955191,47.18931582924613}",2025-03-14,2025-03-14,Friday,0,0
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,"{{-1.517779299812194,47.18974561725744},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.515856745416331,47.189627278847524}",2025-03-14,2025-03-14,Friday,0,0
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,"{{-1.51193768710722,47.185509135525734},{-1.51...",LineString,2025-03-14 05:52:00,Point,"{-1.5098737581232031,47.18408144682667}",2025-03-14,2025-03-14,Friday,0,0
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,"{{-1.511785863458118,47.18567104989718},{-1.50...",LineString,2025-03-14 05:52:00,Point,"{-1.5095437456423284,47.18407167545127}",2025-03-14,2025-03-14,Friday,0,0


In [283]:
# Truncate each timestamp down to the start of its hour (05:52 -> 05:00), not to the
# nearest hour, so it lines up with the hourly rows of the weather dataset.
# The lowercase 'h' alias replaces the deprecated uppercase 'H'.
df_trafic_holidays['rounded_horodatage'] = df_trafic_holidays['horodatage'].dt.floor('h')

  df_trafic_holidays['rounded_horodatage'] = df_trafic_holidays['horodatage'].dt.floor('H')


In [284]:
df_trafic_holidays.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,shape_geo,horodatage,type_geo,coordinates_geo,horodatage_date,dates,jour,is_vacances,is_ferie,rounded_horodatage
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,LineString,2025-03-14 05:52:00,Point,"{-1.5197157847601135,47.18871912163152}",2025-03-14,2025-03-14,Friday,0,0,2025-03-14 05:00:00
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,LineString,2025-03-14 05:52:00,Point,"{-1.514539402955191,47.18931582924613}",2025-03-14,2025-03-14,Friday,0,0,2025-03-14 05:00:00
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,LineString,2025-03-14 05:52:00,Point,"{-1.515856745416331,47.189627278847524}",2025-03-14,2025-03-14,Friday,0,0,2025-03-14 05:00:00
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,LineString,2025-03-14 05:52:00,Point,"{-1.5098737581232031,47.18408144682667}",2025-03-14,2025-03-14,Friday,0,0,2025-03-14 05:00:00
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,LineString,2025-03-14 05:52:00,Point,"{-1.5095437456423284,47.18407167545127}",2025-03-14,2025-03-14,Friday,0,0,2025-03-14 05:00:00


#### Merge

In [285]:
# Bring in the weather measured during the hour of each traffic record.
df_trafic_holidays_weather = df_trafic_holidays.merge(
    df_weather_data,
    how='left',
    left_on='rounded_horodatage',
    right_on='date',
)

In [286]:
df_trafic_holidays_weather.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,dates,jour,is_vacances,is_ferie,rounded_horodatage,date,temperature_2m,visibility,precipitation,wind_speed_10m
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,2025-03-14,Friday,0,0,2025-03-14 05:00:00,2025-03-14 05:00:00,0.5805,24140.0,0.0,6.989935
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,2025-03-14,Friday,0,0,2025-03-14 05:00:00,2025-03-14 05:00:00,0.5805,24140.0,0.0,6.989935
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,2025-03-14,Friday,0,0,2025-03-14 05:00:00,2025-03-14 05:00:00,0.5805,24140.0,0.0,6.989935
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,2025-03-14,Friday,0,0,2025-03-14 05:00:00,2025-03-14 05:00:00,0.5805,24140.0,0.0,6.989935
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,2025-03-14,Friday,0,0,2025-03-14 05:00:00,2025-03-14 05:00:00,0.5805,24140.0,0.0,6.989935


In [287]:
# 'date' repeats 'rounded_horodatage' after the join; keep only one of the two.
df_trafic_holidays_weather = df_trafic_holidays_weather.drop(columns=['date'])

In [288]:
df_trafic_holidays_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554761 entries, 0 to 1554760
Data columns (total 26 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id_technique        1554761 non-null  object        
 1   id                  1554761 non-null  int64         
 2   debit               1554761 non-null  int64         
 3   longueur            1554761 non-null  int64         
 4   taux_occupation     1554761 non-null  float64       
 5   code_couleur        1554761 non-null  int64         
 6   nom_du_troncon      1554761 non-null  object        
 7   etat_du_trafic      1554761 non-null  object        
 8   temps_de_parcours   1554761 non-null  int64         
 9   vitesse             1554761 non-null  int64         
 10  geo_point_2d        1554761 non-null  object        
 11  geometrie           1554761 non-null  object        
 12  shape_geo           1554761 non-null  object        
 13  horodatage  

## Trafic + holidays + weather + pollution

#### Explo & clean_up pollution

In [289]:
df_pollution_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   gml_id            51 non-null     object        
 1   date_ech          51 non-null     datetime64[ns]
 2   code_qual         51 non-null     int64         
 3   lib_qual          51 non-null     object        
 4   coul_qual         51 non-null     object        
 5   date_dif          51 non-null     object        
 6   source            51 non-null     object        
 7   type_zone         51 non-null     object        
 8   code_zone         51 non-null     int64         
 9   lib_zone          51 non-null     object        
 10  code_no2          51 non-null     int64         
 11  code_so2          51 non-null     int64         
 12  code_o3           51 non-null     int64         
 13  code_pm10         51 non-null     int64         
 14  code_pm25         51 non-nul

In [290]:
# Make sure the pollution join key is a datetime column before merging on it.
df_pollution_data['date_ech'] = df_pollution_data['date_ech'].pipe(pd.to_datetime)

In [291]:
df_pollution_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   gml_id            51 non-null     object        
 1   date_ech          51 non-null     datetime64[ns]
 2   code_qual         51 non-null     int64         
 3   lib_qual          51 non-null     object        
 4   coul_qual         51 non-null     object        
 5   date_dif          51 non-null     object        
 6   source            51 non-null     object        
 7   type_zone         51 non-null     object        
 8   code_zone         51 non-null     int64         
 9   lib_zone          51 non-null     object        
 10  code_no2          51 non-null     int64         
 11  code_so2          51 non-null     int64         
 12  code_o3           51 non-null     int64         
 13  code_pm10         51 non-null     int64         
 14  code_pm25         51 non-nul

In [292]:
df_pollution_data.head()

Unnamed: 0,gml_id,date_ech,code_qual,lib_qual,coul_qual,date_dif,source,type_zone,code_zone,lib_zone,...,x_wgs84,y_wgs84,x_reg,y_reg,epsg_reg,etat_indice,geom_type,geom_coordinates,geo_point_2d_lon,geo_point_2d_lat
0,ind_pays_de_la_loire.14923,2025-03-02,2,moyen,#50ccaa,2025-03-03,Air Pays de la Loire,commune,44109,Nantes,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
1,ind_pays_de_la_loire.14027,2025-03-04,3,dégradé,#f0e641,2025-03-05,Air Pays de la Loire,commune,44109,Nantes,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
2,ind_pays_de_la_loire.10884,2025-03-12,2,moyen,#50ccaa,2025-03-13,Air Pays de la Loire,commune,44109,Nantes,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
3,ind_pays_de_la_loire.2805,2025-04-01,2,moyen,#50ccaa,2025-04-02,Air Pays de la Loire,commune,44109,Nantes,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
4,ind_pays_de_la_loire.15733,2025-02-28,2,moyen,#50ccaa,2025-03-01,Air Pays de la Loire,commune,44109,Nantes,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044


In [293]:
# Attach the air-quality index of the day to every traffic record.
df_trafic_holidays_weather_pollution = df_trafic_holidays_weather.merge(
    df_pollution_data,
    how='left',
    left_on='horodatage_date',
    right_on='date_ech',
)

In [294]:
df_trafic_holidays_weather_pollution.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,x_wgs84,y_wgs84,x_reg,y_reg,epsg_reg,etat_indice,geom_type,geom_coordinates,geo_point_2d_lon,geo_point_2d_lat
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044


In [295]:
df_trafic_holidays_weather_pollution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554761 entries, 0 to 1554760
Data columns (total 51 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id_technique        1554761 non-null  object        
 1   id                  1554761 non-null  int64         
 2   debit               1554761 non-null  int64         
 3   longueur            1554761 non-null  int64         
 4   taux_occupation     1554761 non-null  float64       
 5   code_couleur        1554761 non-null  int64         
 6   nom_du_troncon      1554761 non-null  object        
 7   etat_du_trafic      1554761 non-null  object        
 8   temps_de_parcours   1554761 non-null  int64         
 9   vitesse             1554761 non-null  int64         
 10  geo_point_2d        1554761 non-null  object        
 11  geometrie           1554761 non-null  object        
 12  shape_geo           1554761 non-null  object        
 13  horodatage  

## Trafic + holidays + weather + pollution + event

#### Explo & clean_up event

In [296]:
df_matching_troncon.head()

Unnamed: 0,index,id_du_troncon,nom_du_troncon,has_event_near_troncon,event_location_name,event_location_id
0,27,97,St Joseph I04,1,Parc des Expositions de la Beaujoire,34310089
1,171,5208,Viviani P2,1,Palais des Sports Beaulieu,57179132
2,251,98,St Joseph I05,1,Stade de la Beaujoire Louis Fonteneau,95545941
3,317,139,Bureau I,1,STEREOLUX,21547859
4,465,5207,Viviani P1,1,Palais des Sports Beaulieu,57179132


In [297]:
df_matching_troncon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   index                   19 non-null     int64 
 1   id_du_troncon           19 non-null     object
 2   nom_du_troncon          19 non-null     object
 3   has_event_near_troncon  19 non-null     int64 
 4   event_location_name     19 non-null     object
 5   event_location_id       19 non-null     object
dtypes: int64(2), object(4)
memory usage: 1.0+ KB


In [298]:
# Store tronçon ids as text so they compare equal to the string ids used for matching.
df_matching_troncon['id_du_troncon'] = df_matching_troncon['id_du_troncon'].astype(str)

In [299]:
df_matching_troncon.head()

Unnamed: 0,index,id_du_troncon,nom_du_troncon,has_event_near_troncon,event_location_name,event_location_id
0,27,97,St Joseph I04,1,Parc des Expositions de la Beaujoire,34310089
1,171,5208,Viviani P2,1,Palais des Sports Beaulieu,57179132
2,251,98,St Joseph I05,1,Stade de la Beaujoire Louis Fonteneau,95545941
3,317,139,Bureau I,1,STEREOLUX,21547859
4,465,5207,Viviani P1,1,Palais des Sports Beaulieu,57179132


In [300]:
df_events.head()

Unnamed: 0,name,description,startdate,enddate,location_name,address,city,postalcode,location_uid,coordinates_geo,event_id
0,Football : FC Nantes / Paris SG,Match - 29e journée - Ligue 1 - Saison 2024/2025,2025-04-13 17:00:00,2025-04-13 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",926afe1f-3bcc-4cc1-a2bd-3597dd36d86d
1,Football : FC Nantes / Toulouse FC,Match - 31e journée - Ligue 1 - Saison 2024/2025,2025-04-27 17:00:00,2025-04-27 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",19bb79da-1c20-41a2-a53d-c45bcae5a0ce
2,Blandine Lehout - La Vie de ta mère,Blandine Lehout - La Vie de ta mère,2025-10-01 20:00:00,2025-10-01 22:30:00,Cité internationale des Congrès,"5 Rue de Valmy, Nantes",Nantes,44000,47538434,"{-1.544058, 47.213314}",da51b672-08bc-49e6-bcd6-516090c48e77
3,Alexandre Kominek - Bâtard sensible,Alexandre Kominek - Bâtard sensible,2025-03-01 20:30:00,2025-03-01 22:00:00,Cité internationale des Congrès,"5 Rue de Valmy, Nantes",Nantes,44000,47538434,"{-1.544058, 47.213314}",209c09e0-8d45-4429-ab15-7e1bf979f9c1
4,Comédie Le Clan des divorcées,Comédie Le Clan des divorcées,2025-03-02 15:00:00,2025-03-02 16:40:00,Cité internationale des Congrès,"5 Rue de Valmy, Nantes",Nantes,44000,47538434,"{-1.544058, 47.213314}",2eae861f-2e7f-4ff4-8c77-8f72b719fb23


In [301]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   name             134 non-null    object        
 1   description      134 non-null    object        
 2   startdate        134 non-null    datetime64[ns]
 3   enddate          134 non-null    datetime64[ns]
 4   location_name    134 non-null    object        
 5   address          134 non-null    object        
 6   city             134 non-null    object        
 7   postalcode       134 non-null    object        
 8   location_uid     134 non-null    object        
 9   coordinates_geo  134 non-null    object        
 10  event_id         134 non-null    object        
dtypes: datetime64[ns](2), object(9)
memory usage: 11.6+ KB


In [302]:
# Store venue ids as text so they join against 'event_location_id'.
df_events['location_uid'] = df_events['location_uid'].astype(str)

In [303]:
# NOTE(review): exact duplicate of the previous cell; casting an already-string column
# to str again changes nothing, so this cell can likely be removed.
df_events['location_uid'] = df_events['location_uid'].astype('str')

In [304]:
# Pair every event with each tronçon located near its venue
# (one output row per event/tronçon combination).
df_event_troncon = df_events.merge(
    df_matching_troncon,
    how='left',
    left_on='location_uid',
    right_on='event_location_id',
)

In [305]:
df_event_troncon.head()

Unnamed: 0,name,description,startdate,enddate,location_name,address,city,postalcode,location_uid,coordinates_geo,event_id,index,id_du_troncon,nom_du_troncon,has_event_near_troncon,event_location_name,event_location_id
0,Football : FC Nantes / Paris SG,Match - 29e journée - Ligue 1 - Saison 2024/2025,2025-04-13 17:00:00,2025-04-13 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",926afe1f-3bcc-4cc1-a2bd-3597dd36d86d,251.0,98,St Joseph I05,1.0,Stade de la Beaujoire Louis Fonteneau,95545941
1,Football : FC Nantes / Paris SG,Match - 29e journée - Ligue 1 - Saison 2024/2025,2025-04-13 17:00:00,2025-04-13 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",926afe1f-3bcc-4cc1-a2bd-3597dd36d86d,800.0,5098,St Joseph P09,1.0,Stade de la Beaujoire Louis Fonteneau,95545941
2,Football : FC Nantes / Paris SG,Match - 29e journée - Ligue 1 - Saison 2024/2025,2025-04-13 17:00:00,2025-04-13 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",926afe1f-3bcc-4cc1-a2bd-3597dd36d86d,801.0,5097,St Joseph P10,1.0,Stade de la Beaujoire Louis Fonteneau,95545941
3,Football : FC Nantes / Paris SG,Match - 29e journée - Ligue 1 - Saison 2024/2025,2025-04-13 17:00:00,2025-04-13 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",926afe1f-3bcc-4cc1-a2bd-3597dd36d86d,849.0,97,St Joseph I04,1.0,Stade de la Beaujoire Louis Fonteneau,95545941
4,Football : FC Nantes / Toulouse FC,Match - 31e journée - Ligue 1 - Saison 2024/2025,2025-04-27 17:00:00,2025-04-27 19:00:00,Stade de la Beaujoire Louis Fonteneau,"330 Route de Saint Joseph, Nantes",Nantes,44300,95545941,"{-1.527812, 47.25867}",19bb79da-1c20-41a2-a53d-c45bcae5a0ce,251.0,98,St Joseph I05,1.0,Stade de la Beaujoire Louis Fonteneau,95545941


In [306]:
# Columns brought in by the matching table that are not needed after the join.
matching_only_columns = ['index', 'nom_du_troncon', 'event_location_name', 'event_location_id']
df_event_troncon = df_event_troncon.drop(columns=matching_only_columns)

#### Merge

In [307]:
df_trafic_holidays_weather_pollution.head()

Unnamed: 0,id_technique,id,debit,longueur,taux_occupation,code_couleur,nom_du_troncon,etat_du_trafic,temps_de_parcours,vitesse,...,x_wgs84,y_wgs84,x_reg,y_reg,epsg_reg,etat_indice,geom_type,geom_coordinates,geo_point_2d_lon,geo_point_2d_lat
0,1081-20250314T055200,1081,0,245,0.0,2,Bourdonnieres P1,Indéterminé,96,9,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
1,6126-20250314T055200,6126,0,482,0.0,2,Clisson P4,Indéterminé,94,18,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
2,1126-20250314T055200,1126,120,292,2.0,3,Clisson I4,Fluide,57,18,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
3,6120-20250314T055200,6120,60,450,0.8,3,Grand Maison P1,Fluide,132,12,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044
4,1120-20250314T055200,1120,0,496,0.0,2,Grand Maison I2,Indéterminé,83,22,...,-1.560706,47.233326,355136.6,6691409.5,2154,Consolidée,Feature,"[[[[-1.532422818, 47.295023199], [-1.527218409...",-1.548206,47.232044


In [308]:
df_trafic_holidays_weather_pollution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554761 entries, 0 to 1554760
Data columns (total 51 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id_technique        1554761 non-null  object        
 1   id                  1554761 non-null  int64         
 2   debit               1554761 non-null  int64         
 3   longueur            1554761 non-null  int64         
 4   taux_occupation     1554761 non-null  float64       
 5   code_couleur        1554761 non-null  int64         
 6   nom_du_troncon      1554761 non-null  object        
 7   etat_du_trafic      1554761 non-null  object        
 8   temps_de_parcours   1554761 non-null  int64         
 9   vitesse             1554761 non-null  int64         
 10  geo_point_2d        1554761 non-null  object        
 11  geometrie           1554761 non-null  object        
 12  shape_geo           1554761 non-null  object        
 13  horodatage  

In [309]:
# Tronçon ids must be text on both sides to compare equal to 'id_du_troncon'.
df_trafic_holidays_weather_pollution['id'] = df_trafic_holidays_weather_pollution['id'].astype(str)

In [310]:
df_event_troncon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   name                    500 non-null    object        
 1   description             500 non-null    object        
 2   startdate               500 non-null    datetime64[ns]
 3   enddate                 500 non-null    datetime64[ns]
 4   location_name           500 non-null    object        
 5   address                 500 non-null    object        
 6   city                    500 non-null    object        
 7   postalcode              500 non-null    object        
 8   location_uid            500 non-null    object        
 9   coordinates_geo         500 non-null    object        
 10  event_id                500 non-null    object        
 11  id_du_troncon           489 non-null    object        
 12  has_event_near_troncon  489 non-null    float64   

In [None]:
def match_events(row, df_event, minutes_before_start=30, minutes_after_start=15, minutes_after_end=30):
    """
    Tell whether a traffic record falls inside the arrival or departure window
    of an event located near its tronçon.

    A record matches an event when its timestamp lies in either
    [startdate - minutes_before_start, startdate + minutes_after_start] or
    [enddate, enddate + minutes_after_end] (bounds included).

    Args:
        row: One traffic record exposing 'id' (tronçon id) and 'horodatage' (timestamp).
        df_event (pd.DataFrame): Event/tronçon pairs with the columns 'id_du_troncon',
            'startdate', 'enddate' and 'has_event_near_troncon'.
        minutes_before_start (int): Minutes before the event start that still count.
        minutes_after_start (int): Minutes after the event start that still count.
        minutes_after_end (int): Minutes after the event end that still count.

    Returns:
        bool: True when the first matching event carries a truthy
        'has_event_near_troncon' flag, False otherwise (including when nothing matches).
    """
    troncon_id = row['id']
    horodatage = row['horodatage']

    events = df_event[df_event['id_du_troncon'] == troncon_id]
    if events.empty:
        return False

    start = events['startdate']
    end = events['enddate']

    # Evaluate both windows for all candidate events at once instead of looping over rows.
    around_start = (
        (start - pd.Timedelta(minutes=minutes_before_start) <= horodatage)
        & (start + pd.Timedelta(minutes=minutes_after_start) >= horodatage)
    )
    after_end = (
        (end <= horodatage)
        & (end + pd.Timedelta(minutes=minutes_after_end) >= horodatage)
    )

    flags = events.loc[around_start | after_end, 'has_event_near_troncon']
    if flags.empty:
        return False

    # The flag column is float after the left join (1.0 / NaN); always hand back a real
    # bool so the resulting column has a single, consistent type.
    first_flag = flags.iloc[0]
    return bool(pd.notna(first_flag) and first_flag)

# Apply the matching function.
# Only tronçons listed in df_event_troncon can ever match, so the slow row-wise apply is
# limited to those records; every other record gets False, which is what match_events
# returns when a tronçon has no event.
troncons_with_events = df_event_troncon['id_du_troncon'].dropna().unique()
is_candidate = df_trafic_holidays_weather_pollution['id'].isin(troncons_with_events)

if is_candidate.any():
    event_flags = (
        df_trafic_holidays_weather_pollution.loc[is_candidate]
        .apply(match_events, axis=1, df_event=df_event_troncon)
        .reindex(df_trafic_holidays_weather_pollution.index, fill_value=False)
    )
else:
    # apply(axis=1) on an empty frame does not return a Series, so skip it entirely.
    event_flags = False

df_trafic_holidays_weather_pollution['has_event_near_troncon'] = event_flags


                 id_technique    id  debit  longueur  taux_occupation  \
0        1081-20250314T055200  1081      0       245              0.0   
1        6126-20250314T055200  6126      0       482              0.0   
2        1126-20250314T055200  1126    120       292              2.0   
3        6120-20250314T055200  6120     60       450              0.8   
4        1120-20250314T055200  1120      0       496              0.0   
...                       ...   ...    ...       ...              ...   
1554756  5447-20250314T055200  5447    120       339              1.6   
1554757  5756-20250314T055200  5756      0       431              0.0   
1554758  5449-20250314T055200  5449     60       425              0.4   
1554759   446-20250314T055200   446     60       472              1.2   
1554760     6-20250314T055200     6      0       205              0.0   

         code_couleur    nom_du_troncon etat_du_trafic  temps_de_parcours  \
0                   2  Bourdonnieres P1    Ind

In [325]:
# Tronçon ids that received at least one event flag (sanity check of the matching).
event_mask = df_trafic_holidays_weather_pollution['has_event_near_troncon'] == True
df_trafic_holidays_weather_pollution.loc[event_mask, 'id'].unique()

array(['5319', '131', '5131', '319', '5097', '97', '98', '5098', '139',
       '5798', '5139', '798', '208', '5208', '5207'], dtype=object)

In [324]:
df_event_troncon.columns

Index(['name', 'description', 'startdate', 'enddate', 'location_name',
       'address', 'city', 'postalcode', 'location_uid', 'coordinates_geo',
       'event_id', 'id_du_troncon', 'has_event_near_troncon'],
      dtype='object')

In [326]:
df_event_troncon['id_du_troncon'].unique()

array(['98', '5098', '5097', '97', '5319', '5131', '131', '319', '5208',
       '5207', '208', '139', '5139', '5798', '798', nan], dtype=object)