### Importing libraries

In [1]:
import pandas as pd
import plotly.express as px
import os

## data gouv API call
import requests

# formating files (filenames, encodings, separators)
import yaml


### Requesting API

### Get datasets DL urls

In [2]:
def get_datasets_url(url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/'):
    r = requests.get(url).json()
    return {el['title']: el['latest'] for el in r['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules") }

lieux_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("lieux")}
usagers_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("usagers")}
car_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("car")}
vehicule_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("vehicule")}

all_urls = [lieux_datasets, usagers_datasets, car_datasets,vehicule_datasets]


### Downloading csv's if not already

In [3]:

for url_dict in all_urls:
    for path, url in url_dict.items():
        path = '../raw_data/' + path
        if not os.path.exists(path):
            response = requests.get(url)
            if response.status_code == 200:
                with open(path, 'wb') as f:
                    f.write(response.content)


In [4]:
folder = "../raw_data/"
os.listdir(folder)


['lieux_2019.csv',
 'lieux-2022.csv',
 'vehicules_2020.csv',
 'caracteristiques_2013.csv',
 'caracteristiques_2009.csv',
 'lieux_2013.csv',
 'vehicules_2005.csv',
 'vehicules_2012.csv',
 'vehicules_2013.csv',
 'usagers_2010.csv',
 'lieux_2010.csv',
 'carcteristiques-2022.csv',
 'caracteristiques_2006.csv',
 'usagers_2021.csv',
 'usagers_2022.csv',
 'vehicules_2019.csv',
 'carcteristiques-2021.csv',
 'caracteristiques_2017.csv',
 'usagers-2021.csv',
 'usagers_2019.csv',
 'lieux-2021.csv',
 'usagers_2016.csv',
 'vehicules-2019.csv',
 'caracteristiques_2016.csv',
 'vehicules-2018.csv',
 'caracteristiques_2022.csv',
 'caracteristiques_2011.csv',
 'lieux_2020.csv',
 'lieux_2018.csv',
 'lieux-2018.csv',
 'usagers_2012.csv',
 'caracteristiques-2017.csv',
 'caracteristiques-2020.csv',
 'vehicules-2017.csv',
 'vehicules-2021.csv',
 'caracteristiques_2020.csv',
 'lieux_2022.csv',
 'usagers-2019.csv',
 'caracteristiques_2007.csv',
 'caracteristiques_2012.csv',
 'caracteristiques-2018.csv',
 'usag

### Detect separator & read csv

In [5]:
# def detect_separator(file_path):
#     with open(file_path, 'r') as file:
#         first_line = file.readline()
#         if ';' in first_line:
#             return ';'
#         elif '|' in first_line:
#             return '|'
#         else:
#             return ','

# folder = "../raw_data/"
# dff = []
# files = [file for file in os.listdir(folder)]

# for file in files:
#     file_path = os.path.join(folder, file)
#     sep = detect_separator(file_path)
#     df = pd.read_csv(file_path, sep=sep)
#     dff.append(df)

# df_final = pd.concat(dff)


In [6]:
chemin_fichier_yml = '../config.yml'
chemin_dossier = '../raw_data/'


with open (chemin_fichier_yml, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    rename_config = config.get('rename')

for old_name, new_name in rename_config.items():
    chemin_ancien_fichier = os.path.join(chemin_dossier, old_name)
    chemin_nouveau_fichier = os.path.join(chemin_dossier, new_name)

    if os.path.exists(chemin_ancien_fichier):
        os.rename(chemin_ancien_fichier, chemin_nouveau_fichier)
        print(f"Fichier renommé : {old_name} -> {new_name}")
    else:
        print(f"Fichier non trouvé : {old_name}")


Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

In [7]:
def concat_files(starting_word):

    chemin_fichier_yml = '../config.yml'
    with open(chemin_fichier_yml, 'r') as f:
        config = yaml.safe_load(f)
        config_sep = config.get('sep')
        config_encoding = config.get('encoding')

    chemin_dossier = '../raw_data/'

    df_concat = pd.DataFrame()
    files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

    print(files)
    for file in files:
        chemin_fichier = os.path.join(chemin_dossier, file)

        if file in config_sep:
            sep = config_sep[file]
        else:
            sep = ','

        if file in config_encoding:
            encoding = config_encoding[file]
        else:
            encoding = 'utf-8'

        df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

        df_concat = pd.concat([df_concat, df1])

    return df_concat


In [8]:
carac_df = concat_files("caracteristiques")
lieux_df = concat_files("lieux")
usager_df = concat_files("usagers")
vehi_df = concat_files("vehicules")


['caracteristiques_2013.csv', 'caracteristiques_2009.csv', 'caracteristiques_2006.csv', 'caracteristiques_2017.csv', 'caracteristiques_2016.csv', 'caracteristiques_2022.csv', 'caracteristiques_2011.csv', 'caracteristiques_2020.csv', 'caracteristiques_2007.csv', 'caracteristiques_2012.csv', 'caracteristiques_2019.csv', 'caracteristiques_2021.csv', 'caracteristiques_2018.csv', 'caracteristiques_2005.csv', 'caracteristiques_2014.csv', 'caracteristiques_2010.csv', 'caracteristiques_2015.csv', 'caracteristiques_2008.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['lieux_2019.csv', 'lieux_2013.csv', 'lieux_2010.csv', 'lieux_2020.csv', 'lieux_2018.csv', 'lieux_2022.csv', 'lieux_2021.csv', 'lieux_2006.csv', 'lieux_2017.csv', 'lieux_2015.csv', 'lieux_2012.csv', 'lieux_2009.csv', 'lieux_2007.csv', 'lieux_2014.csv', 'lieux_2008.csv', 'lieux_2016.csv', 'lieux_2011.csv', 'lieux_2005.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['usagers_2010.csv', 'usagers_2021.csv', 'usagers_2022.csv', 'usagers_2019.csv', 'usagers_2016.csv', 'usagers_2012.csv', 'usagers_2015.csv', 'usagers_2011.csv', 'usagers_2020.csv', 'usagers_2006.csv', 'usagers_2014.csv', 'usagers_2008.csv', 'usagers_2017.csv', 'usagers_2018.csv', 'usagers_2007.csv', 'usagers_2009.csv', 'usagers_2005.csv', 'usagers_2013.csv']
['vehicules_2020.csv', 'vehicules_2005.csv', 'vehicules_2012.csv', 'vehicules_2013.csv', 'vehicules_2019.csv', 'vehicules_2014.csv', 'vehicules_2021.csv', 'vehicules_2006.csv', 'vehicules_2015.csv', 'vehicules_2009.csv', 'vehicules_2010.csv', 'vehicules_2016.csv', 'vehicules_2008.csv', 'vehicules_2018.csv', 'vehicules_2022.csv', 'vehicules_2011.csv', 'vehicules_2017.csv', 'vehicules_2007.csv']


In [9]:
# drop useless columns
vehi_df = vehi_df.drop(["id_vehicule", "motor", "num_veh", "occutc", "senc"], axis=1)

# get bike id's
id_bikes = vehi_df[vehi_df.catv == 1].Num_Acc.values

# create df of accidents involving bikes (w/ duplicated Num_Acc)
bikes_df = vehi_df[vehi_df.Num_Acc.isin(id_bikes)]

# get accidents by number of parties involved
grouped = bikes_df.groupby('Num_Acc').count()

accident_alone_idx = grouped[grouped.catv == 1].index #12k
accident_2p_idx = grouped[grouped.catv == 2].index #71k
accident_3p_idx = grouped[grouped.catv > 3].index #263


In [21]:
accident_2p = bikes_df[bikes_df.Num_Acc.isin(accident_2p_idx)]
accident_2p


Unnamed: 0,Num_Acc,catv,obs,obsm,choc,manv
17,202000000011,7,0.0,2.0,2.0,15.0
18,202000000011,1,0.0,2.0,5.0,19.0
21,202000000014,1,0.0,2.0,4.0,16.0
22,202000000014,7,0.0,2.0,2.0,16.0
49,202000000031,1,0.0,2.0,8.0,19.0
...,...,...,...,...,...,...
144043,200700084678,7,0.0,9.0,1.0,16.0
144061,200700084687,1,0.0,2.0,1.0,1.0
144062,200700084687,7,0.0,9.0,8.0,1.0
144081,200700084698,37,0.0,2.0,3.0,0.0


In [110]:
bikes_df.catv.


Unnamed: 0,Num_Acc,catv,obs,obsm,choc,manv
44,200700000029,1,0.0,2.0,1.0,19.0
45,200700000029,7,0.0,0.0,0.0,1.0
49,200700000032,2,0.0,2.0,1.0,5.0
50,200700000032,1,0.0,2.0,1.0,1.0
230,200700000136,1,0.0,2.0,1.0,1.0
...,...,...,...,...,...,...
97119,202100056404,1,0.0,2.0,1.0,25.0
97120,202100056404,7,0.0,2.0,1.0,1.0
97158,202100056424,7,0.0,2.0,2.0,9.0
97159,202100056424,1,0.0,2.0,1.0,1.0


In [59]:
vehi_df[vehi_df.catv == 1].isna().sum() / vehi_df[vehi_df.catv == 1].isna().shape[0]


Num_Acc        0.000000
senc           0.000304
catv           0.000000
occutc         0.235765
obs            0.000810
obsm           0.000743
choc           0.000394
manv           0.000405
num_veh        0.000000
id_vehicule    0.764235
motor          0.764235
dtype: float64

In [75]:
vehi_df[vehi_df.catv == 1].choc.value_counts()


 1.0    38014
 0.0    12228
 8.0    10905
 4.0     8628
 7.0     7050
 3.0     4890
 2.0     3631
 6.0     1826
 5.0     1193
 9.0      476
-1.0        9
Name: choc, dtype: int64

In [56]:
usager_df.id_usager.isna().sum() / usager_df.shape[0]


0.9029311816936652

In [58]:
vehi_df[vehi_df.Num_Acc.isin(id_bikes)]
#vehi_df.drop([])
vehi_df.catv.value_counts()


 7     1237634
 33     144087
 10     107333
 2      101713
 1       88885
 30      77039
 32      41609
 31      40339
 5       21816
 34      21166
 14      19832
 15      17301
 17      15954
 37      12969
 99       8283
 3        8072
 13       7624
 4        7463
 50       5116
 21       4177
 38       3496
 36       2574
 40       2305
 20       2150
 18       2075
 43       1995
 80       1793
 0        1030
 16        890
 60        754
 35        527
 39        503
 9         300
 19        183
 8         120
 42         83
 12         79
 41         57
 6          39
 11         17
-1          13
Name: catv, dtype: int64

In [28]:
vehi_df.columns


Index(['Num_Acc', 'senc', 'catv', 'occutc', 'obs', 'obsm', 'choc', 'manv',
       'num_veh', 'id_vehicule', 'motor'],
      dtype='object')

### Loading data

In [76]:
## accidents 2005 - 2021
data = pd.read_csv("../raw_data/Accidents/accidentsVelo.csv")

# data 2022
carac_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/carcteristiques-2022.csv', sep=";")
lieux_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/lieux-2022.csv', sep=";")
usagers_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/lieux-2022.csv', sep=";")
vehicule_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/vehicules-2022.csv', sep=";")

all_2022 = carac_2022.merge(lieux_2022, left_on="Accident_Id", right_on="Num_Acc").merge(vehicule_2022, left_on="Accident_Id", right_on="Num_Acc").merge(usagers_2022, left_on="Accident_Id", right_on="Num_Acc")



Columns (8,9,20,21,30) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



In [82]:
data


Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
0,200500000030,2005-01-13,2005,janvier,jeudi,19:45,62,62331,50.3,2.84,...,0,0,0.0,2.0,8.0,11.0,200500000030B02,18,17,1.0
1,200500000034,2005-01-19,2005,janvier,mercredi,10:45,62,62022,0.0,0.0,...,0,0,0.0,2.0,1.0,1.0,200500000034B02,10,15,1.0
2,200500000078,2005-01-26,2005,janvier,mercredi,13:15,02,02173,0.0,0.0,...,1,2,0.0,2.0,1.0,1.0,200500000078B02,7,15,1.0
3,200500000093,2005-01-03,2005,janvier,lundi,13:30,02,02810,49.255,3.094,...,0,0,0.0,2.0,3.0,21.0,200500000093B02,7,21,1.0
4,200500000170,2005-01-29,2005,janvier,samedi,18:30,76,76196,0.0,0.0,...,1,9,0.0,2.0,4.0,2.0,200500000170A01,10,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74753,202100056317,2021-01-02,2021,janvier,samedi,18:30,44,44168,473777890000,-21976410000,...,2,,0.0,0.0,8.0,1.0,202100056317B01,7,14,1.0
74754,202100056362,2021-01-04,2021,janvier,lundi,08:20,64,64138,432309460000,-02765840000,...,1,2,0.0,2.0,0.0,1.0,202100056362B01,7,15,1.0
74755,202100056404,2021-01-01,2021,janvier,vendredi,16:55,54,54395,486849869839,61760189384,...,1,2,0.0,2.0,1.0,25.0,202100056404A01,7,1,1.0
74756,202100056424,2021-01-02,2021,janvier,samedi,15:40,75,75110,488769050000,23665940000,...,2,,0.0,2.0,1.0,1.0,202100056424A01,7,9,1.0


In [93]:
all_bikes_2022 = all_2022[all_2022.catv == 1]


In [97]:
all_bikes_2022.merge(data, left_on="Accident_Id", right_on="Num_Acc")



Passing 'suffixes' which cause duplicate columns {'Num_Acc_x'} in the result is deprecated and will raise a MergeError in a future version.



Unnamed: 0,Accident_Id,jour_x,mois_x,an_x,hrmn_x,lum_x,dep_x,com_x,agg_x,int_x,...,secuexist,equipement,obs_y,obsm_y,choc_y,manv_y,vehiculeid,typevehicules,manoeuvehicules,numVehicules


In [95]:
#all_bikes_2022.merge(data, how='outer', left_on="Accident_Id", right_on="Num_Acc")



Passing 'suffixes' which cause duplicate columns {'Num_Acc_x'} in the result is deprecated and will raise a MergeError in a future version.



Unnamed: 0,Accident_Id,jour_x,mois_x,an_x,hrmn_x,lum_x,dep_x,com_x,agg_x,int_x,...,secuexist,equipement,obs_y,obsm_y,choc_y,manv_y,vehiculeid,typevehicules,manoeuvehicules,numVehicules
0,2.022000e+11,21.0,10.0,2022.0,16:32,1.0,75,75106,2.0,1.0,...,,,,,,,,,,
1,2.022000e+11,20.0,10.0,2022.0,13:00,1.0,75,75105,2.0,2.0,...,,,,,,,,,,
2,2.022000e+11,21.0,10.0,2022.0,11:25,1.0,75,75113,2.0,4.0,...,,,,,,,,,,
3,2.022000e+11,21.0,10.0,2022.0,15:50,1.0,75,75103,2.0,2.0,...,,,,,,,,,,
4,2.022000e+11,21.0,10.0,2022.0,19:40,5.0,93,93049,2.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80073,,,,,,,,,,,...,2.0,,0.0,0.0,8.0,1.0,202100056317B01,7,14,1.0
80074,,,,,,,,,,,...,1.0,2,0.0,2.0,0.0,1.0,202100056362B01,7,15,1.0
80075,,,,,,,,,,,...,1.0,2,0.0,2.0,1.0,25.0,202100056404A01,7,1,1.0
80076,,,,,,,,,,,...,2.0,,0.0,2.0,1.0,1.0,202100056424A01,7,9,1.0


In [49]:
data = data.query('lat != 0 & long != 0').reset_index(drop=True)
data.dep.nunique()


113

In [None]:
data


In [48]:
paris_df = data[data.dep == "75"]
mars_df = data[data.dep == "13"]


array(['13'], dtype=object)

In [45]:
paris_df


Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
3913,201000062968,2010-10-03,2010,octobre,dimanche,18:45,75,75112,48.83464,2.43893,...,1,2,0.0,0.0,5.0,17.0,201000062968A01,1,1,1.0
3914,201000062968,2010-10-03,2010,octobre,dimanche,18:45,75,75112,48.83464,2.43893,...,1,2,0.0,0.0,3.0,1.0,201000062968B01,1,17,1.0
12273,201500050797,2015-06-23,2015,juin,mardi,18:30,75,75103,48.52011,2.21485,...,1,2,0.0,1.0,1.0,3.0,201500050797A01,,,
12274,201500052215,2015-08-13,2015,août,jeudi,20:30,75,75110,48.52501,2.21432,...,1,2,0.0,2.0,0.0,1.0,201500052215A01,31,1,1.0
12275,201500055885,2015-06-27,2015,juin,samedi,14:20,75,75118,48.53181,2.2107,...,1,2,0.0,0.0,7.0,18.0,201500055885B01,7,22,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31614,202100056069,2021-01-06,2021,janvier,mercredi,16:15,75,75112,488446739656,24057128628,...,2,,0.0,2.0,8.0,1.0,202100056069B01,7,14,1.0
31616,202100056184,2021-01-04,2021,janvier,lundi,17:35,75,75103,488623960000,23555220000,...,1,2,0.0,0.0,1.0,21.0,202100056184A01,,,
31618,202100056276,2021-01-04,2021,janvier,lundi,18:25,75,75119,488816830000,23810550000,...,1,2/6,0.0,2.0,3.0,1.0,202100056276B01,7,17,1.0
31619,202100056283,2021-01-04,2021,janvier,lundi,19:40,75,75101,488649640000,23347680000,...,2,,0.0,1.0,1.0,1.0,202100056283A01,,,


In [42]:
mars_df.columns


Index(['Num_Acc', 'date', 'an', 'mois', 'jour', 'hrmn', 'dep', 'com', 'lat',
       'long', 'agg', 'int', 'col', 'lum', 'atm', 'catr', 'circ', 'nbv',
       'prof', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ', 'grav',
       'sexe', 'age', 'trajet', 'secuexist', 'equipement', 'obs', 'obsm',
       'choc', 'manv', 'vehiculeid', 'typevehicules', 'manoeuvehicules',
       'numVehicules'],
      dtype='object')

Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
209,200500026234,2005-05-05,2005,mai,jeudi,14:15,13,13110,43.449,5.689,...,0,0,0.0,2.0,1.0,5.0,200500026234B02,7,1,1.0
226,200500026972,2005-05-12,2005,mai,jeudi,12:15,13,13113,43.6,5.48,...,1,2,0.0,2.0,1.0,1.0,200500026972B02,7,15,1.0
227,200500027020,2005-05-11,2005,mai,mercredi,14:15,13,13083,43.916,4.808,...,1,2,0.0,2.0,1.0,1.0,200500027020A01,7,1,1.0
294,200500039391,2005-06-01,2005,juin,mercredi,19:30,13,13019,43.445,5.362,...,0,0,0.0,2.0,1.0,1.0,200500039391B02,7,16,1.0
295,200500039402,2005-06-24,2005,juin,vendredi,16:15,13,13027,43.896,4.832,...,1,4,0.0,2.0,8.0,13.0,200500039402A01,15,17,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30867,202100042976,2021-04-24,2021,avril,samedi,07:45,13,13032,435779040000,53365450000,...,1,2,0.0,2.0,1.0,1.0,202100042976B01,7,17,1.0
30868,202100042976,2021-04-24,2021,avril,samedi,07:45,13,13032,435779040000,53365450000,...,1,2,0.0,2.0,0.0,1.0,202100042976C01,7,17,1.0
31033,202100045502,2021-04-02,2021,avril,vendredi,12:25,13,13015,434573580000,54130790000,...,1,2,0.0,2.0,1.0,1.0,202100045502B01,7,15,1.0
31228,202100048088,2021-03-15,2021,mars,lundi,08:50,13,13100,437922950000,48394720000,...,2,,0.0,0.0,4.0,19.0,202100048088B01,7,1,1.0


In [41]:
fig = px.scatter_mapbox(
    mars_df,
    lat='lat',
    lon='long',
    height=600,
    zoom=12,
    color=mars_df['grav'],
    mapbox_style="carto-positron",
)

fig.update_layout(
    margin={"r":10,"t":10,"l":10,"b":10}  # Marges droite, haut, gauche, bas à 0
)

fig.show()


TypeError: unsupported operand type(s) for +: 'float' and 'str'