### Importing libraries

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import os

## data gouv API call
import requests

# formating files (filenames, encodings, separators)
import yaml


In [7]:
import numpy as np
import pandas as pd
import os
import requests
import yaml

#### DL, SAVE & CONCAT DATASETS ####
def get_datasets_url(url):
    response = requests.get(url).json()
    return {el['title']: el['latest'] for el in response['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules")}


def download_and_save_datasets(url_dict, save_path):
    for path, url in url_dict.items():
        full_path = os.path.join(save_path, path)
        if not os.path.exists(full_path):
            response = requests.get(url)
            if response.status_code == 200:
                with open(full_path, 'wb') as f:
                    f.write(response.content)


def rename_files(config_path, save_path):
    with open(config_path, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        rename_config = config.get('rename')

    for old_name, new_name in rename_config.items():
        old_file_path = os.path.join(save_path, old_name)
        new_file_path = os.path.join(save_path, new_name)

        if os.path.exists(old_file_path):
            os.rename(old_file_path, new_file_path)
            print(f"Fichier renommé : {old_name} -> {new_name}")
        else:
            print(f"Fichier non trouvé : {old_name}")

def download_and_process_datasets(base_url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/',
                                  config_path='../config.yml',
                                  save_path='../raw_data/'):
    datasets = get_datasets_url(base_url)
    categories = ["lieux", "usagers", "car", "vehicule"]

    categorized_urls = {category: {i: j for i, j in datasets.items() if i.startswith(category)}
                        for category in categories}

    for category, url_dict in categorized_urls.items():
        download_and_save_datasets(url_dict, save_path)

    rename_files(config_path, save_path)

    return categorized_urls

def concat_files(starting_word):

    chemin_fichier_yml = '../config.yml'
    with open(chemin_fichier_yml, 'r') as f:
        config = yaml.safe_load(f)
        config_sep = config.get('sep')
        config_encoding = config.get('encoding')

    chemin_dossier = '../raw_data/'

    df_concat = pd.DataFrame()
    files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

    print(files)
    for file in files:
        chemin_fichier = os.path.join(chemin_dossier, file)

        if file in config_sep:
            sep = config_sep[file]
        else:
            sep = ','

        if file in config_encoding:
            encoding = config_encoding[file]
        else:
            encoding = 'utf-8'

        df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

        df_concat = pd.concat([df_concat, df1])

    return df_concat

#### CLEANING VEHICULES ####
def process_catv(df):
    df.loc[df.catv == -1, 'catv'] = df.catv.mode()[0]
    df = df[df['catv']==1]
    return df

def create_aug(df):
    df['aug'] = df['Num_Acc'].astype(str) + df['num_veh'].astype(str)
    return df

#### CLEANING CARACTERISTIQUES ####
def process_dates(df, year_col='an', month_col='mois', day_col='jour', time_col='hrmn'):
    # Checking for invalid dates
    invalid_dates = df[(df[day_col] > 31) | ((df[month_col] == 2) & (df[day_col] > 29)) | ((df[month_col].isin([4, 6, 9, 11])) & (df[day_col] > 30))]
    df = df.drop(invalid_dates.index)

    # Creating the 'date' column
    df['date'] = pd.to_datetime(df[year_col].astype(str).str.zfill(2)
                                + df[month_col].astype(str).str.zfill(2)
                                + df[day_col].astype(str).str.zfill(2)
                                + df[time_col].astype(str).str.zfill(4),
                                format='%y%m%d%H%M', errors='coerce')

    df = df.drop(columns=[year_col, month_col, day_col, time_col])

    return df

def drop_nans(df, columns):
    return df.dropna(subset=columns)

def impute_invalid_values(df, columns, invalid_values):
    for column in columns:
        for invalid_value in invalid_values:
            # Separate handling for NaN values
            if pd.isna(invalid_value) or (isinstance(invalid_value, pd._libs.tslibs.nattype.NaTType) and pd.api.types.is_datetime64_any_dtype(df[column])):
                # calculate distribution excluding NaN and other invalid values
                distribution = df[column][~df[column].isin(invalid_values)].value_counts(normalize=True)

                # assign new values
                new_values = np.random.choice(distribution.index, size=df[column].isna().sum(), p=distribution.values)
                df.loc[df[column].isna(), column] = new_values
            else:
                # calculate distribution for a given value, excluding other invalid values
                distribution = df[column][~df[column].isin(invalid_values)].value_counts(normalize=True)

                # assign new values
                new_values = np.random.choice(distribution.index, size=(df[column] == invalid_value).sum(), p=distribution.values)
                df.loc[df[column] == invalid_value, column] = new_values
    return df

def replace_with_most_frequent(df, columns, invalid_value=-1):
    for column in columns:
        most_frequent = df[column].mode()[0]
        df[column] = df[column].replace(invalid_value, most_frequent)
    return df

def filter_df_on_column(df, column, valid_values):
    return df.loc[df[column].isin(valid_values)]

#### CLEANING LIEUX ###
def clean_column(df, column, replace_dict, fill_value):
    df[column] = df[column].replace(replace_dict)
    df[column].fillna(fill_value, inplace=True)
    return df

def clean_nbv(df):
    df['nbv'] = pd.to_numeric(df['nbv'], errors='coerce')
    df = clean_column(df, 'nbv', {-1: 2}, 2)
    df['nbv'] = df['nbv'].where(df['nbv'] <= 10, 2)
    return df

def clean_catr(df):
    df['catr'].fillna(4, inplace=True)
    return df

#### CLEANING USAGERS ####
def process_grav_sexe(df, column):
    df[column] = df[column].replace({-1: 1}).fillna(1)
    return df

def process_secu(df):
    df['secu'].fillna(df['secu1'], inplace=True)
    df['secu'] = df['secu'].where(df['secu'] <= 0, 1)
    return df

def process_actp(df):
    df['actp'] = pd.to_numeric(df['actp'], errors='coerce').fillna(0).astype(int)
    return df

def process_locp_etatp(df, columns):
    for column in columns:
        df[column] = df[column].replace({-1: 0}).fillna(0).astype(int)
    return df

def process_an_nais(df):
    df.dropna(subset=['an_nais'], inplace=True)
    mean_an_nais = df['an_nais'].mean()
    df['an_nais'] = df['an_nais'].replace({0: mean_an_nais})
    return df

def drop_irrelevant_columns(df, columns):
    return df.drop(columns=columns)

def process_trajet(df):
    trajet_impute = df['trajet'].fillna(5)
    distribution = trajet_impute[~trajet_impute.isin([0, -1])].value_counts(normalize=True)
    missing_values = trajet_impute.isin([0, -1])
    trajet_impute[missing_values] = np.random.choice(distribution.index, size=missing_values.sum(), p=distribution.values)
    df['trajet'] = trajet_impute.astype(int).replace({-1: 1})
    return df

def create_dup_count(df):
    df['dup_count'] = df.groupby('aug')['aug'].transform('count')
    df = df.drop_duplicates(subset=['aug'], keep='first')
    return df

def concat_datasets():
    # concat datasets
    carac_df = concat_files("caracteristiques")
    lieux_df = concat_files("lieux")
    usager_df = concat_files("usagers")
    vehi_df = concat_files("vehicules")
    return vehi_df, carac_df, lieux_df, usager_df

def clean_datasets(vehi_df, carac_df, lieux_df, usager_df):
    # clean vehicules
    vehi_df_clean = (vehi_df
                    .pipe(drop_irrelevant_columns, ["id_vehicule", "motor", "occutc", "senc"])
                    .pipe(process_catv)
                    .pipe(create_aug)
                    .pipe(impute_invalid_values, ['obs', 'obsm', 'choc', 'manv'], [-1, np.NaN]))

    # clean caracteristiques
    carac_df_clean = (carac_df
                    .pipe(drop_irrelevant_columns, ['gps', 'Accident_Id'])
                    .pipe(process_dates)
                    .pipe(impute_invalid_values, ['date', 'atm'], [np.NaN])
                    .pipe(drop_nans, ['Num_Acc', 'col', 'com'])
                    .pipe(impute_invalid_values, ['lum', 'int', 'col'], [-1])
                    .pipe(replace_with_most_frequent, ['lum', 'agg', 'atm']))

    # clean lieux
    lieux_df_clean = (lieux_df
                    .pipe(clean_column, 'situ', {-1: 1, 0: 1}, 1)
                    .pipe(clean_column, 'circ', {0: 2, -1: 2}, 2)
                    .pipe(clean_nbv)
                    .pipe(clean_column, 'vosp', {-1: 0}, 0)
                    .pipe(clean_column, 'prof', {-1: 1, 0: 1}, 1)
                    .pipe(clean_column, 'plan', {-1: 1, 0: 1}, 1)
                    .pipe(clean_column, 'surf', {-1: 1, 0: 1, 9: 1}, 1)
                    .pipe(clean_column, 'infra', {-1: 1}, 1)
                    .pipe(clean_catr)
                    .pipe(drop_irrelevant_columns, ['voie', 'v1', 'v2', 'pr', 'pr1', 'lartpc', 'larrout', 'vma', 'env1']))

    # clean usagers
    usager_df_clean = (usager_df
                    .pipe(process_grav_sexe, column='grav')
                    .pipe(process_grav_sexe, column='sexe')
                    .pipe(process_trajet)
                    .pipe(process_secu)
                    .pipe(process_actp)
                    .pipe(process_locp_etatp, columns=['locp', 'etatp'])
                    .pipe(process_an_nais)
                    .pipe(create_aug)
                    .pipe(create_dup_count)
                    .pipe(drop_irrelevant_columns, columns=['place', 'catu', 'secu1', 'secu2', 'secu3', 'num_veh', 'id_vehicule', 'id_usager']))

    return vehi_df_clean, carac_df_clean, lieux_df_clean, usager_df_clean

def merge_cleaned_datasets(vehi_df_clean, carac_df_clean, lieux_df_clean, usager_df_clean):
    vehi_usa = pd.merge(vehi_df_clean, usager_df_clean, on='aug', how='inner')
    vehi_usa = vehi_usa.drop(columns=['num_veh', 'Num_Acc_y'])
    vehi_usa = vehi_usa.rename(columns={'Num_Acc_x': 'Num_Acc'})

    duplicates_velo = vehi_usa[vehi_usa['Num_Acc'].duplicated(keep=False)]

    velo_df = pd.merge(vehi_usa, lieux_df_clean, on='Num_Acc', how='left')
    all_datasets = pd.merge(velo_df, carac_df_clean, on='Num_Acc', how='left')

    return all_datasets, duplicates_velo


In [19]:
vehi_df

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh,id_vehicule,motor
0,200500000001,0.0,7,0.0,0.0,2.0,1.0,1.0,A01,,
1,200500000001,0.0,7,0.0,0.0,2.0,8.0,10.0,B02,,
2,200500000002,0.0,7,0.0,0.0,2.0,7.0,16.0,A01,,
3,200500000002,0.0,2,0.0,0.0,2.0,1.0,1.0,B02,,
4,200500000003,0.0,2,0.0,0.0,2.0,1.0,1.0,A01,,
...,...,...,...,...,...,...,...,...,...,...,...
105809,201200062248,0.0,30,0.0,0.0,2.0,7.0,15.0,A01,,
105810,201200062249,0.0,1,0.0,0.0,2.0,4.0,1.0,B01,,
105811,201200062249,0.0,1,0.0,0.0,2.0,4.0,1.0,A01,,
105812,201200062249,0.0,2,0.0,0.0,2.0,1.0,1.0,C01,,


In [8]:
download_and_process_datasets()
vehi_df, carac_df, lieux_df, usager_df = concat_datasets()
vehi_df_clean, carac_df_clean, lieux_df_clean, usager_df_clean = clean_datasets(vehi_df, carac_df, lieux_df, usager_df)

Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['lieux_2015.csv', 'lieux_2010.csv', 'lieux_2005.csv', 'lieux_2019.csv', 'lieux_2008.csv', 'lieux_2011.csv', 'lieux_2006.csv', 'lieux_2017.csv', 'lieux_2018.csv', 'lieux_2014.csv', 'lieux_2022.csv', 'lieux_2009.csv', 'lieux_2007.csv', 'lieux_2020.csv', 'lieux_2021.csv', 'lieux_2013.csv', 'lieux_2012.csv', 'lieux_2016.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['usagers_2006.csv', 'usagers_2021.csv', 'usagers_2020.csv', 'usagers_2014.csv', 'usagers_2010.csv', 'usagers_2007.csv', 'usagers_2005.csv', 'usagers_2018.csv', 'usagers_2019.csv', 'usagers_2016.csv', 'usagers_2017.csv', 'usagers_2022.csv', 'usagers_2008.csv', 'usagers_2009.csv', 'usagers_2013.csv', 'usagers_2015.csv', 'usagers_2011.csv', 'usagers_2012.csv']
['vehicules_2005.csv', 'vehicules_2018.csv', 'vehicules_2011.csv', 'vehicules_2013.csv', 'vehicules_2017.csv', 'vehicules_2006.csv', 'vehicules_2016.csv', 'vehicules_2020.csv', 'vehicules_2008.csv', 'vehicules_2014.csv', 'vehicules_2009.csv', 'vehicules_2022.csv', 'vehicules_2007.csv', 'vehicules_2021.csv', 'vehicules_2019.csv', 'vehicules_2015.csv', 'vehicules_2010.csv', 'vehicules_2012.csv']


In [12]:
all_datasets, duplicates_velo = merge_cleaned_datasets(vehi_df_clean, carac_df_clean, lieux_df_clean, usager_df_clean)

Unnamed: 0,Num_Acc,catv,obs,obsm,choc,manv,aug,grav,sexe,trajet,...,agg,int,atm,col,com,adr,lat,long,dep,date
0,200500000030,1,0.0,2.0,8.0,11.0,200500000030B02,4,1,5,...,2.0,1.0,1.0,3.0,331.0,rue de la chapelle,5030000.0,284000.0,620,2005-01-13 19:45:00
1,200500000034,1,0.0,2.0,1.0,1.0,200500000034B02,3,1,5,...,1.0,1.0,7.0,1.0,22.0,,0.0,0.0,620,2005-01-19 10:45:00
2,200500000078,1,0.0,2.0,1.0,1.0,200500000078B02,4,1,5,...,1.0,9.0,1.0,3.0,173.0,,0.0,0.0,20,2005-01-26 13:15:00
3,200500000093,1,0.0,2.0,3.0,21.0,200500000093B02,3,2,4,...,2.0,1.0,1.0,1.0,810.0,rue du grand montoir,4925500.0,309400.0,20,2005-01-03 13:30:00
4,200500000170,1,0.0,2.0,4.0,2.0,200500000170A01,4,1,5,...,1.0,1.0,1.0,2.0,196.0,,0.0,0.0,760,2005-01-29 18:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88516,201200062214,1,0.0,2.0,1.0,1.0,201200062214B01,4,2,5,...,2.0,1.0,1.0,3.0,16,"74, HIPPOLYTE PIOT(RUE)",,,974,2012-10-31 20:50:00
88517,201200062217,1,0.0,0.0,8.0,1.0,201200062217B01,4,1,9,...,2.0,1.0,1.0,3.0,16,"NEANT, CHATEAU D'EAU (AL",,,974,2012-11-05 15:15:00
88518,201200062230,1,0.0,2.0,3.0,1.0,201200062230A01,4,1,5,...,2.0,1.0,1.0,3.0,16,"SANS, ENTRE DEUX (CHEMIN",,,974,2012-11-18 07:15:00
88519,201200062249,1,0.0,2.0,4.0,1.0,201200062249B01,3,1,9,...,1.0,1.0,1.0,4.0,16,"SANS, RN1 (ROUTE NATIONA",,,974,2012-12-23 06:20:00


In [15]:
all_datasets.isna().sum()[all_datasets.isna().sum() > 0 ]

lum      5291
agg      5291
int      5291
atm      5291
col      5291
com      5291
adr     11542
lat     40603
long    40603
dep      5291
date     5291
dtype: int64

In [22]:
# catv_group_named = {
#     "Bicycles_ElectricScooters": [1, 80, 50, 60],  # Bicycles, E-bikes, and Personal Mobility Devices
#     "Cars": [3, 7, 8, 9],  # Cars including light vehicles, VL + caravane, VL + remorque
#     "Motorcycles_Scooters": [2, 30, 31, 32, 33, 34, 41, 42, 43, 4, 5, 6],  # Motorcycles, scooters, scooter immatriculé, motocyclette, side-car
#     "HeavyVehicles_Buses": [10, 13, 14, 15, 16, 17, 37, 38, 18, 19],  # Utility vehicles, heavy trucks, buses, transport en commun, tramway
#     "Others_SpecialVehicles": [0, 20, 21, 39, 40, 99, 11, 12, 35, 36]  # Special vehicles and others, VU + caravane, VU + remorque
# }


# obs_group_named = {
#     "NoObstacle": [-1, 0],  # Without obstacle
#     "WithObstacle": list(range(1, 18))  # With obstacle
# }

# obsm_group_named = {
#     "Pedestrian_Vehicle_Rail": [1, 2, 4],  # Pedestrian, Vehicle, Rail Vehicle
#     "Animals": [5, 6],  # Animals
#     "Others_NotClassified": [-1, 0, 9]  # Other or Not Classified
# }

# choc_group_named = {
#     "NoImpact": [0],  # No impact
#     "Impact": list(range(1, 10))  # Impact
# }

# manv_group_named = {
#     "BasicManeuvers": [1, 2, 3],  # Standard driving maneuvers
#     "DirectionalChanges": [11, 12, 13, 14, 15, 16, 17, 18],  # Maneuvers involving directional changes
#     "DefensiveManeuvers": [21, 22],  # Defensive driving maneuvers
#     "TrajectoryChanges": [10],  # Significant trajectory changes
#     "RiskyManeuvers": [4, 5, 6, 7, 8, 9, 19, 26],  # Unusual or risky maneuvers
#     "StationaryParkingManeuvers": [20, 23, 24, 25]  # Stationary or parking related maneuvers
# }


Unnamed: 0,Num_Acc,lum,agg,int,atm,col,com,adr,lat,long,dep,date
0,2.006000e+11,1,2,2,1.0,3.0,53,"SANS N°, PONT DES CHEVRE",,,10,2006-01-04 15:45:00
1,2.006000e+11,2,2,2,4.0,3.0,53,BROU ( BD DU N° 47 A 65,,,10,2006-01-06 08:05:00
2,2.006000e+11,1,2,1,1.0,6.0,53,"sans, CLAVAGRY ( RUE)",,,10,2006-01-09 13:40:00
3,2.006000e+11,2,2,1,8.0,3.0,53,23EME R.I. ( RUE DU),,,10,2006-01-10 16:25:00
4,2.006000e+11,1,1,1,1.0,2.0,53,MARBOZ (AVENUEDE - IMPAI,,,10,2006-01-24 11:20:00
...,...,...,...,...,...,...,...,...,...,...,...,...
87021,2.005001e+11,5,2,2,1.0,3.0,416.0,"sans, LEBLOND(RUE M. ET",,,974,2005-12-21 20:35:00
87022,2.005001e+11,1,2,1,1.0,3.0,416.0,"SANS, PRESIDENT MITTERAN",,,974,2005-12-23 10:10:00
87023,2.005001e+11,1,2,2,1.0,3.0,416.0,"SANS, LEBLOND(RUE M. ET",,,974,2005-12-26 17:15:00
87024,2.005001e+11,1,2,3,2.0,5.0,416.0,"SANS, HUBERT DE LISLE(BO",,,974,2005-12-27 15:00:00


In [9]:
# Création d'une colonne uniques pour merger les df usagers et véhicules
# usagers_df['aug'] = usagers_df['Num_Acc'].astype(str) + usagers_df['num_veh'].astype(str)
# vehicules_df['aug'] = vehicules_df['Num_Acc'].astype(str) + vehicules_df['num_veh'].astype(str)

# vehicules_df.shape
# vehicules_df = vehicules_df[vehicules_df['catv']==1]


# On drop les duplicates lorsqu'il y a une plusieurs usagers dans un même véhicules et pour le même accident
# On crée une colonne pour y ajouter le nombre d'usagers par véhicules


# usagers_df['dup_count'] = usagers_df.groupby('aug')['aug'].transform('count')
# usagers_df = usagers_df.drop_duplicates(subset=['aug'], keep='first')

# On merge véhicules et usagers puis on ne garde que les accidents impliquant un vélo
# On clean et on drop les doublons

vehi_usa = pd.merge(vehicules_df, usagers_df, on='aug', how='inner')
vehi_usa = vehi_usa.drop(columns=['num_veh_x', 'num_veh_y', 'Num_Acc_y'])
vehi_usa = vehi_usa.rename(columns={'Num_Acc_x': 'Num_Acc'})

duplicates_velo = vehi_usa[vehi_usa['Num_Acc'].duplicated(keep=False)]
duplicates_velo.info()

velo_df = pd.merge(vehi_usa, lieux_df, on='Num_Acc', how='left')
velo_df = pd.merge(velo_df, caracteristiques_df, on='Num_Acc', how='left')



# On remplace les valeurs manquantes par la valeur la plus fréquente
velo_df['lum'].fillna(velo_df['lum'].mode()[0], inplace=True)
velo_df['agg'].fillna(velo_df['agg'].mode()[0], inplace=True)
velo_df['atm'].fillna(velo_df['atm'].mode()[0], inplace=True)
velo_df['atm'] = velo_df['atm'].replace({ -1: 1, 9: 1})


# # Obtenez la distribution des valeurs
# distribution = velo_df['int'].value_counts(normalize=True)
# # Obtenez les valeurs manquantes
# missing = velo_df['int'].isnull()
# # Remplissez les valeurs manquantes en fonction de la distribution
# velo_df.loc[missing, 'int'] = np.random.choice(distribution.index, size=len(velo_df[missing]), p=distribution.values)

# # Obtenez la distribution des valeurs
# distribution = velo_df['col'].value_counts(normalize=True)
# # Obtenez les valeurs manquantes
# missing = velo_df['col'].isnull()
# # Remplissez les valeurs manquantes en fonction de la distribution
# velo_df.loc[missing, 'col'] = np.random.choice(distribution.index, size=len(velo_df[missing]), p=distribution.values)
# Je remplace le -1 par 1
velo_df['col'] = velo_df['col'].replace({ -1: 3})

velo_df = velo_df[~(velo_df.adr.isna() & velo_df.lat.isna() & velo_df.long.isna() & velo_df.com.isna() & velo_df.dep.isna())]

# Réinitialisez l'index de votre DataFrame
velo_df = velo_df.reset_index(drop=True)

# Obtenez la distribution des valeurs
distribution = velo_df['date'].value_counts(normalize=True)

# Obtenez les valeurs manquantes
missing = velo_df['date'].isnull()

# Remplissez les valeurs manquantes en fonction de la distribution
velo_df.loc[missing, 'date'] = np.random.choice(distribution.index, size=len(velo_df[missing]), p=distribution.values)

Unnamed: 0,Num_Acc,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,200600000001,4,1,5,1.0,0,0,0,1949.0
1,200600000001,4,2,5,1.0,0,0,0,1948.0
2,200600000001,1,2,5,1.0,0,0,0,1921.0
3,200600000002,4,2,1,1.0,0,0,0,1972.0
4,200600000002,4,2,2,1.0,0,0,0,1984.0
...,...,...,...,...,...,...,...,...,...
138623,201200062248,4,1,5,1.0,0,0,0,1994.0
138624,201200062249,3,1,9,1.0,0,0,0,1987.0
138625,201200062249,3,1,5,1.0,0,0,0,1957.0
138626,201200062249,1,1,9,1.0,0,0,0,1991.0


In [3]:
def get_datasets_url(url):
    response = requests.get(url).json()
    return {el['title']: el['latest'] for el in response['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules")}


def download_and_save_datasets(url_dict, save_path):
    for path, url in url_dict.items():
        full_path = os.path.join(save_path, path)
        if not os.path.exists(full_path):
            response = requests.get(url)
            if response.status_code == 200:
                with open(full_path, 'wb') as f:
                    f.write(response.content)


def rename_files(config_path, save_path):
    with open(config_path, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        rename_config = config.get('rename')

    for old_name, new_name in rename_config.items():
        old_file_path = os.path.join(save_path, old_name)
        new_file_path = os.path.join(save_path, new_name)

        if os.path.exists(old_file_path):
            os.rename(old_file_path, new_file_path)
            print(f"Fichier renommé : {old_name} -> {new_name}")
        else:
            print(f"Fichier non trouvé : {old_name}")

def download_and_process_datasets(base_url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/',
                                  config_path='../config.yml',
                                  save_path='../raw_data/'):
    datasets = get_datasets_url(base_url)
    categories = ["lieux", "usagers", "car", "vehicule"]

    categorized_urls = {category: {i: j for i, j in datasets.items() if i.startswith(category)}
                        for category in categories}

    for category, url_dict in categorized_urls.items():
        download_and_save_datasets(url_dict, save_path)

    rename_files(config_path, save_path)

    return categorized_urls


def concat_files(starting_word):

    chemin_fichier_yml = '../config.yml'
    with open(chemin_fichier_yml, 'r') as f:
        config = yaml.safe_load(f)
        config_sep = config.get('sep')
        config_encoding = config.get('encoding')

    chemin_dossier = '../raw_data/'

    df_concat = pd.DataFrame()
    files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

    print(files)
    for file in files:
        chemin_fichier = os.path.join(chemin_dossier, file)

        if file in config_sep:
            sep = config_sep[file]
        else:
            sep = ','

        if file in config_encoding:
            encoding = config_encoding[file]
        else:
            encoding = 'utf-8'

        df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

        df_concat = pd.concat([df_concat, df1])

    return df_concat

download_and_process_datasets()

carac_df = concat_files("caracteristiques")
lieux_df = concat_files("lieux")
usager_df = concat_files("usagers")
vehi_df = concat_files("vehicules")

Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['lieux_2015.csv', 'lieux_2010.csv', 'lieux_2005.csv', 'lieux_2019.csv', 'lieux_2008.csv', 'lieux_2011.csv', 'lieux_2006.csv', 'lieux_2017.csv', 'lieux_2018.csv', 'lieux_2014.csv', 'lieux_2022.csv', 'lieux_2009.csv', 'lieux_2007.csv', 'lieux_2020.csv', 'lieux_2021.csv', 'lieux_2013.csv', 'lieux_2012.csv', 'lieux_2016.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['usagers_2006.csv', 'usagers_2021.csv', 'usagers_2020.csv', 'usagers_2014.csv', 'usagers_2010.csv', 'usagers_2007.csv', 'usagers_2005.csv', 'usagers_2018.csv', 'usagers_2019.csv', 'usagers_2016.csv', 'usagers_2017.csv', 'usagers_2022.csv', 'usagers_2008.csv', 'usagers_2009.csv', 'usagers_2013.csv', 'usagers_2015.csv', 'usagers_2011.csv', 'usagers_2012.csv']
['vehicules_2005.csv', 'vehicules_2018.csv', 'vehicules_2011.csv', 'vehicules_2013.csv', 'vehicules_2017.csv', 'vehicules_2006.csv', 'vehicules_2016.csv', 'vehicules_2020.csv', 'vehicules_2008.csv', 'vehicules_2014.csv', 'vehicules_2009.csv', 'vehicules_2022.csv', 'vehicules_2007.csv', 'vehicules_2021.csv', 'vehicules_2019.csv', 'vehicules_2015.csv', 'vehicules_2010.csv', 'vehicules_2012.csv']


In [58]:
usager_df['grav'] = usager_df['grav'].replace({-1: 1})
usager_df['grav'].fillna(1, inplace=True)

usager_df['sexe'] = usager_df['sexe'].replace({-1: 1})
usager_df['sexe'].fillna(1, inplace=True)
# on remplace les valeurs nulles par la plus fréquentes

# Supposons que votre DataFrame s'appelle usager_df et la colonne en question 'trajet'
colonne_a_imputer = usager_df['trajet'].fillna(5)

# Obtenir la distribution des valeurs existantes sans tenir compte de 0 et -1
distribution = colonne_a_imputer[~colonne_a_imputer.isin([0, -1])].value_counts(normalize=True)

# Remplacer les valeurs manquantes par échantillonnage de la distribution
valeurs_manquantes = (colonne_a_imputer.isin([0, -1]))
colonne_a_imputer[valeurs_manquantes] = np.random.choice(distribution.index, size=valeurs_manquantes.sum(), p=distribution.values)

# Assurez-vous que la colonne est de type entier (si nécessaire)
usager_df['trajet'] = colonne_a_imputer.astype(int)

# Remplacez les valeurs -1 par 1 (comme indiqué dans votre code d'origine)
usager_df['trajet'] = usager_df['trajet'].replace({-1: 1})

# usager_df['secu'].fillna(usager_df['secu1'])
# #on rempalce les valeurs nulles par les valeurs nulles de secu (avant 2019) par les valeurs de secu1(après 2019)
# usager_df['secu'] = usager_df['secu'].where(usager_df['secu'] <= 0, 1)
# # on normalise les valeurs secu

# # Remplacer les valeurs -1 par 0 dans 'locp'
# usager_df['locp'] = usager_df['locp'].replace({-1: 0})

# # Remplacer les valeurs manquantes dans 'locp' par 0
# usager_df['locp'].fillna(0, inplace=True)


# usager_df['actp'].value_counts()
# usager_df['actp'].fillna(0, inplace=True)
# usager_df['actp'] = pd.to_numeric(usager_df['actp'], errors='coerce').fillna(0).astype(int)
# usager_df['actp'].astype(float).astype(int)
# usager_df['actp'] = usager_df['actp'].replace({-1:0})
# # on normalise les valeurs actp

# usager_df['etatp'].value_counts()
# usager_df['etatp'] = usager_df['etatp'].replace({-1:0})
# usager_df['etatp'].fillna(0, inplace=True)
# # onconcat les valeurs manquantes

# # Supposons que votre DataFrame s'appelle usager_df
# # Drop les lignes avec des valeurs manquantes dans 'an_nais'
# usager_df.dropna(subset=['an_nais'], inplace=True)

# # Remplacer les valeurs 0 dans 'an_nais' par la moyenne
# moyenne_an_nais = usager_df['an_nais'].mean()
# usager_df['an_nais'] = usager_df['an_nais'].replace({0: moyenne_an_nais})


# usager_df_clean = usager_df.drop(columns=['place','catu', 'secu1','secu2','secu3', 'num_veh','id_vehicule','id_usager'])
# # on drop les colonnes non pertinentes

In [59]:
usager_df.trajet.value_counts()

5    1362349
1     480089
4     355726
9     263044
3      98860
2      76309
Name: trajet, dtype: int64

In [56]:
process_trajet(usager_df).trajet.value_counts()

5    1361651
1     480237
4     355937
9     263487
3      98873
2      76192
Name: trajet, dtype: int64

In [5]:
# Assuming usager_df is your initial DataFrame
processed_usager_df = (usager_df
                .pipe(process_grav_sexe, column='grav')
                .pipe(process_grav_sexe, column='sexe')
                .pipe(process_trajet)
                .pipe(process_secu)
                .pipe(process_actp)
                .pipe(process_locp_etatp, columns=['locp', 'etatp'])
                .pipe(process_an_nais)
                .pipe(drop_irrelevant_columns, columns=['place', 'catu', 'secu1', 'secu2', 'secu3', 'num_veh', 'id_vehicule', 'id_usager']))


 0    2241993
-1     185666
 3     159399
 9      14751
 1      12228
 5      11933
 2       6014
 4       3767
 6        514
 7         60
 8         52
Name: actp, dtype: int64

In [32]:
usager_df[['locp', 'actp', 'etatp']]["actp"].value_counts()

0      1913376
0       270499
 -1     185666
3       131491
3        27908
9        12219
5        10401
1         9751
2         4851
4         3236
9         2532
1         2477
5         1532
B         1234
2         1163
4          531
A          422
6          408
6          106
7           60
8           52
Name: actp, dtype: int64

In [20]:
handle_locp_actp_etatp(usager_df, ['locp', 'actp', 'etatp'])

ValueError: invalid literal for int() with base 10: 'B'

In [18]:
usager_df.trajet.isna().sum()

0

In [6]:
usager_df_clean

Unnamed: 0,Num_Acc,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,200600000001,4,1,5,1.0,0.0,0,0.0,1949.0
1,200600000001,4,2,5,1.0,0.0,0,0.0,1948.0
2,200600000001,1,2,5,1.0,0.0,0,0.0,1921.0
3,200600000002,4,2,1,1.0,0.0,0,0.0,1972.0
4,200600000002,4,2,2,1.0,0.0,0,0.0,1984.0
...,...,...,...,...,...,...,...,...,...
138623,201200062248,4,1,5,1.0,0.0,0,0.0,1994.0
138624,201200062249,3,1,9,1.0,0.0,0,0.0,1987.0
138625,201200062249,3,1,5,1.0,0.0,0,0.0,1957.0
138626,201200062249,1,1,9,1.0,0.0,0,0.0,1991.0


In [49]:
processed_usager_df

Unnamed: 0,Num_Acc,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,200600000001,4,1,5,1.0,0,0,0,1949.0
1,200600000001,4,2,1,1.0,0,0,0,1948.0
2,200600000001,1,2,5,1.0,0,0,0,1921.0
3,200600000002,4,2,1,1.0,0,0,0,1972.0
4,200600000002,4,2,2,1.0,0,0,0,1984.0
...,...,...,...,...,...,...,...,...,...
138623,201200062248,4,1,5,1.0,0,0,0,1994.0
138624,201200062249,3,1,9,1.0,0,0,0,1987.0
138625,201200062249,3,1,5,1.0,0,0,0,1957.0
138626,201200062249,1,1,9,1.0,0,0,0,1991.0


In [11]:
def clean_column(df, column, replace_dict, fill_value, to_numeric=False):
    if to_numeric:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    df[column] = df[column].replace(replace_dict)
    df[column].fillna(fill_value, inplace=True)
    return df

def impute_from_distribution(df, column, invalid_values):
    distribution = df[column][~df[column].isin(invalid_values)].value_counts(normalize=True)
    missing_values = df[column].isin(invalid_values)
    df.loc[missing_values, column] = np.random.choice(distribution.index, size=missing_values.sum(), p=distribution.values)
    return df


usager_df = clean_column(usager_df, 'grav', {-1: 1}, 1)
usager_df = clean_column(usager_df, 'sexe', {-1: 1}, 1)
usager_df = impute_from_distribution(usager_df, 'trajet', [0, -1])
usager_df['trajet'] = usager_df['trajet'].astype(int)

# Handling 'secu' column
usager_df['secu'].fillna(usager_df['secu1'], inplace=True)
usager_df = clean_column(usager_df, 'secu', {value: 1 for value in usager_df['secu'] if value > 0}, 1)

usager_df = clean_column(usager_df, 'locp', {-1: 0}, 0)
usager_df = clean_column(usager_df, 'actp', {-1: 0}, 0, to_numeric=True)
usager_df = clean_column(usager_df, 'etatp', {-1: 0}, 0)

# Handling 'an_nais' column
usager_df.dropna(subset=['an_nais'], inplace=True)
average_an_nais = usager_df['an_nais'].mean()
usager_df['an_nais'] = usager_df['an_nais'].replace({0: average_an_nais})

# Dropping unnecessary columns
usager_df_clean = usager_df.drop(columns=['place', 'catu', 'secu1', 'secu2', 'secu3', 'num_veh', 'id_vehicule', 'id_usager'])


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [8]:
usager_df_clean

Unnamed: 0,Num_Acc,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,200600000001,4,1,5,1.0,0.0,0,0.0,1949.0
1,200600000001,4,2,5,1.0,0.0,0,0.0,1948.0
2,200600000001,1,2,5,1.0,0.0,0,0.0,1921.0
3,200600000002,4,2,1,1.0,0.0,0,0.0,1972.0
4,200600000002,4,2,2,1.0,0.0,0,0.0,1984.0
...,...,...,...,...,...,...,...,...,...
138623,201200062248,4,1,5,1.0,0.0,0,0.0,1994.0
138624,201200062249,3,1,9,1.0,0.0,0,0.0,1987.0
138625,201200062249,3,1,5,1.0,0.0,0,0.0,1957.0
138626,201200062249,1,1,9,1.0,0.0,0,0.0,1991.0


In [83]:
# import os
# import requests
# import yaml

# def download_and_process_datasets(base_url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/',
#                                   config_path='../config.yml',
#                                   save_path='../raw_data/'):
#     # Function to get datasets URL
#     def get_datasets_url(url=base_url):
#         r = requests.get(url).json()
#         return {el['title']: el['latest'] for el in r['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules") }

#     # Categorizing datasets
#     datasets = get_datasets_url()
#     categories = {
#         "lieux": "lieux",
#         "usagers": "usagers",
#         "car": "car",
#         "vehicule": "vehicule"
#     }

#     categorized_urls = {category: {i: j for i, j in datasets.items() if i.startswith(prefix)}
#                             for category, prefix in categories.items()}

#     # Downloading and saving datasets
#     for category, url_dict in categorized_urls.items():
#         for path, url in url_dict.items():
#             full_path = os.path.join(save_path, path)
#             if not os.path.exists(full_path):
#                 response = requests.get(url)
#                 if response.status_code == 200:
#                     with open(full_path, 'wb') as f:
#                         f.write(response.content)

#     # Loading configuration for renaming
#     with open(config_path, 'r') as f:
#         config = yaml.load(f, Loader=yaml.FullLoader)
#         rename_config = config.get('rename')

#     # Renaming files
#     for old_name, new_name in rename_config.items():
#         old_file_path = os.path.join(save_path, old_name)
#         new_file_path = os.path.join(save_path, new_name)

#         if os.path.exists(old_file_path):
#             os.rename(old_file_path, new_file_path)
#             print(f"Fichier renommé : {old_name} -> {new_name}")
#         else:
#             print(f"Fichier non trouvé : {old_name}")

#     return categorized_urls

# def concat_files(starting_word):

#     chemin_fichier_yml = '../config.yml'
#     with open(chemin_fichier_yml, 'r') as f:
#         config = yaml.safe_load(f)
#         config_sep = config.get('sep')
#         config_encoding = config.get('encoding')

#     chemin_dossier = '../raw_data/'

#     df_concat = pd.DataFrame()
#     files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

#     print(files)
#     for file in files:
#         chemin_fichier = os.path.join(chemin_dossier, file)

#         if file in config_sep:
#             sep = config_sep[file]
#         else:
#             sep = ','

#         if file in config_encoding:
#             encoding = config_encoding[file]
#         else:
#             encoding = 'utf-8'

#         df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

#         df_concat = pd.concat([df_concat, df1])

#     return df_concat


# download_and_process_datasets()

# carac_df = concat_files("caracteristiques")
# lieux_df = concat_files("lieux")
# usager_df = concat_files("usagers")
# vehi_df = concat_files("vehicules")


Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

{'lieux': {'lieux-2022.csv': 'https://www.data.gouv.fr/fr/datasets/r/a6ef711a-1f03-44cb-921a-0ce8ec975995',
  'lieux-2021.csv': 'https://www.data.gouv.fr/fr/datasets/r/8a4935aa-38cd-43af-bf10-0209d6d17434',
  'lieux-2020.csv': 'https://www.data.gouv.fr/fr/datasets/r/e85c41f7-d4ea-4faf-877f-ab69a620ce21',
  'lieux-2019.csv': 'https://www.data.gouv.fr/fr/datasets/r/2ad65965-36a1-4452-9c08-61a6c874e3e6',
  'lieux-2018.csv': 'https://www.data.gouv.fr/fr/datasets/r/d9d65ca1-16a3-4ea3-b7c8-2412c92b69d9',
  'lieux-2017.csv': 'https://www.data.gouv.fr/fr/datasets/r/9b76a7b6-3eef-4864-b2da-1834417e305c',
  'lieux_2016.csv': 'https://www.data.gouv.fr/fr/datasets/r/08b77510-39c4-4761-bf02-19457264790f',
  'lieux_2015.csv': 'https://www.data.gouv.fr/fr/datasets/r/31db21ef-4328-4c5e-bf3d-66a8fe82e6a2',
  'lieux_2014.csv': 'https://www.data.gouv.fr/fr/datasets/r/617af155-1b7c-41d6-9504-576878c4d9af',
  'lieux_2013.csv': 'https://www.data.gouv.fr/fr/datasets/r/1e00e4dd-e204-4a08-9e10-9b8a02791ba9',
 

In [90]:
catv_group_named = {
    "Bicycles_ElectricScooters": [1, 80, 50, 60],  # Bicycles, E-bikes, and Personal Mobility Devices
    "Cars": [3, 7, 8, 9],  # Cars including light vehicles, VL + caravane, VL + remorque
    "Motorcycles_Scooters": [2, 30, 31, 32, 33, 34, 41, 42, 43, 4, 5, 6],  # Motorcycles, scooters, scooter immatriculé, motocyclette, side-car
    "HeavyVehicles_Buses": [10, 13, 14, 15, 16, 17, 37, 38, 18, 19],  # Utility vehicles, heavy trucks, buses, transport en commun, tramway
    "Others_SpecialVehicles": [0, 20, 21, 39, 40, 99, 11, 12, 35, 36]  # Special vehicles and others, VU + caravane, VU + remorque
}


obs_group_named = {
    "NoObstacle": [-1, 0],  # Without obstacle
    "WithObstacle": list(range(1, 18))  # With obstacle
}

obsm_group_named = {
    "Pedestrian_Vehicle_Rail": [1, 2, 4],  # Pedestrian, Vehicle, Rail Vehicle
    "Animals": [5, 6],  # Animals
    "Others_NotClassified": [-1, 0, 9]  # Other or Not Classified
}

choc_group_named = {
    "NoImpact": [0],  # No impact
    "Impact": list(range(1, 10))  # Impact
}

manv_group_named = {
    "BasicManeuvers": [1, 2, 3],  # Standard driving maneuvers
    "DirectionalChanges": [11, 12, 13, 14, 15, 16, 17, 18],  # Maneuvers involving directional changes
    "DefensiveManeuvers": [21, 22],  # Defensive driving maneuvers
    "TrajectoryChanges": [10],  # Significant trajectory changes
    "RiskyManeuvers": [4, 5, 6, 7, 8, 9, 19, 26],  # Unusual or risky maneuvers
    "StationaryParkingManeuvers": [20, 23, 24, 25]  # Stationary or parking related maneuvers
}

def clean_and_transform_data(df, catv_group_inverted, choc_group_inverted, obs_group_inverted, obsm_group_inverted, manv_group_inverted):
    # Drop unnecessary columns
    df_modif = df.drop(["id_vehicule", "motor", "occutc", "senc"], axis=1)

    # CATV: Assign mode to -1 values
    df_modif.loc[df_modif.catv == -1, 'catv'] = df_modif.catv.mode()[0]

    # Define a function for cleaning and imputing values based on distribution
    def clean_and_impute(column):
        # Calculate distribution of values greater than or equal to 0
        values_distribution = df_modif[column][df_modif[column] >= 0].value_counts(normalize=True)

        # Impute NaNs and -1 values
        for condition in [df_modif[column].isna(), df_modif[column] == -1]:
            new_values = np.random.choice(values_distribution.index, size=condition.sum(), p=values_distribution.values)
            df_modif.loc[condition, column] = new_values

    # Apply the cleaning and imputing function to specified columns
    for col in ['obs', 'obsm', 'choc', 'manv']:
        clean_and_impute(col)

    # Mapping to group values
    df_modif['catv'] = df_modif['catv'].map(catv_group_inverted)
    df_modif['choc'] = df_modif['choc'].map(choc_group_inverted)
    df_modif['obs'] = df_modif['obs'].map(obs_group_inverted)
    df_modif['obsm'] = df_modif['obsm'].map(obsm_group_inverted)
    df_modif['manv'] = df_modif['manv'].map(manv_group_inverted)

    return df_modif

# Example usage
cleaned_df = clean_and_transform_data(vehi_df, catv_group_inverted, choc_group_inverted, obs_group_inverted, obsm_group_inverted, manv_group_inverted)


### Requesting API

### Get datasets DL urls

In [65]:
def get_datasets_url(url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/'):
    r = requests.get(url).json()
    return {el['title']: el['latest'] for el in r['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules") }

lieux_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("lieux")}
usagers_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("usagers")}
car_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("car")}
vehicule_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("vehicule")}

all_urls = [lieux_datasets, usagers_datasets, car_datasets,vehicule_datasets]


### Downloading csv's if not already

In [66]:

for url_dict in all_urls:
    for path, url in url_dict.items():
        path = '../raw_data/' + path
        if not os.path.exists(path):
            response = requests.get(url)
            if response.status_code == 200:
                with open(path, 'wb') as f:
                    f.write(response.content)

# folder = "../raw_data/"
# os.listdir(folder)


['lieux_2015.csv',
 'usagers_2006.csv',
 'caracteristiques_2006.csv',
 'caracteristiques_2016.csv',
 'caracteristiques_2020.csv',
 'caracteristiques_2008.csv',
 'usagers-2017.csv',
 'vehicules-2021.csv',
 'usagers_2021.csv',
 'caracteristiques-2017.csv',
 'usagers-2020.csv',
 'lieux_2010.csv',
 'usagers_2020.csv',
 'vehicules_2005.csv',
 'lieux_2005.csv',
 'caracteristiques_2022.csv',
 'usagers-2018.csv',
 'lieux-2018.csv',
 'lieux_2019.csv',
 'vehicules-2017.csv',
 'lieux_2008.csv',
 'caracteristiques_2012.csv',
 'vehicules_2018.csv',
 'usagers_2014.csv',
 'caracteristiques_2007.csv',
 'usagers_2010.csv',
 'vehicules_2011.csv',
 'vehicules-2022.csv',
 'usagers_2007.csv',
 'vehicules_2013.csv',
 'lieux_2011.csv',
 'lieux_2006.csv',
 'lieux_2017.csv',
 'usagers_2005.csv',
 'lieux_2018.csv',
 'lieux_2014.csv',
 'caracteristiques-2020.csv',
 'usagers_2018.csv',
 'lieux_2022.csv',
 'usagers-2019.csv',
 'usagers_2019.csv',
 'carcteristiques-2022.csv',
 'caracteristiques_2011.csv',
 'caracte

### Detect separator & read csv

In [67]:
# def detect_separator(file_path):
#     with open(file_path, 'r') as file:
#         first_line = file.readline()
#         if ';' in first_line:
#             return ';'
#         elif '|' in first_line:
#             return '|'
#         else:
#             return ','

# folder = "../raw_data/"
# dff = []
# files = [file for file in os.listdir(folder)]

# for file in files:
#     file_path = os.path.join(folder, file)
#     sep = detect_separator(file_path)
#     df = pd.read_csv(file_path, sep=sep)
#     dff.append(df)

# df_final = pd.concat(dff)


### changing file names w/ YAML

In [68]:
chemin_fichier_yml = '../config.yml'
chemin_dossier = '../raw_data/'


with open (chemin_fichier_yml, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    rename_config = config.get('rename')

# change file names
for old_name, new_name in rename_config.items():
    chemin_ancien_fichier = os.path.join(chemin_dossier, old_name)
    chemin_nouveau_fichier = os.path.join(chemin_dossier, new_name)

    if os.path.exists(chemin_ancien_fichier):
        os.rename(chemin_ancien_fichier, chemin_nouveau_fichier)
        print(f"Fichier renommé : {old_name} -> {new_name}")
    else:
        print(f"Fichier non trouvé : {old_name}")


Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

### changing files encoding w/ YAML + concatenating 

In [69]:
def concat_files(starting_word):

    chemin_fichier_yml = '../config.yml'
    with open(chemin_fichier_yml, 'r') as f:
        config = yaml.safe_load(f)
        config_sep = config.get('sep')
        config_encoding = config.get('encoding')

    chemin_dossier = '../raw_data/'

    df_concat = pd.DataFrame()
    files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

    print(files)
    for file in files:
        chemin_fichier = os.path.join(chemin_dossier, file)

        if file in config_sep:
            sep = config_sep[file]
        else:
            sep = ','

        if file in config_encoding:
            encoding = config_encoding[file]
        else:
            encoding = 'utf-8'

        df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

        df_concat = pd.concat([df_concat, df1])

    return df_concat

carac_df = concat_files("caracteristiques")
lieux_df = concat_files("lieux")
usager_df = concat_files("usagers")
vehi_df = concat_files("vehicules")


['caracteristiques_2006.csv', 'caracteristiques_2016.csv', 'caracteristiques_2020.csv', 'caracteristiques_2008.csv', 'caracteristiques_2022.csv', 'caracteristiques_2012.csv', 'caracteristiques_2007.csv', 'caracteristiques_2011.csv', 'caracteristiques_2021.csv', 'caracteristiques_2013.csv', 'caracteristiques_2014.csv', 'caracteristiques_2009.csv', 'caracteristiques_2017.csv', 'caracteristiques_2019.csv', 'caracteristiques_2018.csv', 'caracteristiques_2010.csv', 'caracteristiques_2015.csv', 'caracteristiques_2005.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['lieux_2015.csv', 'lieux_2010.csv', 'lieux_2005.csv', 'lieux_2019.csv', 'lieux_2008.csv', 'lieux_2011.csv', 'lieux_2006.csv', 'lieux_2017.csv', 'lieux_2018.csv', 'lieux_2014.csv', 'lieux_2022.csv', 'lieux_2009.csv', 'lieux_2007.csv', 'lieux_2020.csv', 'lieux_2021.csv', 'lieux_2013.csv', 'lieux_2012.csv', 'lieux_2016.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['usagers_2006.csv', 'usagers_2021.csv', 'usagers_2020.csv', 'usagers_2014.csv', 'usagers_2010.csv', 'usagers_2007.csv', 'usagers_2005.csv', 'usagers_2018.csv', 'usagers_2019.csv', 'usagers_2016.csv', 'usagers_2017.csv', 'usagers_2022.csv', 'usagers_2008.csv', 'usagers_2009.csv', 'usagers_2013.csv', 'usagers_2015.csv', 'usagers_2011.csv', 'usagers_2012.csv']
['vehicules_2005.csv', 'vehicules_2018.csv', 'vehicules_2011.csv', 'vehicules_2013.csv', 'vehicules_2017.csv', 'vehicules_2006.csv', 'vehicules_2016.csv', 'vehicules_2020.csv', 'vehicules_2008.csv', 'vehicules_2014.csv', 'vehicules_2009.csv', 'vehicules_2022.csv', 'vehicules_2007.csv', 'vehicules_2021.csv', 'vehicules_2019.csv', 'vehicules_2015.csv', 'vehicules_2010.csv', 'vehicules_2012.csv']


### Cleaning vehicule dataset

In [70]:
# drop useless columns
vehi_df_modif = vehi_df.drop(["id_vehicule", "motor", "occutc", "senc"], axis=1)

### REMOVING ACCIDENTS DUPLICATES ###
# get bike id's
id_bikes = vehi_df_modif[vehi_df_modif.catv == 1].Num_Acc.values

# create df of accidents involving bikes (w/ duplicated Num_Acc)
bikes_df = vehi_df_modif[vehi_df_modif.Num_Acc.isin(id_bikes)]

# get accidents by number of parties involved
grouped = bikes_df.groupby('Num_Acc').count()

# get ids
accident_alone_idx = grouped[grouped.catv == 1].index #12k
accident_2p_idx = grouped[grouped.catv == 2].index #71k
accident_3p_idx = grouped[grouped.catv > 3].index #263

# filter dataset
accident_2p = bikes_df[bikes_df.Num_Acc.isin(accident_2p_idx)]


In [71]:
catv_group_named = {
    "Bicycles_ElectricScooters": [1, 80, 50, 60],  # Bicycles, E-bikes, and Personal Mobility Devices
    "Cars": [3, 7, 8, 9],  # Cars including light vehicles, VL + caravane, VL + remorque
    "Motorcycles_Scooters": [2, 30, 31, 32, 33, 34, 41, 42, 43, 4, 5, 6],  # Motorcycles, scooters, scooter immatriculé, motocyclette, side-car
    "HeavyVehicles_Buses": [10, 13, 14, 15, 16, 17, 37, 38, 18, 19],  # Utility vehicles, heavy trucks, buses, transport en commun, tramway
    "Others_SpecialVehicles": [0, 20, 21, 39, 40, 99, 11, 12, 35, 36]  # Special vehicles and others, VU + caravane, VU + remorque
}


obs_group_named = {
    "NoObstacle": [-1, 0],  # Without obstacle
    "WithObstacle": list(range(1, 18))  # With obstacle
}

obsm_group_named = {
    "Pedestrian_Vehicle_Rail": [1, 2, 4],  # Pedestrian, Vehicle, Rail Vehicle
    "Animals": [5, 6],  # Animals
    "Others_NotClassified": [-1, 0, 9]  # Other or Not Classified
}

choc_group_named = {
    "NoImpact": [0],  # No impact
    "Impact": list(range(1, 10))  # Impact
}

manv_group_named = {
    "BasicManeuvers": [1, 2, 3],  # Standard driving maneuvers
    "DirectionalChanges": [11, 12, 13, 14, 15, 16, 17, 18],  # Maneuvers involving directional changes
    "DefensiveManeuvers": [21, 22],  # Defensive driving maneuvers
    "TrajectoryChanges": [10],  # Significant trajectory changes
    "RiskyManeuvers": [4, 5, 6, 7, 8, 9, 19, 26],  # Unusual or risky maneuvers
    "StationaryParkingManeuvers": [20, 23, 24, 25]  # Stationary or parking related maneuvers
}

# Inverting the dictionaries for mapping
catv_group_inverted = {v: k for k, values in catv_group_named.items() for v in values}
obs_group_inverted = {v: k for k, values in obs_group_named.items() for v in values}
obsm_group_inverted = {v: k for k, values in obsm_group_named.items() for v in values}
choc_group_inverted = {v: k for k, values in choc_group_named.items() for v in values}
manv_group_inverted = {v: k for k, values in manv_group_named.items() for v in values}


In [72]:
# drop useless columns
vehi_df_modif = vehi_df.drop(["id_vehicule", "motor", "occutc", "senc"], axis=1)


### CATV ###
# assign mode to -1 values in catv
vehi_df_modif.catv[vehi_df_modif.catv == -1] = vehi_df_modif.catv.mode()[0]

### CLEANING OBS ###
# calculate 'obs' values (>0) distribution
values_distribution_obs = vehi_df_modif.obs[vehi_df_modif.obs >= 0].value_counts(normalize=True)

# impute NaN's w/ 'obs' based on distribution
new_values_obs = np.random.choice(values_distribution_obs.index, size=vehi_df_modif['obs'].isna().sum(), p=values_distribution_obs.values)
vehi_df_modif.loc[vehi_df_modif['obs'].isna(), 'obs'] = new_values_obs

# Repeat for -1 values (not reported)
new_values_obs2 = np.random.choice(values_distribution_obs.index, size=(vehi_df_modif['obs'] == -1).sum(), p=values_distribution_obs.values)
vehi_df_modif.loc[vehi_df_modif['obs'] == -1, 'obs'] = new_values_obs2

# mapping to group 'obs' values


### CLEANING OBSM ###
# calculate 'obsm' values (>0) distribution
values_distribution_obsm = vehi_df_modif.obsm[vehi_df_modif.obsm >= 0].value_counts(normalize=True)

# impute NaN's w/ 'obsm' based on distribution
new_values_obsm = np.random.choice(values_distribution_obsm.index, size=vehi_df_modif['obsm'].isna().sum(), p=values_distribution_obsm.values)
vehi_df_modif.loc[vehi_df_modif['obsm'].isna(), 'obsm'] = new_values_obsm

# Repeat for -1 values (not reported)
new_values_obsm2 = np.random.choice(values_distribution_obsm.index, size=(vehi_df_modif['obsm'] == -1).sum(), p=values_distribution_obsm.values)
vehi_df_modif.loc[vehi_df_modif['obsm'] == -1, 'obsm'] = new_values_obsm2

# mapping to group 'obsm' values

### CLEANING CHOC ###
# Calculate 'choc' values (>0) distribution
values_distribution_choc = vehi_df_modif.choc[vehi_df_modif.choc > 0].value_counts(normalize=True)

# Impute NaN's in 'choc' based on distribution
new_values_choc = np.random.choice(values_distribution_choc.index, size=vehi_df_modif['choc'].isna().sum(), p=values_distribution_choc.values)
vehi_df_modif.loc[vehi_df_modif['choc'].isna(), 'choc'] = new_values_choc

# Repeat for -1 values (not reported)
new_values_choc2 = np.random.choice(values_distribution_choc.index, size=(vehi_df_modif['choc'] == -1).sum(), p=values_distribution_choc.values)
vehi_df_modif.loc[vehi_df_modif['choc'] == -1, 'choc'] = new_values_choc2

# Repeat for 0 values (no impact)
# new_values_choc3 = np.random.choice(values_distribution_choc.index, size=(vehi_df_modif['choc'] == 0).sum(), p=values_distribution_choc.values)
# vehi_df_modif.loc[vehi_df_modif['choc'] == 0, 'choc'] = new_values_choc3

# mapping to group 'choc' values


### CLEANING MANV ###
# Calculate 'manv' values (>0) distribution
values_distribution_manv = vehi_df_modif.manv[vehi_df_modif.manv > 0].value_counts(normalize=True)

# Impute NaN's in 'manv' based on distribution
new_values_manv = np.random.choice(values_distribution_manv.index, size=vehi_df_modif['manv'].isna().sum(), p=values_distribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'].isna(), 'manv'] = new_values_manv

# Repeat for -1 values (not reported)
new_values_manv2 = np.random.choice(values_distribution_manv.index, size=(vehi_df_modif['manv'] == -1).sum(), p=values_distribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'] == -1, 'manv'] = new_values_manv2

# Repeat for 0 values (no maneuver)
new_values_manv3 = np.random.choice(values_distribution_manv.index, size=(vehi_df_modif['manv'] == 0).sum(), p=values_distribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'] == 0, 'manv'] = new_values_manv3

# mapping to group 'choc' values
vehi_df_modif['catv'] = vehi_df_modif['catv'].map(catv_group_inverted)
vehi_df_modif['choc'] = vehi_df_modif['choc'].map(choc_group_inverted)
vehi_df_modif['obs'] = vehi_df_modif['obs'].map(obs_group_inverted)
vehi_df_modif['obsm'] = vehi_df_modif['obsm'].map(obsm_group_inverted)
vehi_df_modif['manv'] = vehi_df_modif['manv'].map(manv_group_inverted)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vehi_df_modif.catv[vehi_df_modif.catv == -1] = vehi_df_modif.catv.mode()[0]


In [77]:
vehi_df_modif.to_csv('vehicule_cleaned.csv', index=False)

In [78]:
vehi_df_modif.nunique()

Num_Acc    1176873
catv             5
obs              2
obsm             3
choc             2
manv             6
num_veh        189
dtype: int64

In [None]:
carac_df_modif = carac_df.drop(['gps', 'Accident_Id'], axis=1, inplace=False)


"""Cleaning date's column"""

def process_dates(df):
    # Vérifier les dates invalides
    invalid_dates = df[(df['jour'] > 31) | ((df['mois'] == 2) & (df['jour'] > 29)) | ((df['mois'].isin([4, 6, 9, 11])) & (df['jour'] > 30))]
    df = df.drop(invalid_dates.index)

    # Créer la colonne 'date'
    df['date'] = pd.to_datetime(df['an'].astype(str).str.zfill(2)
                                + df['mois'].astype(str).str.zfill(2)
                                + df['jour'].astype(str).str.zfill(2)
                                + df['hrmn'].astype(str).str.zfill(4),
                                format='%y%m%d%H%M', errors='coerce')

    df = df.drop(columns=['an', 'mois', 'jour', 'hrmn'])
    return df

# Utiliser la fonction sur carac_df_modif
carac_df_modif = process_dates(carac_df_modif)

"""Drop Na de Num_Acc"""

carac_df_modif = carac_df_modif.dropna(subset=['Num_Acc'])


"""Clean the column 'lum'"""

values_destribution_lum = carac_df_modif.lum[carac_df_modif['lum'] >= 0].value_counts(normalize=True)
new_values_lum = np.random.choice(values_destribution_lum.index, size = (carac_df_modif['lum'] == -1).sum(), p=values_destribution_lum.values)
carac_df_modif.loc[carac_df_modif['lum'] == -1, 'lum'] = new_values_lum


"""Delete wrong values"""

carac_df_modif = carac_df_modif.loc[carac_df_modif['int'] != -1]


"""Clean the column 'col'"""

carac_df_modif = carac_df_modif.dropna(subset=['col'])
most_frequent = carac_df_modif['col'].mode()[0]
carac_df_modif['col'] = carac_df_modif['col'].replace(-1, most_frequent)


"""Clean the column 'atm'"""

carac_df_modif = carac_df_modif.dropna(subset=['atm'])
most_frequent = carac_df_modif['atm'].mode()[0]
carac_df_modif['atm'] = carac_df_modif['atm'].replace(-1, most_frequent)


"""Clean the column 'com'"""

carac_df_modif = carac_df_modif.dropna(subset=['com'])


### Loading data

In [None]:
## accidents 2005 - 2021
data = pd.read_csv("../raw_data/Accidents/accidentsVelo.csv")

# data 2022
carac_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/carcteristiques-2022.csv', sep=";")
lieux_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/lieux-2022.csv', sep=";")
usagers_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/lieux-2022.csv', sep=";")
vehicule_2022 = pd.read_csv('/home/axl/CyclingFacilities/raw_data/Accidents/vehicules-2022.csv', sep=";")

all_2022 = carac_2022.merge(lieux_2022, left_on="Accident_Id", right_on="Num_Acc").merge(vehicule_2022, left_on="Accident_Id", right_on="Num_Acc").merge(usagers_2022, left_on="Accident_Id", right_on="Num_Acc")



Columns (8,9,20,21,30) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



In [None]:
data


Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
0,200500000030,2005-01-13,2005,janvier,jeudi,19:45,62,62331,50.3,2.84,...,0,0,0.0,2.0,8.0,11.0,200500000030B02,18,17,1.0
1,200500000034,2005-01-19,2005,janvier,mercredi,10:45,62,62022,0.0,0.0,...,0,0,0.0,2.0,1.0,1.0,200500000034B02,10,15,1.0
2,200500000078,2005-01-26,2005,janvier,mercredi,13:15,02,02173,0.0,0.0,...,1,2,0.0,2.0,1.0,1.0,200500000078B02,7,15,1.0
3,200500000093,2005-01-03,2005,janvier,lundi,13:30,02,02810,49.255,3.094,...,0,0,0.0,2.0,3.0,21.0,200500000093B02,7,21,1.0
4,200500000170,2005-01-29,2005,janvier,samedi,18:30,76,76196,0.0,0.0,...,1,9,0.0,2.0,4.0,2.0,200500000170A01,10,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74753,202100056317,2021-01-02,2021,janvier,samedi,18:30,44,44168,473777890000,-21976410000,...,2,,0.0,0.0,8.0,1.0,202100056317B01,7,14,1.0
74754,202100056362,2021-01-04,2021,janvier,lundi,08:20,64,64138,432309460000,-02765840000,...,1,2,0.0,2.0,0.0,1.0,202100056362B01,7,15,1.0
74755,202100056404,2021-01-01,2021,janvier,vendredi,16:55,54,54395,486849869839,61760189384,...,1,2,0.0,2.0,1.0,25.0,202100056404A01,7,1,1.0
74756,202100056424,2021-01-02,2021,janvier,samedi,15:40,75,75110,488769050000,23665940000,...,2,,0.0,2.0,1.0,1.0,202100056424A01,7,9,1.0


In [None]:
all_bikes_2022 = all_2022[all_2022.catv == 1]


In [None]:
all_bikes_2022.merge(data, left_on="Accident_Id", right_on="Num_Acc")



Passing 'suffixes' which cause duplicate columns {'Num_Acc_x'} in the result is deprecated and will raise a MergeError in a future version.



Unnamed: 0,Accident_Id,jour_x,mois_x,an_x,hrmn_x,lum_x,dep_x,com_x,agg_x,int_x,...,secuexist,equipement,obs_y,obsm_y,choc_y,manv_y,vehiculeid,typevehicules,manoeuvehicules,numVehicules


In [None]:
#all_bikes_2022.merge(data, how='outer', left_on="Accident_Id", right_on="Num_Acc")



Passing 'suffixes' which cause duplicate columns {'Num_Acc_x'} in the result is deprecated and will raise a MergeError in a future version.



Unnamed: 0,Accident_Id,jour_x,mois_x,an_x,hrmn_x,lum_x,dep_x,com_x,agg_x,int_x,...,secuexist,equipement,obs_y,obsm_y,choc_y,manv_y,vehiculeid,typevehicules,manoeuvehicules,numVehicules
0,2.022000e+11,21.0,10.0,2022.0,16:32,1.0,75,75106,2.0,1.0,...,,,,,,,,,,
1,2.022000e+11,20.0,10.0,2022.0,13:00,1.0,75,75105,2.0,2.0,...,,,,,,,,,,
2,2.022000e+11,21.0,10.0,2022.0,11:25,1.0,75,75113,2.0,4.0,...,,,,,,,,,,
3,2.022000e+11,21.0,10.0,2022.0,15:50,1.0,75,75103,2.0,2.0,...,,,,,,,,,,
4,2.022000e+11,21.0,10.0,2022.0,19:40,5.0,93,93049,2.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80073,,,,,,,,,,,...,2.0,,0.0,0.0,8.0,1.0,202100056317B01,7,14,1.0
80074,,,,,,,,,,,...,1.0,2,0.0,2.0,0.0,1.0,202100056362B01,7,15,1.0
80075,,,,,,,,,,,...,1.0,2,0.0,2.0,1.0,25.0,202100056404A01,7,1,1.0
80076,,,,,,,,,,,...,2.0,,0.0,2.0,1.0,1.0,202100056424A01,7,9,1.0


In [None]:
data = data.query('lat != 0 & long != 0').reset_index(drop=True)
data.dep.nunique()


113

In [None]:
data


In [None]:
paris_df = data[data.dep == "75"]
mars_df = data[data.dep == "13"]


array(['13'], dtype=object)

In [None]:
paris_df


Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
3913,201000062968,2010-10-03,2010,octobre,dimanche,18:45,75,75112,48.83464,2.43893,...,1,2,0.0,0.0,5.0,17.0,201000062968A01,1,1,1.0
3914,201000062968,2010-10-03,2010,octobre,dimanche,18:45,75,75112,48.83464,2.43893,...,1,2,0.0,0.0,3.0,1.0,201000062968B01,1,17,1.0
12273,201500050797,2015-06-23,2015,juin,mardi,18:30,75,75103,48.52011,2.21485,...,1,2,0.0,1.0,1.0,3.0,201500050797A01,,,
12274,201500052215,2015-08-13,2015,août,jeudi,20:30,75,75110,48.52501,2.21432,...,1,2,0.0,2.0,0.0,1.0,201500052215A01,31,1,1.0
12275,201500055885,2015-06-27,2015,juin,samedi,14:20,75,75118,48.53181,2.2107,...,1,2,0.0,0.0,7.0,18.0,201500055885B01,7,22,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31614,202100056069,2021-01-06,2021,janvier,mercredi,16:15,75,75112,488446739656,24057128628,...,2,,0.0,2.0,8.0,1.0,202100056069B01,7,14,1.0
31616,202100056184,2021-01-04,2021,janvier,lundi,17:35,75,75103,488623960000,23555220000,...,1,2,0.0,0.0,1.0,21.0,202100056184A01,,,
31618,202100056276,2021-01-04,2021,janvier,lundi,18:25,75,75119,488816830000,23810550000,...,1,2/6,0.0,2.0,3.0,1.0,202100056276B01,7,17,1.0
31619,202100056283,2021-01-04,2021,janvier,lundi,19:40,75,75101,488649640000,23347680000,...,2,,0.0,1.0,1.0,1.0,202100056283A01,,,


In [None]:
mars_df.columns


Index(['Num_Acc', 'date', 'an', 'mois', 'jour', 'hrmn', 'dep', 'com', 'lat',
       'long', 'agg', 'int', 'col', 'lum', 'atm', 'catr', 'circ', 'nbv',
       'prof', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ', 'grav',
       'sexe', 'age', 'trajet', 'secuexist', 'equipement', 'obs', 'obsm',
       'choc', 'manv', 'vehiculeid', 'typevehicules', 'manoeuvehicules',
       'numVehicules'],
      dtype='object')

Unnamed: 0,Num_Acc,date,an,mois,jour,hrmn,dep,com,lat,long,...,secuexist,equipement,obs,obsm,choc,manv,vehiculeid,typevehicules,manoeuvehicules,numVehicules
209,200500026234,2005-05-05,2005,mai,jeudi,14:15,13,13110,43.449,5.689,...,0,0,0.0,2.0,1.0,5.0,200500026234B02,7,1,1.0
226,200500026972,2005-05-12,2005,mai,jeudi,12:15,13,13113,43.6,5.48,...,1,2,0.0,2.0,1.0,1.0,200500026972B02,7,15,1.0
227,200500027020,2005-05-11,2005,mai,mercredi,14:15,13,13083,43.916,4.808,...,1,2,0.0,2.0,1.0,1.0,200500027020A01,7,1,1.0
294,200500039391,2005-06-01,2005,juin,mercredi,19:30,13,13019,43.445,5.362,...,0,0,0.0,2.0,1.0,1.0,200500039391B02,7,16,1.0
295,200500039402,2005-06-24,2005,juin,vendredi,16:15,13,13027,43.896,4.832,...,1,4,0.0,2.0,8.0,13.0,200500039402A01,15,17,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30867,202100042976,2021-04-24,2021,avril,samedi,07:45,13,13032,435779040000,53365450000,...,1,2,0.0,2.0,1.0,1.0,202100042976B01,7,17,1.0
30868,202100042976,2021-04-24,2021,avril,samedi,07:45,13,13032,435779040000,53365450000,...,1,2,0.0,2.0,0.0,1.0,202100042976C01,7,17,1.0
31033,202100045502,2021-04-02,2021,avril,vendredi,12:25,13,13015,434573580000,54130790000,...,1,2,0.0,2.0,1.0,1.0,202100045502B01,7,15,1.0
31228,202100048088,2021-03-15,2021,mars,lundi,08:50,13,13100,437922950000,48394720000,...,2,,0.0,0.0,4.0,19.0,202100048088B01,7,1,1.0


In [None]:
fig = px.scatter_mapbox(
    mars_df,
    lat='lat',
    lon='long',
    height=600,
    zoom=12,
    color=mars_df['grav'],
    mapbox_style="carto-positron",
)

fig.update_layout(
    margin={"r":10,"t":10,"l":10,"b":10}  # Marges droite, haut, gauche, bas à 0
)

fig.show()


TypeError: unsupported operand type(s) for +: 'float' and 'str'