In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import folium
from folium.plugins import HeatMap
import requests
import geopandas as gpd
import yaml


### Clean Pistes


In [None]:
def get_datasets_url(url):
    response = requests.get(url).json()
    return {el['title']: el['latest'] for el in response['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules")}


def download_and_save_datasets(url_dict, save_path):
    for path, url in url_dict.items():
        full_path = os.path.join(save_path, path)
        if not os.path.exists(full_path):
            response = requests.get(url)
            if response.status_code == 200:
                with open(full_path, 'wb') as f:
                    f.write(response.content)


def rename_files(config_path, save_path):
    with open(config_path, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        rename_config = config.get('rename')

    for old_name, new_name in rename_config.items():
        old_file_path = os.path.join(save_path, old_name)
        new_file_path = os.path.join(save_path, new_name)

        if os.path.exists(old_file_path):
            os.rename(old_file_path, new_file_path)
            print(f"Fichier renommé : {old_name} -> {new_name}")
        else:
            print(f"Fichier non trouvé : {old_name}")


In [None]:
url = 'https://transport.data.gouv.fr/api/datasets/60a37b7f303fdf4f2654b73d'


In [162]:
fichier_geojson = '/Users/aurelienbrame/code/CyclingFacilities/raw_data/Pistes/france-20220710.geojson'

data_pistes = gpd.read_file(fichier_geojson)


In [197]:
"""Drop colonnes non-pertinentes"""

data_pistes_clean = data_pistes.drop(['num_iti', 'largeur_d', 'local_d', 'largeur_g', 'local_g', 'access_ame', 'statut_d', 'd_service'], axis=1, inplace=False)


In [198]:
"""Drop Na colonne code_com_d"""

data_pistes_clean = data_pistes_clean.dropna(subset=['code_com_d'])


In [199]:
"""Drop Na colonne regime_d"""

data_pistes_clean = data_pistes_clean.dropna(subset=['regime_d'])


In [201]:
"""Drop Na colonne sens_d"""

data_pistes_clean = data_pistes_clean.dropna(subset=['sens_d'])


In [207]:
"""Drop Na colonne ame_d"""

data_pistes_clean = data_pistes_clean.dropna(subset=['ame_d'])


In [212]:
"""Drop Na colonne code_com_g"""

data_pistes_clean = data_pistes_clean.dropna(subset=['code_com_g'])


In [219]:
"""Cleaning colonne lumiere"""

values_distribution_lumiere = data_pistes_clean.lumiere.value_counts(normalize=True)
new_values_lumiere = np.random.choice(values_distribution_lumiere.index, size = data_pistes_clean['lumiere'].isna().sum(), p=values_distribution_lumiere.values)
data_pistes_clean.loc[data_pistes_clean.lumiere.isna(), 'lumiere'] = new_values_lumiere


In [224]:
"""Cleaning colonne revet_d"""

values_distribution_revet = data_pistes_clean.revet_d.value_counts(normalize=True)
new_values_revet = np.random.choice(values_distribution_revet.index, size = data_pistes_clean['revet_d'].isna().sum(), p=values_distribution_revet.values)
data_pistes_clean.loc[data_pistes_clean.revet_d.isna(), 'revet_d'] = new_values_revet


In [229]:
"""Cleaning colonne revet_g"""

values_distribution_revetg = data_pistes_clean.revet_g.value_counts(normalize=True)
new_values_revetg = np.random.choice(values_distribution_revetg.index, size = data_pistes_clean['revet_g'].isna().sum(), p=values_distribution_revetg.values)
data_pistes_clean.loc[data_pistes_clean.revet_g.isna(), 'revet_g'] = new_values_revetg


In [234]:
"""Cleaning colonne regime_g"""

values_distribution_regimeg = data_pistes_clean.regime_g.value_counts(normalize=True)
new_values_regimeg = np.random.choice(values_distribution_regimeg.index, size = data_pistes_clean['regime_g'].isna().sum(), p=values_distribution_regimeg.values)
data_pistes_clean.loc[data_pistes_clean.regime_g.isna(), 'regime_g'] = new_values_regimeg


In [238]:
"""Cleaning colonne sens_g"""

values_distribution_sensg = data_pistes_clean.sens_g.value_counts(normalize=True)
new_values_sensg = np.random.choice(values_distribution_sensg.index, size = data_pistes_clean['sens_g'].isna().sum(), p=values_distribution_sensg.values)
data_pistes_clean.loc[data_pistes_clean.sens_g.isna(), 'sens_g'] = new_values_sensg


In [242]:
"""Cleaning colonne trafic_vit"""

values_distribution_trafic = data_pistes_clean.trafic_vit.value_counts(normalize=True)
new_values_trafic = np.random.choice(values_distribution_trafic.index, size = data_pistes_clean['trafic_vit'].isna().sum(), p=values_distribution_trafic.values)
data_pistes_clean.loc[data_pistes_clean.trafic_vit.isna(), 'trafic_vit'] = new_values_trafic


In [257]:
data_pistes_clean.head(2)


Unnamed: 0,id_local,id_osm,code_com_d,ame_d,regime_d,sens_d,revet_d,code_com_g,ame_g,regime_g,sens_g,statut_g,revet_g,date_maj,trafic_vit,lumiere,source,project_c,ref_geo,geometry
0,geovelo_377971705_31254,377971705,31254,PISTE CYCLABLE,AUTRE,UNIDIRECTIONNEL,LISSE,31254,PISTE CYCLABLE,AUTRE,UNIDIRECTIONNEL,EN SERVICE,LISSE,2020-08-10,5.0,True,Les contributeurs OpenStreetmap,4326,OpenStreetmap,"LINESTRING (1.52869 43.53102, 1.52760 43.53166)"
1,geovelo_568517126_24098,568517126,24098,PISTE CYCLABLE,AUTRE,UNIDIRECTIONNEL,LISSE,24098,PISTE CYCLABLE,AUTRE,UNIDIRECTIONNEL,EN SERVICE,LISSE,2020-06-04,5.0,True,Les contributeurs OpenStreetmap,4326,OpenStreetmap,"LINESTRING (0.72652 45.21236, 0.72615 45.21294..."


In [258]:
for col in data_pistes_clean.columns:
    print(data_pistes_clean[col].value_counts())


geovelo_42721215_49092      129
geovelo_42721215_49109      129
geovelo_41393974_17486       64
geovelo_41393974_17323       64
geovelo_177322375_03321      22
                           ... 
geovelo_1001111492_94018      1
geovelo_647877713_94041       1
geovelo_32460795_94018        1
geovelo_537250704_94018       1
geovelo_80717300_29190        1
Name: id_local, Length: 209943, dtype: int64
42721215     258
41393974     128
177322375     43
220089072     39
545866356     30
            ... 
925602466      1
219838757      1
714728902      1
664944592      1
80717300       1
Name: id_osm, Length: 204456, dtype: int64
31555    3897
44109    3196
35238    2153
67482    2046
38185    1816
         ... 
84063       1
67369       1
57064       1
74180       1
29197       1
Name: code_com_d, Length: 10632, dtype: int64
PISTE CYCLABLE                                   78001
VOIE VERTE                                       42250
BANDE CYCLABLE                                   34449
AUCUN   

In [260]:
data_pistes_clean.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 221411 entries, 0 to 259029
Data columns (total 20 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   id_local    221411 non-null  object  
 1   id_osm      221411 non-null  object  
 2   code_com_d  221411 non-null  object  
 3   ame_d       221411 non-null  object  
 4   regime_d    221411 non-null  object  
 5   sens_d      221411 non-null  object  
 6   revet_d     221411 non-null  object  
 7   code_com_g  221411 non-null  object  
 8   ame_g       221411 non-null  object  
 9   regime_g    221411 non-null  object  
 10  sens_g      221411 non-null  object  
 11  statut_g    221411 non-null  object  
 12  revet_g     221411 non-null  object  
 13  date_maj    221411 non-null  object  
 14  trafic_vit  221411 non-null  float64 
 15  lumiere     221411 non-null  object  
 16  source      221411 non-null  object  
 17  project_c   221411 non-null  object  
 18  ref_geo     2214

### Data Flux

In [30]:
fichier_compteurs = '/Users/aurelienbrame/code/CyclingFacilities/raw_data/Flux/comptage-velo-compteurs.geojson'
fichier_compteurs_data = '/Users/aurelienbrame/code/CyclingFacilities/raw_data/Flux/comptage-velo-donnees-compteurs.geojson'


In [31]:
data_flux_compteurs = gpd.read_file(fichier_compteurs)


In [33]:
data_flux_compteurs_data = gpd.read_file(fichier_compteurs_data)


In [34]:
data_flux_compteurs_data.head()


Unnamed: 0,id_compteur,nom_compteur,id,name,sum_counts,date,installation_date,url_photos_n1,coordinates,counter,photos,test_lien_vers_photos_du_site_de_comptage_,id_photo_1,url_sites,type_dimage,mois_annee_comptage,geometry
0,100003096-353242251,97 avenue Denfert Rochereau SO-NE,100003096,97 avenue Denfert Rochereau,3,2022-11-01 05:00:00+00:00,2012-02-22,https://filer.eco-counter-tools.com/file/10/6d...,,X2H20012081,https://filer.eco-counter-tools.com/file/10/6d...,https://filer.eco-counter-tools.com/file/10/6d...,https:,https://www.eco-visio.net/Photos/100003096,jpg,2022-11,POINT (2.33314 48.83504)
1,100003096-353242251,97 avenue Denfert Rochereau SO-NE,100003096,97 avenue Denfert Rochereau,2,2022-11-01 03:00:00+00:00,2012-02-22,https://filer.eco-counter-tools.com/file/10/6d...,,X2H20012081,https://filer.eco-counter-tools.com/file/10/6d...,https://filer.eco-counter-tools.com/file/10/6d...,https:,https://www.eco-visio.net/Photos/100003096,jpg,2022-11,POINT (2.33314 48.83504)
2,100003096-353242251,97 avenue Denfert Rochereau SO-NE,100003096,97 avenue Denfert Rochereau,5,2022-11-01 04:00:00+00:00,2012-02-22,https://filer.eco-counter-tools.com/file/10/6d...,,X2H20012081,https://filer.eco-counter-tools.com/file/10/6d...,https://filer.eco-counter-tools.com/file/10/6d...,https:,https://www.eco-visio.net/Photos/100003096,jpg,2022-11,POINT (2.33314 48.83504)
3,100003096-353242251,97 avenue Denfert Rochereau SO-NE,100003096,97 avenue Denfert Rochereau,8,2022-11-01 06:00:00+00:00,2012-02-22,https://filer.eco-counter-tools.com/file/10/6d...,,X2H20012081,https://filer.eco-counter-tools.com/file/10/6d...,https://filer.eco-counter-tools.com/file/10/6d...,https:,https://www.eco-visio.net/Photos/100003096,jpg,2022-11,POINT (2.33314 48.83504)
4,100003096-353242251,97 avenue Denfert Rochereau SO-NE,100003096,97 avenue Denfert Rochereau,60,2022-11-01 08:00:00+00:00,2012-02-22,https://filer.eco-counter-tools.com/file/10/6d...,,X2H20012081,https://filer.eco-counter-tools.com/file/10/6d...,https://filer.eco-counter-tools.com/file/10/6d...,https:,https://www.eco-visio.net/Photos/100003096,jpg,2022-11,POINT (2.33314 48.83504)


In [36]:
data_flux_compteurs_data.shape


(926954, 17)

In [38]:
data_flux_compteurs_data.columns


Index(['id_compteur', 'nom_compteur', 'id', 'name', 'sum_counts', 'date',
       'installation_date', 'url_photos_n1', 'coordinates', 'counter',
       'photos', 'test_lien_vers_photos_du_site_de_comptage_', 'id_photo_1',
       'url_sites', 'type_dimage', 'mois_annee_comptage', 'geometry'],
      dtype='object')

In [None]:
data_flux_compteurs_data.drop('url_photos_n1', axis=1, inplace=True)


In [40]:
data_flux_compteurs_data.drop('type_dimage', axis=1, inplace=True)


In [47]:
data_flux_compteurs_data.drop('photos', axis=1, inplace=True)


In [49]:
data_flux_compteurs_data.drop('id_photo_1', axis=1, inplace=True)


In [54]:
data_flux_compteurs_data.drop('test_lien_vers_photos_du_site_de_comptage_', axis=1, inplace=True)


In [56]:
data_flux_compteurs_data.drop('url_sites', axis=1, inplace=True)


In [95]:
data_flux_compteurs_data.drop('coordinates', axis=1, inplace=True)


In [97]:
data_flux_compteurs_data.drop('name', axis=1, inplace=True)


In [99]:
data_flux_compteurs_data.drop('id', axis=1, inplace=True)


In [100]:
data_flux_compteurs_data.head()


Unnamed: 0,id_compteur,nom_compteur,sum_counts,date,installation_date,counter,mois_annee_comptage,geometry
0,100003096-353242251,97 avenue Denfert Rochereau SO-NE,3,2022-11-01 05:00:00+00:00,2012-02-22,X2H20012081,2022-11,POINT (2.33314 48.83504)
1,100003096-353242251,97 avenue Denfert Rochereau SO-NE,2,2022-11-01 03:00:00+00:00,2012-02-22,X2H20012081,2022-11,POINT (2.33314 48.83504)
2,100003096-353242251,97 avenue Denfert Rochereau SO-NE,5,2022-11-01 04:00:00+00:00,2012-02-22,X2H20012081,2022-11,POINT (2.33314 48.83504)
3,100003096-353242251,97 avenue Denfert Rochereau SO-NE,8,2022-11-01 06:00:00+00:00,2012-02-22,X2H20012081,2022-11,POINT (2.33314 48.83504)
4,100003096-353242251,97 avenue Denfert Rochereau SO-NE,60,2022-11-01 08:00:00+00:00,2012-02-22,X2H20012081,2022-11,POINT (2.33314 48.83504)


In [102]:
data_flux_months = pd.to_datetime(data_flux_compteurs_data['date'])
counts_by_month = data_flux_compteurs_data.groupby(data_flux_compteurs_data['date'].dt.to_period('M')).sum()
counts_by_month




Unnamed: 0_level_0,sum_counts
date,Unnamed: 1_level_1
2022-03,0
2022-04,0
2022-07,0
2022-08,5
2022-11,4828098
2022-12,3290438
2023-01,4363337
2023-02,4778604
2023-03,5546875
2023-04,5162509


### Test loop data 

In [39]:
# Call API Data Accidents
def get_datasets_url(url=None):
    url='https://www.data.gouv.fr/api/1/datasets/53698f4ca3a729239d2036df/'
    r = requests.get(url).json()
    return {el['title']: el['latest'] for el in r['resources'] if el['title'].endswith(".csv") and not el['title'].startswith("vehicules-immatricules") }

lieux_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("lieux")}
usagers_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("usagers")}
car_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("car")}
vehicule_datasets = {i:j for i,j in get_datasets_url().items() if i.startswith("vehicule")}

all_urls = [lieux_datasets, usagers_datasets, car_datasets,vehicule_datasets]

### download csvs if not already

for url_dict in all_urls:
    for file_name, url in url_dict.items():
        file_name = '../raw_data/' + file_name
        if not os.path.exists(file_name):
            response = requests.get(url)
            if response.status_code == 200:
                with open(file_name, 'wb') as f:
                    f.write(response.content)

### check datasets

folder = "../raw_data/"
os.listdir(folder)


['usagers_2006.csv',
 'usagers_2012.csv',
 'usagers_2013.csv',
 'usagers_2007.csv',
 'lieux_2016.csv',
 'lieux_2014.csv',
 'usagers_2011.csv',
 'usagers_2005.csv',
 'usagers_2010.csv',
 'lieux_2015.csv',
 'lieux_2011.csv',
 'lieux_2005.csv',
 'usagers_2014.csv',
 '.DS_Store',
 'usagers_2015.csv',
 'lieux_2010.csv',
 'lieux_2006.csv',
 'lieux_2012.csv',
 'vehicules_2009.csv',
 'caracteristiques-2017.csv',
 'Flux',
 'usagers_2016.csv',
 'vehicules_2008.csv',
 'lieux_2013.csv',
 'lieux_2007.csv',
 'lieux-2021.csv',
 'usagers-2018.csv',
 'Accidents',
 'usagers-2019.csv',
 'lieux-2020.csv',
 'lieux-2022.csv',
 'usagers-2022.csv',
 'vehicules-2017.csv',
 'lieux-2018.csv',
 'usagers-2021.csv',
 'caracteristiques_2009.csv',
 'caracteristiques_2008.csv',
 'usagers-2020.csv',
 'lieux-2019.csv',
 'caracteristiques_2011.csv',
 'caracteristiques_2005.csv',
 'caracteristiques_2010.csv',
 'vehicules-2018.csv',
 'lieux-2017.csv',
 'caracteristiques_2006.csv',
 'caracteristiques_2012.csv',
 'caracteris

In [40]:
chemin_fichier_yml = '../config.yml'
chemin_dossier = '../raw_data'


with open (chemin_fichier_yml, 'r') as f:
    config = yaml.safe_load(f)
    rename_config = config.get('rename')

for old_name, new_name in rename_config.items():
    chemin_ancien_fichier = os.path.join(chemin_dossier, old_name)
    chemin_nouveau_fichier = os.path.join(chemin_dossier, new_name)

    if os.path.exists(chemin_ancien_fichier):
        os.rename(chemin_ancien_fichier, chemin_nouveau_fichier)
        print(f"Fichier renommé : {old_name} -> {new_name}")
    else:
        print(f"Fichier non trouvé : {old_name}")


Fichier renommé : caracteristiques-2017.csv -> caracteristiques_2017.csv
Fichier renommé : caracteristiques-2018.csv -> caracteristiques_2018.csv
Fichier renommé : caracteristiques-2019.csv -> caracteristiques_2019.csv
Fichier renommé : caracteristiques-2020.csv -> caracteristiques_2020.csv
Fichier renommé : carcteristiques-2021.csv -> caracteristiques_2021.csv
Fichier renommé : carcteristiques-2022.csv -> caracteristiques_2022.csv
Fichier renommé : lieux-2017.csv -> lieux_2017.csv
Fichier renommé : lieux-2018.csv -> lieux_2018.csv
Fichier renommé : lieux-2019.csv -> lieux_2019.csv
Fichier renommé : lieux-2020.csv -> lieux_2020.csv
Fichier renommé : lieux-2021.csv -> lieux_2021.csv
Fichier renommé : lieux-2022.csv -> lieux_2022.csv
Fichier renommé : usagers-2017.csv -> usagers_2017.csv
Fichier renommé : usagers-2018.csv -> usagers_2018.csv
Fichier renommé : usagers-2019.csv -> usagers_2019.csv
Fichier renommé : usagers-2020.csv -> usagers_2020.csv
Fichier renommé : usagers-2021.csv -> 

In [6]:
# Passer par fonction et var env
def concat_files(starting_word):

    chemin_fichier_yml = '../config.yml'
    with open(chemin_fichier_yml, 'r') as f:
        config = yaml.safe_load(f)
        config_sep = config.get('sep')
        config_encoding = config.get('encoding')

    chemin_dossier = '../raw_data/'

    df_concat = pd.DataFrame()
    files = [file for file in os.listdir(chemin_dossier) if file.endswith('.csv') and file.startswith(starting_word)]

    print(files)
    for file in files:
        chemin_fichier = os.path.join(chemin_dossier, file)

        if file in config_sep:
            sep = config_sep[file]
        else:
            sep = ','

        if file in config_encoding:
            encoding = config_encoding[file]
        else:
            encoding = 'utf-8'

        df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)

        df_concat = pd.concat([df_concat, df1])

    return df_concat

#utiliser yield ?


In [7]:
carac_df = concat_files("caracteristiques")
lieux_df = concat_files("lieux")
usager_df = concat_files("usagers")
vehi_df = concat_files("vehicules")

# vehi_df.to_csv('../raw_data/vehi.csv', index=False)
# carac_df.to_csv('../raw_data/carac.csv', index=False)
# lieux_df.to_csv('../raw_data/lieux.csv', index=False)
# usager_df.to_csv('../raw_data/usagers.csv', index=False)


['caracteristiques_2018.csv', 'caracteristiques_2019.csv', 'caracteristiques_2022.csv', 'caracteristiques_2009.csv', 'caracteristiques_2021.csv', 'caracteristiques_2020.csv', 'caracteristiques_2008.csv', 'caracteristiques_2011.csv', 'caracteristiques_2005.csv', 'caracteristiques_2010.csv', 'caracteristiques_2006.csv', 'caracteristiques_2012.csv', 'caracteristiques_2013.csv', 'caracteristiques_2007.csv', 'caracteristiques_2017.csv', 'caracteristiques_2016.csv', 'caracteristiques_2014.csv', 'caracteristiques_2015.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['lieux_2017.csv', 'lieux_2016.csv', 'lieux_2014.csv', 'lieux_2015.csv', 'lieux_2011.csv', 'lieux_2005.csv', 'lieux_2010.csv', 'lieux_2006.csv', 'lieux_2012.csv', 'lieux.csv', 'lieux_2013.csv', 'lieux_2007.csv', 'lieux_2022.csv', 'lieux_2009.csv', 'lieux_2021.csv', 'lieux_2020.csv', 'lieux_2008.csv', 'lieux_2018.csv', 'lieux_2019.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)
  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['usagers_2006.csv', 'usagers_2012.csv', 'usagers_2013.csv', 'usagers_2007.csv', 'usagers_2011.csv', 'usagers_2005.csv', 'usagers_2010.csv', 'usagers.csv', 'usagers_2014.csv', 'usagers_2015.csv', 'usagers_2017.csv', 'usagers_2016.csv', 'usagers_2018.csv', 'usagers_2019.csv', 'usagers_2021.csv', 'usagers_2009.csv', 'usagers_2008.csv', 'usagers_2020.csv', 'usagers_2022.csv']


  df1 = pd.read_csv(chemin_fichier, sep=sep, encoding=encoding)


['vehicules_2018.csv', 'vehicules_2019.csv', 'vehicules_2022.csv', 'vehicules_2009.csv', 'vehicules_2021.csv', 'vehicules_2020.csv', 'vehicules_2008.csv', 'vehicules_2005.csv', 'vehicules_2011.csv', 'vehicules_2010.csv', 'vehicules_2012.csv', 'vehicules_2006.csv', 'vehicules_2007.csv', 'vehicules_2013.csv', 'vehicules_2017.csv', 'vehicules_2016.csv', 'vehicules_2014.csv', 'vehicules_2015.csv']


### DF Vehicules


In [111]:
"""Drop columns occutc, num_veh, motor, id_vehicule, senc"""

vehi_df_modif = vehi_df.drop(['occutc','motor', 'id_vehicule','senc'], axis=1, inplace=False)


In [55]:
vehi_df_modif.head()


Unnamed: 0,Num_Acc,catv,obs,obsm,choc,manv,num_veh
0,201800000001,7,0.0,2.0,3.0,1.0,B01
1,201800000001,7,0.0,2.0,2.0,15.0,A01
2,201800000002,7,0.0,1.0,0.0,1.0,A01
3,201800000003,33,1.0,2.0,1.0,1.0,A01
4,201800000003,7,0.0,2.0,8.0,15.0,B01


In [6]:
# id = vehi_df_modif[vehi_df_modif['catv'] == 1].Num_Acc.values
# id_acc_bikes = vehi_df_modif[vehi_df_modif.Num_Acc.isin(id)]
# id_acc_bikes[id_acc_bikes['catv'] !=1]


Unnamed: 0,Num_Acc,catv,obs,obsm,choc,manv
7,201800000004,7,0.0,9.0,2.0,21.0
29,201800000020,7,12.0,2.0,3.0,2.0
70,201800000044,7,0.0,9.0,2.0,17.0
80,201800000051,7,0.0,9.0,1.0,0.0
90,201800000056,7,0.0,9.0,2.0,2.0
...,...,...,...,...,...,...
99576,201500058524,30,0.0,9.0,1.0,19.0
99706,201500058611,7,16.0,0.0,3.0,0.0
99715,201500058616,15,0.0,9.0,2.0,14.0
99739,201500058629,7,0.0,2.0,6.0,0.0


In [60]:
"""Clean the column 'obs'"""

values_destribution_obs = vehi_df_modif.obs[vehi_df_modif.obs >= 0].value_counts(normalize=True)
new_values_obs = np.random.choice(values_destribution_obs.index, size = vehi_df_modif['obs'].isna().sum(), p=values_destribution_obs.values)
vehi_df_modif.loc[vehi_df_modif['obs'].isna(), 'obs'] = new_values_obs
new_values_obs2 = np.random.choice(values_destribution_obs.index, size = (vehi_df_modif['obs'] == -1).sum(), p=values_destribution_obs.values)
vehi_df_modif.loc[vehi_df_modif['obs'] == -1, 'obs'] = new_values_obs2


In [61]:
"""Cleaning the column 'obsm'"""

values_destribution_obsm = vehi_df_modif.obsm[vehi_df_modif.obsm >= 0].value_counts(normalize=True)
new_values_obsm = np.random.choice(values_destribution_obsm.index, size = vehi_df_modif['obsm'].isna().sum(), p=values_destribution_obsm.values)
vehi_df_modif.loc[vehi_df_modif['obsm'].isna(), 'obsm'] = new_values_obsm
new_values_obsm2 = np.random.choice(values_destribution_obsm.index, size = (vehi_df_modif['obsm'] == -1).sum(), p=values_destribution_obsm.values)
vehi_df_modif.loc[vehi_df_modif['obsm'] == -1, 'obsm'] = new_values_obsm2


In [62]:
"""Clean the column 'choc'"""

values_destribution_choc = vehi_df_modif.choc[vehi_df_modif.choc > 0].value_counts(normalize=True)
new_values_choc = np.random.choice(values_destribution_choc.index, size = vehi_df_modif['choc'].isna().sum(), p=values_destribution_choc.values)
vehi_df_modif.loc[vehi_df_modif['choc'].isna(), 'choc'] = new_values_choc
new_values_choc2 = np.random.choice(values_destribution_choc.index, size = (vehi_df_modif['choc'] == -1).sum(), p=values_destribution_choc.values)
vehi_df_modif.loc[vehi_df_modif['choc'] == -1, 'choc'] = new_values_choc2
new_values_choc3 = np.random.choice(values_destribution_choc.index, size = (vehi_df_modif['choc'] == 0).sum(), p=values_destribution_choc.values)
vehi_df_modif.loc[vehi_df_modif['choc'] == 0, 'choc'] = new_values_choc3


In [63]:
"""Clean the column 'manv'"""""

values_destribution_manv = vehi_df_modif.manv[vehi_df_modif.choc > 0].value_counts(normalize=True)
new_values_manv = np.random.choice(values_destribution_manv.index, size = vehi_df_modif['manv'].isna().sum(), p=values_destribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'].isna(), 'manv'] = new_values_manv
new_values_manv2 = np.random.choice(values_destribution_manv.index, size = (vehi_df_modif['manv'] == -1).sum(), p=values_destribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'] == -1, 'manv'] = new_values_manv2
new_values_manv3 = np.random.choice(values_destribution_manv.index, size = (vehi_df_modif['manv'] == 0).sum(), p=values_destribution_manv.values)
vehi_df_modif.loc[vehi_df_modif['manv'] == 0, 'manv'] = new_values_manv3


In [64]:
vehi_df_modif.to_csv('../clean_data/vehicules_clean.csv', index=False)


(1176873, 17)

2.022000e+11    1
2.022000e+11    1
2.022000e+11    1
2.022000e+11    1
2.022000e+11    1
               ..
2.022000e+11    1
2.022000e+11    1
2.022000e+11    1
2.022000e+11    1
2.022001e+11    1
Name: Accident_Id, Length: 55302, dtype: int64

In [121]:
carac_df_modif = carac_df.drop(['gps', 'Accident_Id'], axis=1, inplace=False)


In [122]:
carac_df_modif.head()


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,lat,long,dep
0,201800000000.0,18,1,24,1505,1,1,4,1.0,1.0,5,route des Ansereuilles,5055737.0,294992.0,590
1,201800000000.0,18,2,12,1015,1,2,7,7.0,7.0,11,Place du général de Gaul,5052936.0,293151.0,590
2,201800000000.0,18,3,4,1135,1,2,3,1.0,7.0,477,Rue nationale,5051243.0,291714.0,590
3,201800000000.0,18,5,5,1735,1,2,1,7.0,3.0,52,30 rue Jules Guesde,5051974.0,289123.0,590
4,201800000000.0,18,6,26,1605,1,2,1,1.0,3.0,477,72 rue Victor Hugo,5051607.0,290605.0,590


In [132]:
carac_df_modif


Unnamed: 0,Num_Acc,lum,agg,int,atm,col,com,adr,lat,long,dep,date
0,2.018000e+11,1,1,4,1.0,1.0,5,route des Ansereuilles,5055737.0,294992.0,590,2018-01-24 15:05:00
1,2.018000e+11,1,2,7,7.0,7.0,11,Place du général de Gaul,5052936.0,293151.0,590,2018-02-12 10:15:00
2,2.018000e+11,1,2,3,1.0,7.0,477,Rue nationale,5051243.0,291714.0,590,2018-03-04 11:35:00
3,2.018000e+11,1,2,1,7.0,3.0,52,30 rue Jules Guesde,5051974.0,289123.0,590,2018-05-05 17:35:00
4,2.018000e+11,1,2,1,1.0,3.0,477,72 rue Victor Hugo,5051607.0,290605.0,590,2018-06-26 16:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...
58649,2.015001e+11,1,2,1,1.0,7.0,611,kaweni-Kaweni nord ( Z I,0.0,0.0,976,2015-11-14 17:14:00
58650,2.015001e+11,1,2,1,1.0,6.0,611,m'tsapere-Stade (Bouleva,0.0,0.0,976,2015-11-12 07:30:00
58651,2.015001e+11,1,2,1,1.0,1.0,611,kaweni-Kaweni sud (Z I),0.0,0.0,976,2015-11-10 15:08:00
58652,2.015001e+11,1,2,1,1.0,6.0,611,m'tsapere-Roahandra ( ru,0.0,0.0,976,2015-11-15 16:30:00


In [127]:
"""Cleaning date's column"""

def process_dates(df):
    # Vérifier les dates invalides
    invalid_dates = df[(df['jour'] > 31) | ((df['mois'] == 2) & (df['jour'] > 29)) | ((df['mois'].isin([4, 6, 9, 11])) & (df['jour'] > 30))]
    df = df.drop(invalid_dates.index)

    # Créer la colonne 'date'
    df['date'] = pd.to_datetime(df['an'].astype(str).str.zfill(2)
                                + df['mois'].astype(str).str.zfill(2)
                                + df['jour'].astype(str).str.zfill(2)
                                + df['hrmn'].astype(str).str.zfill(4),
                                format='%y%m%d%H%M', errors='coerce')

    df = df.drop(columns=['an', 'mois', 'jour', 'hrmn'])
    return df

# Utiliser la fonction sur carac_df_modif
carac_df_modif = process_dates(carac_df_modif)


In [86]:
"""Drop Na de Num_Acc"""

carac_df_modif = carac_df_modif.dropna(subset=['Num_Acc'])


In [87]:
"""Clean the column 'lum'"""

values_destribution_lum = carac_df_modif.lum[carac_df_modif['lum'] >= 0].value_counts(normalize=True)
new_values_lum = np.random.choice(values_destribution_lum.index, size = (carac_df_modif['lum'] == -1).sum(), p=values_destribution_lum.values)
carac_df_modif.loc[carac_df_modif['lum'] == -1, 'lum'] = new_values_lum


In [88]:
"""Delete wrong values"""

carac_df_modif = carac_df_modif.loc[carac_df_modif['int'] != -1]


In [89]:
"""Clean the column 'col'"""

carac_df_modif = carac_df_modif.dropna(subset=['col'])
most_frequent = carac_df_modif['col'].mode()[0]
carac_df_modif['col'] = carac_df_modif['col'].replace(-1, most_frequent)


In [90]:
"""Clean the column 'atm'"""

carac_df_modif = carac_df_modif.dropna(subset=['atm'])
most_frequent = carac_df_modif['atm'].mode()[0]
carac_df_modif['atm'] = carac_df_modif['atm'].replace(-1, most_frequent)


In [94]:
"""Clean the column 'com'"""

carac_df_modif = carac_df_modif.dropna(subset=['com'])


In [116]:
carac_df_modif.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1176873 entries, 0 to 58653
Data columns (total 15 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   Num_Acc  1121571 non-null  float64
 1   an       1176873 non-null  int64  
 2   mois     1176873 non-null  int64  
 3   jour     1176873 non-null  int64  
 4   hrmn     1176873 non-null  object 
 5   lum      1176873 non-null  int64  
 6   agg      1176873 non-null  int64  
 7   int      1176873 non-null  int64  
 8   atm      1176800 non-null  float64
 9   col      1176854 non-null  float64
 10  com      1176871 non-null  object 
 11  adr      1032364 non-null  object 
 12  lat      689805 non-null   object 
 13  long     689801 non-null   object 
 14  dep      1176873 non-null  object 
dtypes: float64(3), int64(6), object(6)
memory usage: 143.7+ MB


In [97]:
carac_df_modif['long'].value_counts(), carac_df_modif['long'].isna().sum(), carac_df_modif.shape


(0.0             107361
 0                30279
 -                 3208
 345699.0           162
 228547.0           131
                  ...  
 6,7134970000         1
 1,4868854932         1
 5,8189990000         1
 4,8354970000         1
 5548925.0            1
 Name: long, Length: 363784, dtype: int64,
 487021,
 (1121482, 15))

In [134]:
carac_df_modif.to_csv('../clean_data/caracteristiques_clean.csv', index=False)
