# Exploration space
## Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


In [None]:
# Widen pandas' display limits so the wide table stays readable in outputs.
for option_name, option_value in (
    ('display.max_rows', 20),
    ('display.max_columns', 51),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## (Down)Load data

In [None]:

from loader import download_data

# Source CSV published on the Montpellier open-data portal.
url_mtp = (
    "https://data.montpellier3m.fr"
    "/sites/default/files/ressources/MMM_MMM_DAE.csv"
)

# `download_data` is a project helper; presumably it returns the CSV as a DataFrame.
data = download_data(url_mtp)

In [None]:
# Drop the columns that hold no value at all.
# Rebind instead of using inplace=True so the object returned by the loader
# is never mutated and the cell stays idempotent on re-run.
data = data.dropna(axis='columns', how='all')
data

Unnamed: 0,nom,lat_coor1,x,long_coor1,y,adr_num,adr_voie,com_cp,com_insee,com_nom,acc,acc_lib,acc_pcsec,acc_acc,acc_etg,acc_complt,photo1,photo2,disp_j,disp_h,disp_compl,tel1,tel2,site_email,date_insta,etat_fonct,fab_siren,fab_rais,mnt_siren,mnt_rais,modele,num_serie,id_euro,lc_ped,dtpr_lcped,dtpr_lcad,dtpr_bat,freq_mnt,dispsurv,dermnt,expt_siren,expt_rais,expt_tel1,expt_tel2,expt_email,ref,id,appartenan,dae_mobile
0,"Plateau sportif de GrammontTerrain 9, 10, 11",3.93392108647369,775412.3055,43.6136351580956,6.279844e+06,,avenue albert Einstein,34000,34172,Montpellier,intérieur,non,non,oui,0,,,,"lundi, mardi, mercredi, jeudi, vendredi",Heures ouvrables,-,334 67 64 87 70,,,2017-12-1,En fonctionnement,,ZOLL,,,AED PLUS,X17I955734,,oui,2021-02-02,2023-01-14,2022-12-1,tous les ans,,2019-05-15,213401722,personne morale,334 67 34 70 00,,mairie@ville-montpellier.fr,GARCIA Serge,22,Mairie,non
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,0.0000,,0.000000e+00,,,0,0,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Métropole,
2,MEDIATHEQUE William SHAKESPEARE,,0.0000,,0.000000e+00,,,0,0,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Métropole,
3,MIBI,,0.0000,,0.000000e+00,,,0,0,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Métropole,
4,MAISON POUR TOUS LEO LAGRANGE,,0.0000,,0.000000e+00,155,Rue de Bologne,34172,34080,Montpellier,intérieur,oui,non,oui,0,,,,"lundi, mardi, mercredi, jeudi, vendredi",Heures ouvrables,,+334 67 40 33 57,,,2020-10-06,En fonctionnement,,ZOLL,,,AED PLUS,X19J215834,,,2021-08-17,2024-12-28,2025-10-06,Tous les ans,,2020-10-06,213401722,personne morale,+334 67 34 70 00,,mairie@ville-montpellier.fr,,0,Mairie,non
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,Zoo de Lunaret,3.87346840734586,770497.5034,43.6409437259243,6.282823e+06,,avenue agropolis,34090,34172,Montpellier,intérieur,non,non,non,0,Réfectoire personnel,,,"mardi, mercredi, jeudi, vendredi, samedi, dima...",Heures ouvrables,,334 67 54 45 20,,,2013-09-05,En fonctionnement,,ZOLL,,,AED PLUS,X13G616857,,,2020-10-01,2023-09-09,2022-12-01,Tous les ans,,2019-05-14,213401722,personne morale,334 67 34 70 00,,mairie@ville-montpellier.fr,CORNIER,0,Mairie,non
271,Gymnase Jean Bouin,3.82081921967322,766257.7753,43.6327880433471,6.281871e+06,-,Avenue du Biterrois,34080,34172,Montpellier,interieur,non,non,non,0,,,,"lundi, mardi, mercredi, jeudi, vendredi",Heures ouvrables,,334 67 75 44 43,,,2019-11-01,En fonctionnement,,ZOLL,,,AED PLUS,X19J212182,,,2021-10-26,2024-11-03,2024-11-01,Tous les ans,,2019-11-01,213401722,personne morale,334 67 34 70 00,,mairie@ville-montpellier.fr,,0,Mairie,non
272,Centre Culturel Rabelais,3.88032026773112,771087.9776,43.6106902122358,6.279467e+06,29,boulevard Sarrail,34000,34172,Montpellier,intérieur,non,non,oui,0,,,,"lundi, mardi, mercredi, jeudi, vendredi",Heures ouvrables,,334 67 34 71 33,,,2019-12-01,En fonctionnement,,ZOLL,,,AED PLUS,X19J215337,,,2021-08-17,2024-12-28,2024-12-01,2019-12-01,,Tous les ans,213401722,personne morale,334 67 34 70 00,,mairie@ville-montpellier.fr,,76,Mairie,non
273,,3.86982233698134,770237.7162,43.6127835439949,6.279690e+06,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,34172,Montpellier,intérieur,non,non,non,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,


## Keeping only columns of interest

We want to build a new dataframe containing:
- ID (the dataframe's index)
- Name
- Address (including postal code (com_cp) and city name (com_nom))
- Contact phone number
- Maintenance frequency
- Latest maintenance date
- Longitude
- Latitude

In [None]:
# Columns needed for the target dataframe, grouped by the field they feed.
kept_columns = [
    # name
    'nom',
    # address
    'adr_num', 'adr_voie', 'com_cp', 'com_nom',
    # contact phone number
    'tel1',
    # maintenance frequency and latest maintenance date
    'freq_mnt', 'dermnt',
    # coordinates
    'lat_coor1', 'long_coor1',
]

# `filter(items=...)` keeps only the listed columns, in the listed order.
data_filter = data.filter(items=kept_columns)
data_filter


Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000,Montpellier,334 67 64 87 70,tous les ans,2019-05-15,3.93392108647369,43.6136351580956
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
2,MEDIATHEQUE William SHAKESPEARE,,,0,,,,,,
3,MIBI,,,0,,,,,,
4,MAISON POUR TOUS LEO LAGRANGE,155,Rue de Bologne,34172,Montpellier,+334 67 40 33 57,Tous les ans,2020-10-06,,
...,...,...,...,...,...,...,...,...,...,...
270,Zoo de Lunaret,,avenue agropolis,34090,Montpellier,334 67 54 45 20,Tous les ans,2019-05-14,3.87346840734586,43.6409437259243
271,Gymnase Jean Bouin,-,Avenue du Biterrois,34080,Montpellier,334 67 75 44 43,Tous les ans,2019-11-01,3.82081921967322,43.6327880433471
272,Centre Culturel Rabelais,29,boulevard Sarrail,34000,Montpellier,334 67 34 71 33,2019-12-01,Tous les ans,3.88032026773112,43.6106902122358
273,,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier,,,,3.86982233698134,43.6127835439949


In [None]:
# Check dtypes and non-null counts of the selected columns.
# NOTE(review): blank cells seem to be stored as whitespace strings, so the
# non-null counts do not reveal missing values — confirm before relying on them.
data_filter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   nom         275 non-null    object
 1   adr_num     275 non-null    object
 2   adr_voie    275 non-null    object
 3   com_cp      275 non-null    int64 
 4   com_nom     275 non-null    object
 5   tel1        275 non-null    object
 6   freq_mnt    275 non-null    object
 7   dermnt      275 non-null    object
 8   lat_coor1   275 non-null    object
 9   long_coor1  275 non-null    object
dtypes: int64(1), object(9)
memory usage: 21.6+ KB


## Extract problematic cases to specify cleaning functions

Use this space to explore the dataframe and identify formatting and sanitizing problems. As you explore the data, save the indexes of interesting problematic cases in the set `idx_problem_cases`.

For instance, if you identify examples 3 and 45 to be dirty, add them to the set with:
`idx_problem_cases.update({3, 45})`

In [None]:
# Index labels of the rows selected as dirty examples; the cells below add to it.
idx_problem_cases = set()

### Name

In [None]:
# Overview of the name column.
data_filter['nom']

0      Plateau sportif de GrammontTerrain 9, 10, 11
1                 MEDIATHEQUE JEAN-JACQUES ROUSSEAU
2                   MEDIATHEQUE William SHAKESPEARE
3                                              MIBI
4                     MAISON POUR TOUS LEO LAGRANGE
                           ...                     
270                                  Zoo de Lunaret
271                              Gymnase Jean Bouin
272                        Centre Culturel Rabelais
273                                                
274                  Maison pour tous André Chamson
Name: nom, Length: 275, dtype: object

### Address data

In [None]:
# Keep only the address-related columns (street and commune fields).
data_filter.filter(regex=r"adr_|com_", axis='columns')

Unnamed: 0,adr_num,adr_voie,com_cp,com_nom
0,,avenue albert Einstein,34000,Montpellier
1,,,0,
2,,,0,
3,,,0,
4,155,Rue de Bologne,34172,Montpellier
...,...,...,...,...
270,,avenue agropolis,34090,Montpellier
271,-,Avenue du Biterrois,34080,Montpellier
272,29,boulevard Sarrail,34000,Montpellier
273,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier


#### adr_num field

In [None]:
# Raw street-number values, to eyeball the formats in use.
data_filter['adr_num'].dropna().values

array([' ', ' ', ' ', ' ', '155', '154', '1', '3490', ' ', '20', '830',
       ' ', '135', '21', '280', '570', '14', '164', '41', '3', '28', ' ',
       '50', ' ', '10', '842', '2', '950', '14', '1', '2', ' ', '280',
       '6', '125', '655', '45', '158', '119', '14', '-', '1000', '85',
       '99', '13', '43', '170', '-', '424 - 460', '843', '196 - 156',
       '99', '300', '123', '110', '20', '230', '237', '67', '1933', '551',
       '971', '380', '100', '122', '26', '215', '-', '60', '-', ' ', '88',
       '1375', '183', '219 - 289', '45', '5', '117', '50', '8', '1',
       '19 bis', '1945', '789', '25', ' ', '1', ' ', '150', ' ', '18',
       '1', '237', '1071', '118', '111', '55', '280', ' ', ' ', '-', '-',
       '-', ' ', '50', '50', '2', '67', '419', '1184', ' ', '260', ' ',
       ' ', ' ', '175', ' ', ' ', '10', '13', '3', '1', '1247', '64', '7',
       '205', '1330', '74', '501', '-', '694 -700', '240', ' ', ' ', '16',
       '2', ' ', '1784', '50', '50', '50', '130', '18', 

In [None]:
# Street numbers that are blank or made only of whitespace/dashes.
empty = data_filter['adr_num'].str.fullmatch(r"[\s-]*")
# Street numbers made only of digits.
only_nbr = data_filter['adr_num'].str.fullmatch(r"\d+")
# Everything else: ranges, suffixes such as "bis", free text.
data_filter.loc[~(empty | only_nbr)]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
48,Ecole élémentaire Winston Churchill - Ecole ma...,424 - 460,rue du lavandin,34070,Montpellier,334 67 42 54 64,Tous les ans,2019-12-01,3.85476904201268,43.5995832643803
50,Ecole élémentaire Jean Mermoz -Ecole maternell...,196 - 156,rue de la Croix du Sud,34000,Montpellier,334 67 65 04 66,Tous les ans,2019-12-01,3.89205258481797,43.6141016426732
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire 34000 Montpellier,34000,Montpellier,334 67 64 40 54,Tous les ans,2019-12-01,3.89640622981796,43.5929214621688
81,Poste de police Ecusson Centre ville,19 bis,rue durand,34000,Montpellier,334 67 34 70 89,tous les ans,2019-04-16,3.87860752258093,43.6050174875771
130,Ecole maternelle Aliénor-d'Aquitaine - Ecole é...,694 -700,rue Jacques-Bounin,34070,Montpellier,334 67 27 46 12,Tous les ans,2019-12-01,3.86476856812559,43.5883499187015
233,Cafétéria UFR AES,257-269,rue Vendémiaire,0,Montpellier,,,,3.89935221425315,43.6030249922346
273,,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier,,,,3.86982233698134,43.6127835439949


In [None]:
# 48 and 130: number ranges; 81: "bis" suffix; 273: whole address in the number field.
idx_problem_cases |= {48, 81, 130, 273}

In [None]:
# Rows whose street number contains only whitespace.
data_filter.loc[data_filter['adr_num'].str.fullmatch(r"[\s]+")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000,Montpellier,334 67 64 87 70,tous les ans,2019-05-15,3.93392108647369,43.6136351580956
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
2,MEDIATHEQUE William SHAKESPEARE,,,0,,,,,,
3,MIBI,,,0,,,,,,
8,MOCO,,,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
255,CHRU euromed,,rue du caduce,0,Montpellier,,,,3.8350137109333,43.6425510750446
256,CHU Lapeyronie hall d'accueil,,Pont Lapeyronie,0,Montpellier,,,,3.85207584346259,43.6301375251917
257,CHRU lapeyronie,,avenue du doyen gaston giraud,0,Montpellier,,,,3.85081292585135,43.6313022653456
261,Cirad,,avenue agropolis,0,Montpellier,,,,3.86843080997092,43.6504884332442


In [None]:
# 0: blank street number next to a valid street name.
idx_problem_cases.add(0)

In [None]:
# Rows whose street number is only a dash placeholder.
data_filter.loc[data_filter['adr_num'].str.fullmatch(r"[-]+")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
40,Club Maillan,-,Rue Georges Brassens,34000,Montpellier,+334 67 52 77 92,,,,
47,Halles Jacques Coeur,-,Boulevard d'Antigone,34000,Montpellier,334 67 34 70 00,Tous les ans,2019-12-01,3.88939149901486,43.6092379806075
67,Ecole maternelle Jules-Michelet - Ecole élémen...,-,Rue des Araucarias,34080,Montpellier,334 67 10 06 05,Tous les ans,2019-12-01,3.83873856808814,43.6156815982488
69,Piscine centre nautique neptune,-,-,0,Montpellier,-,,,3.81486877448227,43.6203748790079
100,Les Halles Laissac,-,Place Alexandre Laissac,34000,Montpellier,334 67 34 70 00,Tous les ans,2019-12-01,3.87665572976354,43.6061966446183
101,Salle Guillaume-de-Nogaret,-,rue Pitot,34000,Montpellier,334 67 34 71 33,Tous les ans,2019-12-01,3.87039545403024,43.6121743782168
102,Maison pour tous Georges Brassens,-,Place Jacques-Brel,34080,Montpellier,334 67 40 40 11,Tous les ans,2019-12-01,3.81801006500697,43.6421717315658
129,Ecole élémentaire André Malraux,-,rue Joan Miro,34000,Montpellier,334 67 20 35 50,Tous les ans,2019-12-01,3.90796877117967,43.6023794136883
271,Gymnase Jean Bouin,-,Avenue du Biterrois,34080,Montpellier,334 67 75 44 43,Tous les ans,2019-11-01,3.82081921967322,43.6327880433471


In [None]:
# 69: dash placeholders in street number, street name and phone.
idx_problem_cases.add(69)

#### adr_voie field

In [None]:
# Overview of the street-name column.
data_filter['adr_voie']

0                    avenue albert Einstein
1                                          
2                                          
3                                          
4                            Rue de Bologne
                       ...                 
270                        avenue agropolis
271                     Avenue du Biterrois
272                       boulevard Sarrail
273    1 place Jacques Mirouse, MONTPELLIER
274                      rue Gustave Eiffel
Name: adr_voie, Length: 275, dtype: object

In [None]:
# Street names that embed a 5-digit postal code.
# The optional whitespace also catches codes written with a thousands gap,
# e.g. "34 000", which a plain five-digit pattern misses.
data_filter[data_filter.adr_voie.str.contains(r'\d{2}\s?\d{3}')]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire 34000 Montpellier,34000,Montpellier,334 67 64 40 54,Tous les ans,2019-12-01,3.89640622981796,43.5929214621688
97,Ecole Ludwig-Van-Beethoven,280,rue du mas Nouguier 34070 Montpellier,34070,Montpellier,334 67 27 66 71,Tous les ans,2019-12-01,3.85685043630665,43.5849431576184
108,Vestiaire/tribune CLAUDE BEAL,419,avenue du Dr Jacques Fourcade 34000 Montpellier,34000,MONTPELLIER,334 67 65 70 86,Tout les ans,2019-11-01,3.89668282061293,43.5911769531706


In [None]:
# 74: street name carries the postal code and city.
idx_problem_cases.add(74)

#### com_cp field

In [None]:
# Distinct postal codes present in the data.
data_filter['com_cp'].unique()

array([34000,     0, 34172, 34267, 34070, 34090, 34080, 34263],
      dtype=int64)

In [None]:
# Rows whose postal code is below 34000 (in practice the 0 placeholder).
data_filter.loc[data_filter['com_cp'].lt(34000)]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
2,MEDIATHEQUE William SHAKESPEARE,,,0,,,,,,
3,MIBI,,,0,,,,,,
5,Piscine M. SPILLIAERT,154,rue C.Desmoulins,0,Montpellier,334 67 42 00 92,,,3.84314264933544,43.5982457955311
8,MOCO,,,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
259,Les Jardins de Grasse,1482,rue de Saint-Priest,0,Montpellier,,,,3.83534242876355,43.6375404973304
260,Parcs Nationaux de France,1037,rue Jean François Breton,0,Montpellier,,,,3.87871734331072,43.6474178462261
261,Cirad,,avenue agropolis,0,Montpellier,,,,3.86843080997092,43.6504884332442
262,Boulodrome Bernard Gasset,122,avenue Maurice Planès,0,Montpellier,,,,3.84329173229139,43.5967806417754


In [None]:
# 1: postal code 0 with every other field blank.
idx_problem_cases.add(1)

#### com_nom field

In [None]:
# Distinct city-name spellings present in the data.
data_filter['com_nom'].unique()

array(['Montpellier', ' ', 'MONTPELLIER'], dtype=object)

In [None]:
# Rows whose city name is a single blank or the upper-case spelling.
data_filter.loc[
    data_filter['com_nom'].str.fullmatch(r' ')
    | data_filter['com_nom'].str.contains(r'MONTPELLIER')
]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
2,MEDIATHEQUE William SHAKESPEARE,,,0,,,,,,
3,MIBI,,,0,,,,,,
8,MOCO,,,0,,,,,,
11,BELEM,,,0,,,,,,
18,"EHPAD ""Michel BELORGEOT""",41.0,impasse des Moulins,34000,,+334 67 40 04 44,,,,
19,EHPAD Montpellieret,3.0,rue Fabre,34000,,+334 67 66 14 98,,,,
21,MEDIATHEQUE Victor HUGO,,,0,,,,,,
23,MEDIATHEQUE GARCIA LLORCA,,,0,,,,,,
31,DAE Extérieur rue syracuse,,Rue syracuse,34000,,334 67 34 70 26,,,,


In [None]:
# 108: upper-case city name; 18: blank city name.
idx_problem_cases |= {108, 18}

### Contact info

In [None]:
# Overview of the contact phone column.
data_filter['tel1']

0       334 67 64 87 70
1                      
2                      
3                      
4      +334 67 40 33 57
             ...       
270     334 67 54 45 20
271     334 67 75 44 43
272     334 67 34 71 33
273                    
274     334 67 75 10 55
Name: tel1, Length: 275, dtype: object

In [None]:
# First whitespace-separated token of each phone number, i.e. its prefix.
prefix = data_filter['tel1'].str.split(n=1, expand=True).loc[:, 0]
prefix.unique()

array(['334', None, '+334', '-', '336', '06', '337'], dtype=object)

In [None]:
# Phone numbers that start with a dash placeholder.
data_filter.loc[data_filter['tel1'].str.match(r"-")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
45,Eglise Notre-Dame-des-Tables,43,Rue de l'Aiguillerie,34000,Montpellier,-,Tous les ans,2019-12-01,3.87957276977398,43.6120901983009
69,Piscine centre nautique neptune,-,-,0,Montpellier,-,,,3.81486877448227,43.6203748790079
70,Piscine Olympique Antigone,,-,0,Montpellier,-,,,3.89308121814824,43.6074454556324


In [None]:
# Phone numbers written with an explicit "+33" prefix.
data_filter.loc[data_filter['tel1'].str.match(r"\+(?:33)")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
4,MAISON POUR TOUS LEO LAGRANGE,155,Rue de Bologne,34172,Montpellier,+334 67 40 33 57,Tous les ans,2020-10-06,,
9,Le Petit Prince de Boutonnet,20,Rue Emile Littré,34090,Montpellier,+334 67 72 13 12,Tous les ans,2020-11-06,,
10,EHPAD Pierre LAROQUE,830,rue de Salaison,34000,Montpellier,+334 67 16 67 00,,,,
12,EHPAD Françoise GAUFFIER,135,Zac Ovalie rue André Puig Aubert,34000,Montpellier,+334 99 54 96 70,,,,
13,Crèche Edelweiss,21,rue Général Riu,34000,Montpellier,+334 67 64 08 11,Tous les ans,2020-11-05,,
...,...,...,...,...,...,...,...,...,...,...
36,Club Laure Moulin,45,rue Frédéric Bazille,34000,Montpellier,+334 67 52 77 92,,,,
37,Club Aiguelongue,158,Rue Raoul Follereau,34000,Montpellier,+334 67 52 77 92,,,,
38,EHPAD Les Aubes,119,rue St André de Novigens,34000,Montpellier,+334 67 72 06 98,,,,
39,Club Galzy,14,allée Clémenville 34 000 - Montpellier,34000,Montpellier,+334 67 52 77 92,,,,


In [None]:
# Phone numbers that start with a leading 0.
data_filter.loc[data_filter['tel1'].str.match(r"0")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
211,DELL,1,rond-point Benjamin Franklin,0,Montpellier,06 58 57 85 24,,,3.91169364237597,43.6184228878598


In [None]:
# 69: dash instead of a phone number; 211: number starting with "06".
idx_problem_cases |= {69, 211}

### Latest maintenance date

In [None]:
# True where the latest-maintenance value starts with a YYYY-MM-DD date.
date_fmt = data_filter['dermnt'].str.match(r"\d\d\d\d-\d\d-\d\d")
# Distinct values that do not follow that format.
data_filter.loc[~date_fmt, 'dermnt'].unique()

array([' ', '2018-12-6', 'Tous les ans'], dtype=object)

In [None]:
# True where the latest-maintenance value starts with whitespace (blank cell).
empty = data_filter['dermnt'].str.match(r"\s+")
# Values that are neither well-formed dates nor blank.
data_filter.loc[~(date_fmt | empty), 'dermnt']

85        2018-12-6
272    Tous les ans
Name: dermnt, dtype: object

In [None]:
# 85: day without zero padding; 272: frequency text sitting in the date field.
idx_problem_cases |= {85, 272}

### Latitude and longitude

In [None]:
# Keep only the coordinate columns.
# NOTE(review): lat_coor1 values (~3.9) look like longitudes and long_coor1
# values (~43.6) like latitudes for Montpellier — confirm the column meaning.
data_filter.filter(regex=r"_coor", axis='columns')

Unnamed: 0,lat_coor1,long_coor1
0,3.93392108647369,43.6136351580956
1,,
2,,
3,,
4,,
...,...,...
270,3.87346840734586,43.6409437259243
271,3.82081921967322,43.6327880433471
272,3.88032026773112,43.6106902122358
273,3.86982233698134,43.6127835439949


In [None]:
# Rows whose lat_coor1 is not a decimal number and starts with a dash placeholder.
# The dot is escaped so it matches only a literal decimal point; unescaped it
# matches any character, and a plain run of digits would pass as a decimal.
data_filter[~data_filter.lat_coor1.str.match(r"\d+\.\d+")
            & data_filter.lat_coor1.str.match(r"-")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
34,Siège du CCAS (Banque d'Acceuil),125,place Thermidor,34000,Montpellier,+334 99 52 77 53,,,-,43.6020241317034


In [None]:
# 34: dash placeholder instead of a coordinate in lat_coor1.
idx_problem_cases |= {34}

In [None]:
# Rows whose long_coor1 does not start with a decimal number.
# The dot is escaped so it matches only a literal decimal point. The former
# second condition matched the empty pattern, which is true for every string,
# so it filtered nothing and has been removed.
data_filter[~data_filter.long_coor1.str.match(r"\d+\.\d+")]

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
2,MEDIATHEQUE William SHAKESPEARE,,,0,,,,,,
3,MIBI,,,0,,,,,,
4,MAISON POUR TOUS LEO LAGRANGE,155,Rue de Bologne,34172,Montpellier,+334 67 40 33 57,Tous les ans,2020-10-06,,
8,MOCO,,,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
36,Club Laure Moulin,45,rue Frédéric Bazille,34000,Montpellier,+334 67 52 77 92,,,,
37,Club Aiguelongue,158,Rue Raoul Follereau,34000,Montpellier,+334 67 52 77 92,,,,
38,EHPAD Les Aubes,119,rue St André de Novigens,34000,Montpellier,+334 67 72 06 98,,,,
39,Club Galzy,14,allée Clémenville 34 000 - Montpellier,34000,Montpellier,+334 67 52 77 92,,,,


## Review selected cases and save as sample dirty data

In [None]:
# Select the flagged rows. Row order follows the set's iteration order.
# NOTE(review): the hand-made target CSVs appear to use this same row order —
# confirm before sorting or reordering here.
sample_dirty = data_filter.loc[[*idx_problem_cases]]
sample_dirty

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000,Montpellier,334 67 64 87 70,tous les ans,2019-05-15,3.93392108647369,43.6136351580956
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
130,Ecole maternelle Aliénor-d'Aquitaine - Ecole é...,694 -700,rue Jacques-Bounin,34070,Montpellier,334 67 27 46 12,Tous les ans,2019-12-01,3.86476856812559,43.5883499187015
69,Piscine centre nautique neptune,-,-,0,Montpellier,-,,,3.81486877448227,43.6203748790079
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire 34000 Montpellier,34000,Montpellier,334 67 64 40 54,Tous les ans,2019-12-01,3.89640622981796,43.5929214621688
272,Centre Culturel Rabelais,29,boulevard Sarrail,34000,Montpellier,334 67 34 71 33,2019-12-01,Tous les ans,3.88032026773112,43.6106902122358
81,Poste de police Ecusson Centre ville,19 bis,rue durand,34000,Montpellier,334 67 34 70 89,tous les ans,2019-04-16,3.87860752258093,43.6050174875771
273,,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier,,,,3.86982233698134,43.6127835439949
18,"EHPAD ""Michel BELORGEOT""",41,impasse des Moulins,34000,,+334 67 40 04 44,,,,
211,DELL,1,rond-point Benjamin Franklin,0,Montpellier,06 58 57 85 24,,,3.91169364237597,43.6184228878598


In [None]:
# Preview the sample without rows 18 and 48.
# NOTE(review): the result is not assigned, so `sample_dirty` itself is left
# unchanged — confirm this is only meant as a preview.
sample_dirty.drop([18, 48], axis='index')

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000,Montpellier,334 67 64 87 70,tous les ans,2019-05-15,3.93392108647369,43.6136351580956
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,,,
130,Ecole maternelle Aliénor-d'Aquitaine - Ecole é...,694 -700,rue Jacques-Bounin,34070,Montpellier,334 67 27 46 12,Tous les ans,2019-12-01,3.86476856812559,43.5883499187015
69,Piscine centre nautique neptune,-,-,0,Montpellier,-,,,3.81486877448227,43.6203748790079
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire 34000 Montpellier,34000,Montpellier,334 67 64 40 54,Tous les ans,2019-12-01,3.89640622981796,43.5929214621688
272,Centre Culturel Rabelais,29,boulevard Sarrail,34000,Montpellier,334 67 34 71 33,2019-12-01,Tous les ans,3.88032026773112,43.6106902122358
81,Poste de police Ecusson Centre ville,19 bis,rue durand,34000,Montpellier,334 67 34 70 89,tous les ans,2019-04-16,3.87860752258093,43.6050174875771
273,,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier,,,,3.86982233698134,43.6127835439949
211,DELL,1,rond-point Benjamin Franklin,0,Montpellier,06 58 57 85 24,,,3.91169364237597,43.6184228878598
85,Gymnase François Spinosi,,Rue Pierre Gilles de Gennes,34000,Montpellier,334 67 15 90 35,tous les ans,2018-12-6,3.91771559166406,43.5989740313524


In [None]:
# Persist the dirty sample as CSV (human-editable) and as pickle.
# Both paths are relative to the notebook's working directory.
sample_dirty.to_csv('data/sample_dirty.csv')
sample_dirty.to_pickle('data/sample_dirty.pkl')
# sample_dirty.to_hdf('data/sample_dirty.h5')

In [None]:
# Round-trip check: reload each saved file and compare it with the in-memory frame.
# The CSV is read back with its first column as the index, the original column
# dtypes, and NaN inference disabled (blank cells are whitespace strings here).
# Without these, the reloaded frame gets an extra column and the comparison
# can never succeed.
(
pd.read_csv(
    'data/sample_dirty.csv',
    index_col=0,
    dtype=sample_dirty.dtypes.to_dict(),
    keep_default_na=False,
).equals(sample_dirty),
pd.read_pickle('data/sample_dirty.pkl').equals(sample_dirty),
# pd.read_hdf('data/sample_dirty.h5').equals(sample_dirty)
)

(False, True)

# Creating test target dataframes

## Create sample_formatted from sample_dirty
The content of this dataframe was written by hand in a copy of the CSV file. We need to load it and adjust the data formats.

In [None]:
# Load the hand-edited "formatted" sample with explicit column types:
# text columns as pandas strings, coordinates as floats, dermnt as dates.
sample_formatted = pd.read_csv(
    'data/sample_formatted.csv',
    index_col=0,
    dtype={
        **dict.fromkeys(
            ['nom', 'adr_num', 'adr_voie', 'com_cp', 'com_nom', 'tel1', 'freq_mnt'],
            'string',
        ),
        **dict.fromkeys(['lat_coor1', 'long_coor1'], 'float'),
    },
    na_values=[''],
    parse_dates=[8],  # dermnt column
)

sample_formatted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 0 to 48
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   nom         13 non-null     string        
 1   adr_num     11 non-null     string        
 2   adr_voie    13 non-null     string        
 3   com_cp      14 non-null     string        
 4   com_nom     12 non-null     string        
 5   tel1        11 non-null     string        
 6   freq_mnt    7 non-null      string        
 7   dermnt      7 non-null      datetime64[ns]
 8   lat_coor1   11 non-null     float64       
 9   long_coor1  12 non-null     float64       
dtypes: datetime64[ns](1), float64(2), string(7)
memory usage: 1.2 KB


In [None]:
# Display the formatted sample loaded above.
sample_formatted

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000,Montpellier,04 67 64 87 70,tous les ans,2019-05-15,3.933921,43.613635
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,0,,,,NaT,,
130,Ecole maternelle Aliénor-d'Aquitaine - Ecole é...,694 -700,rue Jacques-Bounin,34070,Montpellier,04 67 27 46 12,tous les ans,2019-12-01,3.864769,43.58835
69,Piscine centre nautique neptune,-,-,0,Montpellier,,,NaT,3.814869,43.620375
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire 34000 Montpellier,34000,Montpellier,04 67 64 40 54,tous les ans,2019-12-01,3.896406,43.592921
272,Centre Culturel Rabelais,29,boulevard Sarrail,34000,Montpellier,04 67 34 71 33,,NaT,3.88032,43.61069
81,Poste de police Ecusson Centre ville,19 bis,rue durand,34000,Montpellier,04 67 34 70 89,tous les ans,2019-04-16,3.878608,43.605017
273,,"1 place Jacques Mirouse, MONTPELLIER","1 place Jacques Mirouse, MONTPELLIER",34000,Montpellier,,,NaT,3.869822,43.612784
18,"EHPAD ""Michel BELORGEOT""",41,impasse des Moulins,34000,,04 67 40 04 44,,NaT,,
211,DELL,1,rond-point Benjamin Franklin,0,Montpellier,06 58 57 85 24,,NaT,3.911694,43.618423


## Create sample_sanitized from sample_formatted

Again, content defined in a csv

In [None]:
# Load the hand-edited "sanitized" sample with explicit column types:
# text columns as pandas strings, coordinates as floats, dermnt as dates.
# NOTE(review): com_cp displays as e.g. "34000.0" — the CSV likely stores the
# postal codes as floats; confirm and fix the file if so.
sample_sanitized = pd.read_csv(
    'data/sample_sanitized.csv',
    index_col=0,
    dtype={
        **dict.fromkeys(
            ['nom', 'adr_num', 'adr_voie', 'com_cp', 'com_nom', 'tel1', 'freq_mnt'],
            'string',
        ),
        **dict.fromkeys(['lat_coor1', 'long_coor1'], 'float'),
    },
    na_values=[''],
    parse_dates=[8],  # dermnt column
)


sample_sanitized.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 0 to 48
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   nom         13 non-null     string        
 1   adr_num     10 non-null     string        
 2   adr_voie    12 non-null     string        
 3   com_cp      11 non-null     string        
 4   com_nom     12 non-null     string        
 5   tel1        11 non-null     string        
 6   freq_mnt    7 non-null      string        
 7   dermnt      7 non-null      datetime64[ns]
 8   lat_coor1   11 non-null     float64       
 9   long_coor1  12 non-null     float64       
dtypes: datetime64[ns](1), float64(2), string(7)
memory usage: 1.2 KB


In [None]:
# Display the sanitized sample loaded above.
sample_sanitized

Unnamed: 0,nom,adr_num,adr_voie,com_cp,com_nom,tel1,freq_mnt,dermnt,lat_coor1,long_coor1
0,"Plateau sportif de GrammontTerrain 9, 10, 11",,avenue albert Einstein,34000.0,Montpellier,04 67 64 87 70,tous les ans,2019-05-15,3.933921,43.613635
1,MEDIATHEQUE JEAN-JACQUES ROUSSEAU,,,,,,,NaT,,
130,Ecole maternelle Aliénor-d'Aquitaine - Ecole é...,694 - 700,rue Jacques-Bounin,34070.0,Montpellier,04 67 27 46 12,tous les ans,2019-12-01,3.864769,43.58835
69,Piscine centre nautique neptune,,,,Montpellier,,,NaT,3.814869,43.620375
74,Ecole maternelle Paul-Eluard - Ecole élémentai...,219 - 289,rue de Saint Hilaire,34000.0,Montpellier,04 67 64 40 54,tous les ans,2019-12-01,3.896406,43.592921
272,Centre Culturel Rabelais,29,boulevard Sarrail,34000.0,Montpellier,04 67 34 71 33,,NaT,3.88032,43.61069
81,Poste de police Ecusson Centre ville,19 bis,rue durand,34000.0,Montpellier,04 67 34 70 89,tous les ans,2019-04-16,3.878608,43.605017
273,,1,place Jacques Mirouse,34000.0,Montpellier,,,NaT,3.869822,43.612784
18,"EHPAD ""Michel BELORGEOT""",41,impasse des Moulins,34000.0,,04 67 40 04 44,,NaT,,
211,DELL,1,rond-point Benjamin Franklin,,Montpellier,06 58 57 85 24,,NaT,3.911694,43.618423


## Compare dataframe after cleaning to sample_sanitized

In [None]:
# Project helper from the local `loader` module; presumably applies the
# formatting and sanitizing steps to a raw dataframe — verify in loader.py.
from loader import load_clean_dataframe

# Run the helper on the dirty sample.
loaded_data = load_clean_dataframe(sample_dirty)

# Strict comparison with the hand-made target: values, dtypes and row order
# must all agree for this to be True.
loaded_data.equals(sample_sanitized)

False