# Master TIDE - Conférences Python 2020

Francis Wolinski

&copy; 2020 Yotta Conseil

# 3. Pandas : manipulations et modifications des données

In [1]:
# import des modules usuels
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# commande magique pour l'affichage des graphiques
%matplotlib inline

# options d'affichage
pd.set_option("display.max_rows", 16)
plt.style.use('seaborn-darkgrid')

In [2]:
# chargement des données
geo = pd.read_csv("correspondance-code-insee-code-postal.csv",
                   sep=';',
                   usecols=range(11),
                  index_col="Code INSEE")
geo.sort_index(inplace=True)
geo

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
01001,01400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242.0,1565.0,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273..."
01002,01640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483.0,912.0,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089..."
01004,01500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379.0,2448.0,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190..."
01005,01330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290.0,1605.0,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580..."
01006,01300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589.0,602.0,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854..."
...,...,...,...,...,...,...,...,...,...,...
97613,97650,M'TSANGAMOUJI,MAYOTTE,MAYOTTE,Chef-lieu canton,96.0,2155.0,5.0,"-12.7513099309, 45.0871696871","{""type"": ""Polygon"", ""coordinates"": [[[45.10168..."
97614,97670,OUANGANI,MAYOTTE,MAYOTTE,Chef-lieu canton,175.0,1828.0,6.6,"-12.8370955196, 45.1379095497","{""type"": ""Polygon"", ""coordinates"": [[[45.15401..."
97615,97610,PAMANDZI,MAYOTTE,MAYOTTE,Chef-lieu canton,52.0,426.0,9.1,"-12.7961353309, 45.2842063102","{""type"": ""Polygon"", ""coordinates"": [[[45.29645..."
97616,97640,SADA,MAYOTTE,MAYOTTE,Chef-lieu canton,130.0,1085.0,8.0,"-12.8611649609, 45.1185503145","{""type"": ""Polygon"", ""coordinates"": [[[45.13226..."


## 3.1 Tri
La méthode `sort_values()` permet de trier un *DataFrame* selon les valeurs d'une ou plusieurs colonnes (ordre lexicographique) et la méthode `sort_index()` selon les valeurs de l'index.

Pour trier selon l'ordre inverse on utilise l'option `ascending=False`.

Ces méthodes retournent des copies du `DataFrame` initial, sauf si l'option `inplace=True` est utilisée. Dans ce cas l'objet est effectivement modifié.

In [3]:
# tri selon l'altitude
geo.sort_values("Altitude Moyenne").head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
34192,34250,PALAVAS-LES-FLOTS,HERAULT,LANGUEDOC-ROUSSILLON,Commune simple,0.0,920.0,6.0,"43.533536456, 3.92620839566","{""type"": ""Polygon"", ""coordinates"": [[[3.907955..."
59404,59122,LES MOERES,NORD,NORD-PAS-DE-CALAIS,Commune simple,0.0,1967.0,0.8,"51.0240965472, 2.5473689484","{""type"": ""Polygon"", ""coordinates"": [[[2.572476..."
59605,59229,UXEM,NORD,NORD-PAS-DE-CALAIS,Commune simple,1.0,808.0,1.3,"51.0239750832, 2.49246889837","{""type"": ""Polygon"", ""coordinates"": [[[2.505855..."
29085,29980,ILE-TUDY,FINISTERE,BRETAGNE,Commune simple,1.0,128.0,0.7,"47.8527643575, -4.16160884308","{""type"": ""Polygon"", ""coordinates"": [[[-4.15131..."
66017,66420,LE BARCARES,PYRENEES-ORIENTALES,LANGUEDOC-ROUSSILLON,Commune simple,1.0,1539.0,4.0,"42.8127710984, 3.02834125149","{""type"": ""Polygon"", ""coordinates"": [[[3.025922..."


In [4]:
# tri selon l'altitude inverse
geo.sort_values("Altitude Moyenne", ascending=False).head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
73047,73480,BONNEVAL-SUR-ARC,SAVOIE,RHONE-ALPES,Commune simple,2713.0,11231.0,0.2,"45.3858269371, 7.09178979521","{""type"": ""Polygon"", ""coordinates"": [[[7.110695..."
73040,73480,BESSANS,SAVOIE,RHONE-ALPES,Commune simple,2649.0,15414.0,0.3,"45.292032537, 7.04532734274","{""type"": ""Polygon"", ""coordinates"": [[[7.110695..."
73290,73500,TERMIGNON,SAVOIE,RHONE-ALPES,Commune simple,2584.0,18010.0,0.4,"45.3441528771, 6.84789108583","{""type"": ""Polygon"", ""coordinates"": [[[6.854937..."
73304,73150,VAL-D'ISERE,SAVOIE,RHONE-ALPES,Commune simple,2583.0,10765.0,1.6,"45.4310113935, 6.99852444032","{""type"": ""Polygon"", ""coordinates"": [[[6.998183..."
38375,38520,SAINT-CHRISTOPHE-EN-OISANS,ISERE,RHONE-ALPES,Commune simple,2557.0,23740.0,0.1,"44.9290846615, 6.24454607842","{""type"": ""Polygon"", ""coordinates"": [[[6.336319..."


In [5]:
# tri selon l'altitude puis suivant le nom de la commune A->Z
geo.sort_values(["Altitude Moyenne", "Commune"]).head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
59404,59122,LES MOERES,NORD,NORD-PAS-DE-CALAIS,Commune simple,0.0,1967.0,0.8,"51.0240965472, 2.5473689484","{""type"": ""Polygon"", ""coordinates"": [[[2.572476..."
34192,34250,PALAVAS-LES-FLOTS,HERAULT,LANGUEDOC-ROUSSILLON,Commune simple,0.0,920.0,6.0,"43.533536456, 3.92620839566","{""type"": ""Polygon"", ""coordinates"": [[[3.907955..."
30003,30220,AIGUES-MORTES,GARD,LANGUEDOC-ROUSSILLON,Chef-lieu canton,1.0,5776.0,8.1,"43.5507249635, 4.18349802063","{""type"": ""Polygon"", ""coordinates"": [[[4.237933..."
59154,59380,COUDEKERQUE-VILLAGE,NORD,NORD-PAS-DE-CALAIS,Commune simple,1.0,1203.0,1.2,"50.9945680467, 2.42167016066","{""type"": ""Polygon"", ""coordinates"": [[[2.435660..."
29085,29980,ILE-TUDY,FINISTERE,BRETAGNE,Commune simple,1.0,128.0,0.7,"47.8527643575, -4.16160884308","{""type"": ""Polygon"", ""coordinates"": [[[-4.15131..."


In [6]:
# tri selon l'altitude puis suivant le nom de la commune Z->A
geo.sort_values(["Altitude Moyenne", "Commune"], ascending=[True, False]).head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
34192,34250,PALAVAS-LES-FLOTS,HERAULT,LANGUEDOC-ROUSSILLON,Commune simple,0.0,920.0,6.0,"43.533536456, 3.92620839566","{""type"": ""Polygon"", ""coordinates"": [[[3.907955..."
59404,59122,LES MOERES,NORD,NORD-PAS-DE-CALAIS,Commune simple,0.0,1967.0,0.8,"51.0240965472, 2.5473689484","{""type"": ""Polygon"", ""coordinates"": [[[2.572476..."
59605,59229,UXEM,NORD,NORD-PAS-DE-CALAIS,Commune simple,1.0,808.0,1.3,"51.0239750832, 2.49246889837","{""type"": ""Polygon"", ""coordinates"": [[[2.505855..."
59588,59229,TETEGHEM,NORD,NORD-PAS-DE-CALAIS,Commune simple,1.0,1896.0,7.1,"51.0158652678, 2.45278368133","{""type"": ""Polygon"", ""coordinates"": [[[2.449611..."
13096,13460,SAINTES-MARIES-DE-LA-MER,BOUCHES-DU-RHONE,PROVENCE-ALPES-COTE D'AZUR,Chef-lieu canton,1.0,37242.0,2.3,"43.50398297, 4.46396357194","{""type"": ""Polygon"", ""coordinates"": [[[4.597237..."


In [7]:
# tri selon l'index
geo = geo.sort_index()
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242.0,1565.0,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273..."
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483.0,912.0,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089..."
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379.0,2448.0,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190..."
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290.0,1605.0,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580..."
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589.0,602.0,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854..."


## 3.2 Modification de colonnes et conversions

Toutes les opérations de sélection permettent d'effectuer des modifications avec l'opérateur `=`.

Par exemple, il est possible de modifier toutes les valeurs d'une colonne.

In [8]:
# la superficie des communes est en hectares, on la passe en km2
geo["Superficie"] = geo["Superficie"] / 100.0  # ou bien geo["Superficie"] =/ 100.0
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242.0,15.65,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273..."
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483.0,9.12,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089..."
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379.0,24.48,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190..."
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290.0,16.05,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580..."
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589.0,6.02,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854..."


N. B. : La modification d'un objet issu d'un *DataFrame* (colonne par exemple) est répercutée sur l'objet initial.

On peut également appliquer une méthode de conversion de type sur une colonne.

In [9]:
# exemple de conversion
geo["Altitude Moyenne"] = geo["Altitude Moyenne"].astype(int)
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242,15.65,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273..."
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483,9.12,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089..."
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379,24.48,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190..."
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290,16.05,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580..."
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589,6.02,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854..."


Transformation d'une variable catégorielle sous forme de chaines de caractères et categorie.

A noter, la catégorie peut être ordonnée.

In [10]:
# info
geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36742 entries, 01001 to 97617
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Code Postal       36742 non-null  object 
 1   Commune           36742 non-null  object 
 2   Département       36742 non-null  object 
 3   Région            36742 non-null  object 
 4   Statut            36742 non-null  object 
 5   Altitude Moyenne  36742 non-null  int32  
 6   Superficie        36742 non-null  float64
 7   Population        36742 non-null  float64
 8   geo_point_2d      36742 non-null  object 
 9   geo_shape         36742 non-null  object 
dtypes: float64(2), int32(1), object(7)
memory usage: 2.9+ MB


In [11]:
sorted(["Commune simple", "Chef-lieu canton", "Sous-préfecture",
            "Préfecture", "Préfecture de région", "Capitale d'état"])

["Capitale d'état",
 'Chef-lieu canton',
 'Commune simple',
 'Préfecture',
 'Préfecture de région',
 'Sous-préfecture']

In [12]:
# conversion de la colonne Statut en catégorie

statuts = ["Commune simple", "Chef-lieu canton", "Sous-préfecture",
            "Préfecture", "Préfecture de région", "Capitale d'état"]

from pandas.api.types import CategoricalDtype
cat_statut = CategoricalDtype(categories=statuts, ordered=True)
geo["Statut"] = geo["Statut"].astype(cat_statut)

geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36742 entries, 01001 to 97617
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Code Postal       36742 non-null  object  
 1   Commune           36742 non-null  object  
 2   Département       36742 non-null  object  
 3   Région            36742 non-null  object  
 4   Statut            36742 non-null  category
 5   Altitude Moyenne  36742 non-null  int32   
 6   Superficie        36742 non-null  float64 
 7   Population        36742 non-null  float64 
 8   geo_point_2d      36742 non-null  object  
 9   geo_shape         36742 non-null  object  
dtypes: category(1), float64(2), int32(1), object(6)
memory usage: 2.7+ MB


In [13]:
%timeit geo.loc[geo['Statut']=='Chef-lieu canton']

2.25 ms ± 99 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
geo2 = geo.copy()
geo2['Statut'] = geo2['Statut'].astype(str)
%timeit geo2.loc[geo2['Statut']=='Chef-lieu canton']

5.2 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 3.3 Ajout de colonnes

In [15]:
# on ajoute la colonne "Densité"
geo["Densité"] = 1000 * geo["Population"] / geo["Superficie"]
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape,Densité
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242,15.65,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273...",51.118211
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483,9.12,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089...",21.929825
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379,24.48,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190...",547.385621
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290,16.05,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580...",99.688474
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589,6.02,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854...",16.611296


#### Ajout de colonnes en utilisant la méthode *apply()* appliquée à une colonne

La latitude et la longitude sont données sous la forme d'une chaîne de caractères dans la colonne *geo_point_2d*. La latitude et la longitude sont séparées par une virdule et un espace (", ")

In [16]:
# la colonne "geo_point_2d" est constituée de chaînes de caractères
geo["geo_point_2d"]

Code INSEE
01001     46.1534255214, 4.92611354223
01002     46.0091878776, 5.42801696363
01004      45.9608475114, 5.3729257777
01005     45.9961799872, 4.91227250796
01006     45.7494989044, 5.59432017366
                     ...              
97613    -12.7513099309, 45.0871696871
97614    -12.8370955196, 45.1379095497
97615    -12.7961353309, 45.2842063102
97616    -12.8611649609, 45.1185503145
97617    -12.7821666736, 45.1344279083
Name: geo_point_2d, Length: 36742, dtype: object

In [17]:
# par exemple
geo_point_2d = geo.loc["01001", "geo_point_2d"]
geo_point_2d

'46.1534255214, 4.92611354223'

Pour extraire les 2 grandeurs, on va utiliser la méthode **split()** qui retourne la liste des sous-chaînes séparées par une chaîne donnée.

On va l'utiliser avec la chaîne de séparation ", ".

N.B. : Si la chaîne était variable (par ex., un ou plusieurs espaces) on pourrait utiliser une expression réguière (module *re* de Python).

In [18]:
'toto,titi,tata'.split(',')

['toto', 'titi', 'tata']

In [19]:
'|'.join(['toto', 'titi', 'tata'])

'toto|titi|tata'

In [20]:
# application de la méthode split()
x = geo_point_2d.split(', ')
x

['46.1534255214', '4.92611354223']

Il va falloir extraire chacune des grandeurs et la transformer en nombre flottant.

In [21]:
# extraction de la latitude, indice 0 dans la liste retournée par *split()*
x[0]  # on obtient une chaîne de caractères

'46.1534255214'

In [22]:
float(x[0])  # on obtient enfin un nombre flottant représentant la latitude avec le convertisseur float()

46.1534255214

Il va falloir appliquer cette technique à chaque ligne de la colonne et aux deux grandeurs. Dans un langage classique, on appliquerait une boucle.

En pandas, la méthode *apply()* permet d'appliquer une fonction (ou une lambda) à chaque élement d'une *Series* ou d'un *DataFrame* et de retourner un objet avec le résultat de la fonction appliquée à chaque élément.

In [23]:
# on calcule  la latitude et la longitude et on ajoute les colones
geo["Latitude"] = geo["geo_point_2d"].apply(lambda x: float(x.split(', ')[0]))
geo["Longitude"] = geo["geo_point_2d"].apply(lambda x: float(x.split(', ')[1]))
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape,Densité,Latitude,Longitude
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242,15.65,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273...",51.118211,46.153426,4.926114
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483,9.12,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089...",21.929825,46.009188,5.428017
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379,24.48,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190...",547.385621,45.960848,5.372926
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290,16.05,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580...",99.688474,45.99618,4.912273
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589,6.02,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854...",16.611296,45.749499,5.59432


In [24]:
# on vérifie le type des colonne ajoutées
geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36742 entries, 01001 to 97617
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Code Postal       36742 non-null  object  
 1   Commune           36742 non-null  object  
 2   Département       36742 non-null  object  
 3   Région            36742 non-null  object  
 4   Statut            36742 non-null  category
 5   Altitude Moyenne  36742 non-null  int32   
 6   Superficie        36742 non-null  float64 
 7   Population        36742 non-null  float64 
 8   geo_point_2d      36742 non-null  object  
 9   geo_shape         36742 non-null  object  
 10  Densité           36742 non-null  float64 
 11  Latitude          36742 non-null  float64 
 12  Longitude         36742 non-null  float64 
dtypes: category(1), float64(5), int32(1), object(6)
memory usage: 4.8+ MB


Une autre manière de faire est d'utiliser la méthode `extract` capable d'extraire un motif sous forme d'expression régulière.

In [25]:
# méthode extract
geo["geo_point_2d"].str.extract("(.*), (.*)")

Unnamed: 0_level_0,0,1
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1
01001,46.1534255214,4.92611354223
01002,46.0091878776,5.42801696363
01004,45.9608475114,5.3729257777
01005,45.9961799872,4.91227250796
01006,45.7494989044,5.59432017366
...,...,...
97613,-12.7513099309,45.0871696871
97614,-12.8370955196,45.1379095497
97615,-12.7961353309,45.2842063102
97616,-12.8611649609,45.1185503145


In [26]:
# méthode extract
geo[['Latitude', 'Longitude']] = geo["geo_point_2d"].str.extract("(.*), (.*)").astype(float)
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape,Densité,Latitude,Longitude
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242,15.65,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273...",51.118211,46.153426,4.926114
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483,9.12,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089...",21.929825,46.009188,5.428017
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379,24.48,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190...",547.385621,45.960848,5.372926
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290,16.05,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580...",99.688474,45.99618,4.912273
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589,6.02,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854...",16.611296,45.749499,5.59432


<div class="alert alert-success">
<b>Exercice 1</b>
<ul>
    <li>Ajoutez une colonne 'CP Ville' avec le Code postal + un espace + et le nom de la Ville</li>
    <li>Ecrivez une fonction qui détermine la commune la plus proche d'un point à partir de sa latitude et sa longitude.</li>
    <li>Ajoutez une fonction de conversion pour pouvoir utiliser la première fonction avec un GPS (degrés, minutes, secondes).</li>
</ul>
</div>

In [3]:
geo['CP Ville'] = geo['Code Postal'] + ' ' + geo['Commune']
geo.head()

Unnamed: 0_level_0,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape,CP Ville
Code INSEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001,1400,L'ABERGEMENT-CLEMENCIAT,AIN,RHONE-ALPES,Commune simple,242.0,1565.0,0.8,"46.1534255214, 4.92611354223","{""type"": ""Polygon"", ""coordinates"": [[[4.926273...",01400 L'ABERGEMENT-CLEMENCIAT
1002,1640,L'ABERGEMENT-DE-VAREY,AIN,RHONE-ALPES,Commune simple,483.0,912.0,0.2,"46.0091878776, 5.42801696363","{""type"": ""Polygon"", ""coordinates"": [[[5.430089...",01640 L'ABERGEMENT-DE-VAREY
1004,1500,AMBERIEU-EN-BUGEY,AIN,RHONE-ALPES,Chef-lieu canton,379.0,2448.0,13.4,"45.9608475114, 5.3729257777","{""type"": ""Polygon"", ""coordinates"": [[[5.386190...",01500 AMBERIEU-EN-BUGEY
1005,1330,AMBERIEUX-EN-DOMBES,AIN,RHONE-ALPES,Commune simple,290.0,1605.0,1.6,"45.9961799872, 4.91227250796","{""type"": ""Polygon"", ""coordinates"": [[[4.895580...",01330 AMBERIEUX-EN-DOMBES
1006,1300,AMBLEON,AIN,RHONE-ALPES,Commune simple,589.0,602.0,0.1,"45.7494989044, 5.59432017366","{""type"": ""Polygon"", ""coordinates"": [[[5.614854...",01300 AMBLEON


In [30]:
# fonction recherche de ville
def ville(lat, long):
    dist = np.sqrt((geo['Latitude'] - lat)**2 + (geo['Longitude'] - long)**2)
    return geo.loc[dist.idxmin(), 'CP Ville']

In [36]:
# fonction recherche de ville
def ville(lat, long):
    dist2 = (geo['Latitude'] - lat)**2 + (geo['Longitude'] - long)**2
    return geo.loc[dist2.idxmin(), 'CP Ville']

In [32]:
# on applique la fonction à une coordonnée tirée au hasard
np.random.seed(0)
a, b = 41.5, 51.1  # latitude min et max de la France métropolitaine
lat = (b - a) * np.random.random() + a
a, b = -5.1, 9.5  # longitude min et max de la France métropolitaine
long = (b - a) * np.random.random() + a
print(lat, long)
ville(lat, long)

46.76860963770232 5.341764749037324


'71330 BOSJEAN'

In [33]:
# conversion degrés, minutes, secondes => décimal
def dms2dec(deg, mn, sec):
    return deg + mn / 60 + sec / 3600

In [34]:
# à partir de coordonnées GPS précises
ville(dms2dec(44, 40, 34), dms2dec(1, 52, 39))

'46320 ASSIER'

In [35]:
# Arbois 46° 54′ 13″ nord, 5° 46′ 29″ est
ville(dms2dec(46, 54, 13), dms2dec(5, 46, 29))

'39600 ARBOIS'

<div class="alert alert-success">
<b>Exercice 2</b>
<br />
La colonne "geo_shape" est formée de chaines de catactères en JSON (JavaScript Object Notation). Le format JSON est très utilisé comme le format XML.

<ul>
    <li>Utiliser la librairie Python json pour parser la colonne "geo_shape" : json.loads()</li>
    <li>Donner le décompte des valeurs accédées avec la clé "type".</li>
    <li>REPRISE SESSION</li>
    <li>Donner le décompte des longueurs des listes accédées avec la clé "coordinates".</li>
    <li>Quelle commune est la plus complexe géométriquement ?</li>
    <li>Quelles sont les villes qui sont de type "Polygon" mais dont la longueur des listes accédées avec la clé "coordinates" vaut 2 ?</li>
    <li>Pour ces villes vérifier que le premier polygone contient bien le second (enclave). NB : installer la librairie shapely, utiliser la classe Polygon de shapely.geometry. Sur Windows shapely peut nécessiter d'installer la dll "geos_c.dll" dans le répertoire "Library/bin" de votre environnement Python.</li>
</ul>
</div>

In [3]:
geo['geo_shape']

Code INSEE
01001    {"type": "Polygon", "coordinates": [[[4.926273...
01002    {"type": "Polygon", "coordinates": [[[5.430089...
01004    {"type": "Polygon", "coordinates": [[[5.386190...
01005    {"type": "Polygon", "coordinates": [[[4.895580...
01006    {"type": "Polygon", "coordinates": [[[5.614854...
                               ...                        
97613    {"type": "Polygon", "coordinates": [[[45.10168...
97614    {"type": "Polygon", "coordinates": [[[45.15401...
97615    {"type": "Polygon", "coordinates": [[[45.29645...
97616    {"type": "Polygon", "coordinates": [[[45.13226...
97617    {"type": "Polygon", "coordinates": [[[45.15256...
Name: geo_shape, Length: 36742, dtype: object

In [4]:
var = geo.loc['01001', 'geo_shape']
var

'{"type": "Polygon", "coordinates": [[[4.92627342692396, 46.1200517234555], [4.92199538031264, 46.12069170563269], [4.921996260482057, 46.12663667307488], [4.90936269187648, 46.129164756530656], [4.916016986429157, 46.14461766984426], [4.903080786126749, 46.157977074275074], [4.904571300671208, 46.16096059498421], [4.910603091203733, 46.180432597608316], [4.914119126867876, 46.18271538441311], [4.932895432675545, 46.18304616431434], [4.938729478686701, 46.17383114520867], [4.933038591703602, 46.16429901579509], [4.958286230120842, 46.153246666893935], [4.943330383388636, 46.145964148557894], [4.939154593056283, 46.12938048470421], [4.931056072485923, 46.12087123131487], [4.92627342692396, 46.1200517234555]]]}'

In [5]:
import json
x = json.loads(var)
x

{'type': 'Polygon',
 'coordinates': [[[4.92627342692396, 46.1200517234555],
   [4.92199538031264, 46.12069170563269],
   [4.921996260482057, 46.12663667307488],
   [4.90936269187648, 46.129164756530656],
   [4.916016986429157, 46.14461766984426],
   [4.903080786126749, 46.157977074275074],
   [4.904571300671208, 46.16096059498421],
   [4.910603091203733, 46.180432597608316],
   [4.914119126867876, 46.18271538441311],
   [4.932895432675545, 46.18304616431434],
   [4.938729478686701, 46.17383114520867],
   [4.933038591703602, 46.16429901579509],
   [4.958286230120842, 46.153246666893935],
   [4.943330383388636, 46.145964148557894],
   [4.939154593056283, 46.12938048470421],
   [4.931056072485923, 46.12087123131487],
   [4.92627342692396, 46.1200517234555]]]}

In [6]:
x['type']

'Polygon'

In [7]:
x['coordinates']#[0][0]

[[[4.92627342692396, 46.1200517234555],
  [4.92199538031264, 46.12069170563269],
  [4.921996260482057, 46.12663667307488],
  [4.90936269187648, 46.129164756530656],
  [4.916016986429157, 46.14461766984426],
  [4.903080786126749, 46.157977074275074],
  [4.904571300671208, 46.16096059498421],
  [4.910603091203733, 46.180432597608316],
  [4.914119126867876, 46.18271538441311],
  [4.932895432675545, 46.18304616431434],
  [4.938729478686701, 46.17383114520867],
  [4.933038591703602, 46.16429901579509],
  [4.958286230120842, 46.153246666893935],
  [4.943330383388636, 46.145964148557894],
  [4.939154593056283, 46.12938048470421],
  [4.931056072485923, 46.12087123131487],
  [4.92627342692396, 46.1200517234555]]]

In [9]:
geo['geo_shape'].apply(lambda x: json.loads(x)).loc['01001']

{'type': 'Polygon',
 'coordinates': [[[4.92627342692396, 46.1200517234555],
   [4.92199538031264, 46.12069170563269],
   [4.921996260482057, 46.12663667307488],
   [4.90936269187648, 46.129164756530656],
   [4.916016986429157, 46.14461766984426],
   [4.903080786126749, 46.157977074275074],
   [4.904571300671208, 46.16096059498421],
   [4.910603091203733, 46.180432597608316],
   [4.914119126867876, 46.18271538441311],
   [4.932895432675545, 46.18304616431434],
   [4.938729478686701, 46.17383114520867],
   [4.933038591703602, 46.16429901579509],
   [4.958286230120842, 46.153246666893935],
   [4.943330383388636, 46.145964148557894],
   [4.939154593056283, 46.12938048470421],
   [4.931056072485923, 46.12087123131487],
   [4.92627342692396, 46.1200517234555]]]}

In [10]:
geo['geo_shape'].apply(lambda x: json.loads(x)['type']).value_counts()

Polygon         36670
MultiPolygon       72
Name: geo_shape, dtype: int64

In [13]:
geo['geo_shape'].apply(lambda x: len(json.loads(x)['coordinates'])).value_counts()

1    36660
2       80
4        1
3        1
Name: geo_shape, dtype: int64

In [16]:
geo.loc[geo['geo_shape'].apply(lambda x: len(json.loads(x)['coordinates']))==4, 'CP Ville']

Code INSEE
83069    83400 HYERES
Name: CP Ville, dtype: object

In [17]:
geo.loc[geo['geo_shape'].apply(lambda x: len(json.loads(x)['coordinates']))==3, 'CP Ville']

Code INSEE
27701    27100 VAL-DE-REUIL
Name: CP Ville, dtype: object

In [21]:
geo.loc[(geo['geo_shape'].apply(lambda x: json.loads(x)['type']) == 'Polygon') &\
(geo['geo_shape'].apply(lambda x: len(json.loads(x)['coordinates'])) == 2),\
        'CP Ville']

Code INSEE
02232               02600 COYOLLES
05052               05600 EYGLIERS
09042    09240 LA BASTIDE-DE-SEROU
22209              22650 PLOUBALAY
2A272                20100 SARTENE
2B049              20214 CALENZANA
48080               48300 LANGOGNE
67486              67920 SUNDHOUSE
68078              68420 EGUISHEIM
71028            71270 BEAUVERNOIS
Name: CP Ville, dtype: object

In [6]:
from shapely.geometry import Polygon

Polygon?

In [7]:
ls Library\bin\geo*

 Le volume dans le lecteur C s'appelle OS
 Le num‚ro de s‚rie du volume est C689-0282

 R‚pertoire de C:\Users\Francis\Miniconda3\envs\tp\Library\bin

20/10/2020  12:56         1ÿ380ÿ352 geos.dll
20/10/2020  12:56           354ÿ304 geos_c.dll
               2 fichier(s)        1ÿ734ÿ656 octets
               0 R‚p(s)  47ÿ790ÿ641ÿ152 octets libres


Les formes géométriques des communes sont des polygones ou composées de plusieurs polygones.

## 3.4 Discrétisation de valeurs continues

La fonction `cut()` permet de discrétiser des variables continues. Elle renvoie un objet de type `Series` numérique qui partage le même index que l'objet Series initial.

A noter, on utilise la valeur `numpy.inf` qui désigne $+\infty$.

In [9]:
1e10 < np.inf

True

In [10]:
np.NINF

-inf

In [13]:
np.inf + np.NINF

nan

In [14]:
# une ville comporte plus de 2000 habitants
var = pd.cut(geo["Population"], [0.0, 2.0, np.inf], labels=["Village", "Ville"])
var

Code INSEE
01001    Village
01002    Village
01004      Ville
01005    Village
01006    Village
          ...   
97613      Ville
97614      Ville
97615      Ville
97616      Ville
97617      Ville
Name: Population, Length: 36742, dtype: category
Categories (2, object): ['Village' < 'Ville']

In [15]:
# décompte des villages et des villes de France
var.value_counts()

Village    30819
Ville       5008
Name: Population, dtype: int64

La fonction `qcut()` cherche à discrétiser les valeurs en quantiles identiques.

In [16]:
# On sépare en 2 catégories identiques
var = pd.qcut(geo["Population"], 2, labels=["MonVillage", "MaVille"])
var

Code INSEE
01001       MaVille
01002    MonVillage
01004       MaVille
01005       MaVille
01006    MonVillage
            ...    
97613       MaVille
97614       MaVille
97615       MaVille
97616       MaVille
97617       MaVille
Name: Population, Length: 36742, dtype: category
Categories (2, object): ['MonVillage' < 'MaVille']

In [17]:
# décompte des villages et des villes de France
var.value_counts()

MonVillage    18953
MaVille       17789
Name: Population, dtype: int64

In [18]:
# seuil de la population pour MonVillage
geo.loc[var == "MonVillage", "Population"].max()

0.4

In [19]:
# on vérifie qu'il s'agit bien de la médiane
geo["Population"].median()

0.4