<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Les-imports-et-configuration-du-document" data-toc-modified-id="Les-imports-et-configuration-du-document-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Les imports et configuration du document</a></span><ul class="toc-item"><li><span><a href="#La-lecture-des-fichiers-d'un-répértoire" data-toc-modified-id="La-lecture-des-fichiers-d'un-répértoire-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>La lecture des fichiers d'un répértoire</a></span></li></ul></li><li><span><a href="#La-temperature-mensuelle" data-toc-modified-id="La-temperature-mensuelle-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>La temperature mensuelle</a></span></li><li><span><a href="#La-météo-mensuelle" data-toc-modified-id="La-météo-mensuelle-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>La météo mensuelle</a></span></li><li><span><a href="#La-météo-----hebdomadaire" data-toc-modified-id="La-météo-----hebdomadaire-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>La météo     hebdomadaire</a></span></li><li><span><a href="#La-météo-----journalière" data-toc-modified-id="La-météo-----journalière-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>La météo     journalière</a></span></li></ul></div>

# Les imports et configuration du document

In [1]:
import pandas as pd 
import seaborn as sns
import warnings 
import os
import time
from datetime import datetime
from matplotlib import pyplot as plt

warnings.filterwarnings(action="ignore")

%matplotlib inline
if int(str(sns.__version__).split('.')[1]) > 8 : 
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    plt.style.use('seaborn-darkgrid')
    
sns.set(font_scale=3)

In [2]:
import glob
csvs = glob.glob(os.path.join('../donnees/meteo', '**', '*.csv'), recursive=True)

In [54]:
[fichier for fichier in csvs ]

[]

In [4]:
def lectureFichiersMeteo(repertoire='../donnees/meteo_test'):
    listeFichiers = []
    for dirname, dirnames, filenames in os.walk(repertoire):
        for filename in filenames:
            if filename.rfind('synop') >= 0 :
                listeFichiers.append(os.path.join(dirname, filename))
        
    meteo = pd.concat([ pd.read_csv(fichier,
                           sep=';',
                           usecols=['numer_sta','date','dd','ff','t','u','vv',
                                    'pres','rr1','rr3','rr6','rr12','rr24'],
                           na_values='mq',
                           dtype={'numer_sta':str,'date':str},
                          ).rename( columns={'numer_sta':'Station',
                                              'date':'DateHeure',
                                              'dd':'DirectionVent',
                                              'ff':'VitesseVent',
                                              't':'Temperature',
                                              'u':'Humidite',
                                              'vv':'Visibilite',
                                              'pres':'Pression',
                                              'rr1' :'Precipitation01',
                                              'rr3' :'Precipitation03',
                                              'rr6' :'Precipitation06',
                                              'rr12':'Precipitation12',
                                              'rr24':'Precipitation24'}) 
                         for fichier in listeFichiers],
                         ignore_index=True)
    
    
    meteo["DateHeure"] = pd.to_datetime(meteo["DateHeure"], format='%Y%m%d%H%M%S')
    meteo.Temperature  = meteo.Temperature - 273.15
    meteo.Pression     = meteo.Pression / 100
    meteo.Visibilite   = meteo.Visibilite / 1000
    meteo['Mois']      = meteo.DateHeure.dt.month
    meteo['Annee']     = meteo.DateHeure.dt.year
    meteo['AnneeMois'] = meteo.DateHeure.dt.year*100 + meteo.DateHeure.dt.month
    meteo['AnneeJour'] = meteo.DateHeure.dt.year*1000 + meteo.DateHeure.dt.day
    meteo['Semaine']   = meteo.DateHeure.dt.isocalendar().week
    meteo['MoisJour']  = meteo.DateHeure.dt.month*100 + meteo.DateHeure.dt.day
    meteo['JourA']     = meteo.DateHeure.dt.dayofyear
    return meteo    

In [7]:
! dir ..\..\_git\donnees\meteo

 Le volume dans le lecteur C s’appelle Windows-SSD
 Le numéro de série du volume est 84DC-AF7F

 Répertoire de C:\Users\etien\Documents\CCI\_HUB\cours\2024-2025\big_data\_git\donnees\meteo

12/11/2024  14:11    <DIR>          .
18/11/2024  16:02    <DIR>          ..
12/11/2024  14:11    <DIR>          2017
12/11/2024  14:11    <DIR>          2018
12/11/2024  14:11    <DIR>          2019
12/11/2024  14:11    <DIR>          2020
12/11/2024  14:11    <DIR>          2021
12/11/2024  14:11    <DIR>          2022
12/11/2024  14:11    <DIR>          2023
               0 fichier(s)                0 octets
               9 Rép(s)  721 509 429 248 octets libres


## La lecture des fichiers d'un répértoire 

In [8]:
donnees = lectureFichiersMeteo(repertoire='../../_git/donnees/meteo')

In [9]:
donnees.tail()

Unnamed: 0,Station,DateHeure,DirectionVent,VitesseVent,Temperature,Humidite,Visibilite,Pression,Precipitation01,Precipitation03,Precipitation06,Precipitation12,Precipitation24,Mois,Annee,AnneeMois,AnneeJour,Semaine,MoisJour,JourA
1172473,81401,2023-10-31 21:00:00,40.0,1.3,30.1,69.0,,1007.2,0.0,0.0,0.0,0.2,0.2,10,2023,202310,2023031,44,1031,304
1172474,81405,2023-10-31 21:00:00,80.0,4.5,29.8,75.0,48.32,1007.5,0.0,0.0,0.0,0.0,0.4,10,2023,202310,2023031,44,1031,304
1172475,81408,2023-10-31 21:00:00,60.0,3.0,30.5,67.0,,1006.9,0.0,0.0,0.0,0.0,0.0,10,2023,202310,2023031,44,1031,304
1172476,81415,2023-10-31 21:00:00,50.0,2.8,32.5,57.0,,,0.0,0.0,0.0,0.0,0.0,10,2023,202310,2023031,44,1031,304
1172477,89642,2023-10-31 21:00:00,90.0,3.1,-7.5,60.0,,949.0,,,,,,10,2023,202310,2023031,44,1031,304


In [15]:
donnees.shape

(1172478, 20)

In [17]:
postes = pd.read_csv('../../_git/donnees/postesSynop.csv',sep=';',dtype={'ID':str})
postes.Nom =postes.Nom.apply(lambda x : x if x in ['CLERMONT-FD','MONT-DE-MARSAN',
                                       'ST-PIERRE','ST-BARTHELEMY METEO'] 
                               else x[0:x.find('-')] 
                                    if x.find('-') != -1 else x).apply(lambda x : str(x).title())
postes.Altitude = postes.Altitude.astype('int16')
postes = postes[postes.ID < '08000']

In [18]:
postes.loc[postes.Latitude  < postes.Latitude.mean(),'Zone'] = 'S'
postes.loc[postes.Latitude  > postes.Latitude.mean(),'Zone'] = 'N'
postes.loc[postes.Longitude < postes.Longitude.mean(),'Zone'] += 'O'
postes.loc[postes.Longitude > postes.Longitude.mean(),'Zone'] += 'E'

In [19]:
postes.head()

Unnamed: 0,ID,Nom,Latitude,Longitude,Altitude,Zone
0,7005,Abbeville,50.136,1.834,69,NO
1,7015,Lille,50.57,3.0975,47,NE
2,7020,Pte De La Hague,49.725167,-1.939833,6,NO
3,7027,Caen,49.18,-0.456167,67,NO
4,7037,Rouen,49.383,1.181667,151,NO


In [20]:
donnees = postes.merge(donnees, how = "inner", 
                     left_on = "ID", right_on = "Station").drop(["ID","Station"], axis = "columns")

In [21]:
donnees['Precipitation'] =  donnees['Precipitation03'].combine_first(donnees['Precipitation06']/2)\
                                                      .combine_first(donnees['Precipitation12']/4)\
                                                      .combine_first(donnees['Precipitation24']/8)\
                                                      .combine_first(donnees['Precipitation01']*3)

In [22]:
donnees.drop(columns=['Precipitation06',
                    'Precipitation12',
                    'Precipitation24',
                    'Precipitation01',
                    'Precipitation03'], inplace=True)

In [23]:
donnees.set_index(['Nom', 'Latitude', 'Longitude', 'Altitude', 'Zone', 'DateHeure'],inplace=True)

In [26]:
donnees.to_parquet('../../_git/donnees/donnees_meteo.gzip',compression='gzip', engine='pyarrow')

In [28]:
!dir ..\..\_git\donnees\donnees_meteo.gzip

 Le volume dans le lecteur C s’appelle Windows-SSD
 Le numéro de série du volume est 84DC-AF7F

 Répertoire de C:\Users\etien\Documents\CCI\_HUB\cours\2024-2025\big_data\_git\donnees

18/11/2024  16:27         6 765 827 donnees_meteo.gzip
               1 fichier(s)        6 765 827 octets
               0 Rép(s)  721 498 374 144 octets libres


In [29]:
%%time
meteo = pd.read_parquet('../../_git/donnees/donnees_meteo.gzip', engine='pyarrow')
meteo.dtypes

CPU times: total: 1.27 s
Wall time: 441 ms


DirectionVent    float64
VitesseVent      float64
Temperature      float64
Humidite         float64
Visibilite       float64
Pression         float64
Mois               int32
Annee              int32
AnneeMois          int32
AnneeJour          int32
Semaine           UInt32
MoisJour           int32
JourA              int32
Precipitation    float64
dtype: object

In [30]:
meteo.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,DirectionVent,VitesseVent,Temperature,Humidite,Visibilite,Pression,Mois,Annee,AnneeMois,AnneeJour,Semaine,MoisJour,JourA,Precipitation
Nom,Latitude,Longitude,Altitude,Zone,DateHeure,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Abbeville,50.136,1.834,69,NO,2017-01-01 00:00:00,0.0,0.0,-3.9,96.0,1.74,1018.0,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136,1.834,69,NO,2017-01-01 03:00:00,0.0,0.0,-5.1,94.0,7.36,1015.8,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136,1.834,69,NO,2017-01-01 06:00:00,0.0,0.0,-4.1,96.0,3.5,1013.1,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136,1.834,69,NO,2017-01-01 09:00:00,0.0,0.0,-2.2,97.0,4.0,1011.7,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136,1.834,69,NO,2017-01-01 12:00:00,0.0,0.0,-0.9,98.0,4.0,1010.3,1,2017,201701,2017001,52,101,1,0.0


In [31]:
donnees = meteo.reset_index().groupby( ['Nom','Annee']).agg({'Temperature':['mean','median'], 
                                                             'Humidite':['mean','median'], 
                                                             'Visibilite':['mean','median']})
donnees.head(28)

Unnamed: 0_level_0,Unnamed: 1_level_0,Temperature,Temperature,Humidite,Humidite,Visibilite,Visibilite
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
Nom,Annee,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Abbeville,2017,11.424706,11.5,80.185813,83.0,17.110993,20.0
Abbeville,2018,11.557354,11.3,78.895876,83.0,15.426622,19.81
Abbeville,2019,11.505668,10.9,78.417726,82.0,16.245584,19.98
Abbeville,2020,12.129211,11.6,78.498121,83.0,16.94329,20.0
Abbeville,2022,12.040569,11.65,78.924897,84.0,16.538787,20.0
Abbeville,2023,12.855983,12.8,80.744099,84.0,16.519651,19.98
Ajaccio,2017,16.085788,15.6,70.32296,71.0,34.204374,30.0
Ajaccio,2018,16.807238,16.3,74.054889,76.0,31.980967,28.79
Ajaccio,2019,16.383946,15.8,70.415263,72.0,34.023393,32.35
Ajaccio,2020,16.369275,16.0,74.343814,75.0,34.345058,33.67


In [32]:
donnees = meteo.groupby( ['Annee']).agg({'Temperature':['mean','median'], 
                                                             'Humidite':['mean','median'], 
                                                             'Visibilite':['mean','median']})
donnees.head(28)

Unnamed: 0_level_0,Temperature,Temperature,Humidite,Humidite,Visibilite,Visibilite
Unnamed: 0_level_1,mean,median,mean,median,mean,median
Annee,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2017,12.809764,12.6,74.267394,78.0,25.681198,20.0
2018,13.377477,12.9,75.278017,79.0,24.960805,20.0
2019,13.185723,12.6,73.550953,77.0,25.78639,20.0
2020,13.526039,12.9,74.193164,78.0,26.922269,20.0
2022,13.956919,13.7,73.124933,77.0,25.500373,20.0
2023,14.789073,14.8,73.160243,76.0,25.974885,20.0


In [33]:
donnees = meteo.groupby(['JourA']).agg({'Temperature':'mean', 
                                         'Humidite':'mean', 
                                         'Visibilite':'mean'})
donnees.head(28)

Unnamed: 0_level_0,Temperature,Humidite,Visibilite
JourA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6.966796,84.595607,18.767036
2,7.228233,83.786638,21.666356
3,7.28778,82.919966,20.207028
4,7.060017,82.380482,23.088434
5,6.114243,80.605852,24.697945
6,4.945611,81.726334,21.831354
7,6.140698,82.458226,19.741284
8,7.558143,85.393521,19.996774
9,7.329602,81.6946,23.603863
10,6.653129,82.484247,25.397236


In [None]:
donnees.isna().sum()

In [51]:
donnees2 = meteo.groupby(['Nom','JourA']).agg({'Temperature':['mean','min','max','std'], 
                                         'Humidite':'mean', 
                                         'Visibilite':'mean'}).reset_index()
donnees2[donnees2['Nom'] == 'Abbeville']
meteo

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,DirectionVent,VitesseVent,Temperature,Humidite,Visibilite,Pression,Mois,Annee,AnneeMois,AnneeJour,Semaine,MoisJour,JourA,Precipitation
Nom,Latitude,Longitude,Altitude,Zone,DateHeure,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Abbeville,50.136000,1.834000,69,NO,2017-01-01 00:00:00,0.0,0.0,-3.9,96.0,1.74,1018.0,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136000,1.834000,69,NO,2017-01-01 03:00:00,0.0,0.0,-5.1,94.0,7.36,1015.8,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136000,1.834000,69,NO,2017-01-01 06:00:00,0.0,0.0,-4.1,96.0,3.50,1013.1,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136000,1.834000,69,NO,2017-01-01 09:00:00,0.0,0.0,-2.2,97.0,4.00,1011.7,1,2017,201701,2017001,52,101,1,0.0
Abbeville,50.136000,1.834000,69,NO,2017-01-01 12:00:00,0.0,0.0,-0.9,98.0,4.00,1010.3,1,2017,201701,2017001,52,101,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bastia,42.540667,9.485167,10,SE,2023-10-31 09:00:00,250.0,6.9,22.5,31.0,20.55,1007.1,10,2023,202310,2023031,44,1031,304,0.0
Bastia,42.540667,9.485167,10,SE,2023-10-31 12:00:00,140.0,6.1,24.2,26.0,19.26,1007.7,10,2023,202310,2023031,44,1031,304,0.0
Bastia,42.540667,9.485167,10,SE,2023-10-31 15:00:00,190.0,2.0,23.5,30.0,32.84,1009.4,10,2023,202310,2023031,44,1031,304,0.0
Bastia,42.540667,9.485167,10,SE,2023-10-31 18:00:00,230.0,5.1,18.9,41.0,47.48,1011.1,10,2023,202310,2023031,44,1031,304,0.0


In [52]:
donnees3 = meteo.groupby(['Nom', 'AnneeJour']).agg({'Pression': ['min', 'max'], 'Humidite': ['min', 'max']})
donnees3

Unnamed: 0_level_0,Unnamed: 1_level_0,Pression,Pression,Humidite,Humidite
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,min,max
Nom,AnneeJour,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Abbeville,2017001,989.7,1018.0,44.0,99.0
Abbeville,2017002,990.9,1020.9,46.0,99.0
Abbeville,2017003,985.1,1021.2,53.0,99.0
Abbeville,2017004,982.6,1027.3,51.0,99.0
Abbeville,2017005,984.2,1028.6,38.0,97.0
...,...,...,...,...,...
Troyes,2023027,979.8,1013.6,29.0,98.0
Troyes,2023028,983.8,1016.2,35.0,98.0
Troyes,2023029,983.0,1017.2,29.0,96.0
Troyes,2023030,983.3,1015.3,37.0,97.0
