<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Les-imports-et-configuration-du-document" data-toc-modified-id="Les-imports-et-configuration-du-document-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Les imports et configuration du document</a></span><ul class="toc-item"><li><span><a href="#La-lecture-des-fichiers-d'un-répértoire" data-toc-modified-id="La-lecture-des-fichiers-d'un-répértoire-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>La lecture des fichiers d'un répértoire</a></span></li></ul></li><li><span><a href="#La-temperature-mensuelle" data-toc-modified-id="La-temperature-mensuelle-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>La temperature mensuelle</a></span></li><li><span><a href="#La-météo-mensuelle" data-toc-modified-id="La-météo-mensuelle-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>La météo mensuelle</a></span></li><li><span><a href="#La-météo-----hebdomadaire" data-toc-modified-id="La-météo-----hebdomadaire-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>La météo     hebdomadaire</a></span></li><li><span><a href="#La-météo-----journalière" data-toc-modified-id="La-météo-----journalière-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>La météo     journalière</a></span></li></ul></div>

# Les imports et configuration du document

In [21]:
import pandas as pd 
import seaborn as sns
import warnings 
import os
import time
from datetime import datetime
from matplotlib import pyplot as plt

warnings.filterwarnings(action="ignore")

%matplotlib inline
if int(str(sns.__version__).split('.')[1]) > 8 : 
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    plt.style.use('seaborn-darkgrid')
    
sns.set(font_scale=3)

# os.chdir("donnees")

In [22]:
def tempsExecution(func):
    def chronometrerExécution(*_args, **_kwargs): 
        t0 = time.perf_counter()
        result = func(*_args, **_kwargs) 
        elapsed = time.perf_counter() - t0
        name = func.__name__
        args = ', '.join(repr(arg) for arg in _args) \
               if len(_args) == 1 \
               else ', '.join(arg+'='+repr(_kwargs[arg]) for arg in _kwargs)
        print(f'exécution {elapsed:0.8f}s\nfonction {name}({args})')
        return result
    return chronometrerExécution

In [24]:
@tempsExecution
def lectureFichiersMeteo(repertoire='../donnees/meteo_test'):
    meteo = pd.DataFrame()

    for dirname, _, filenames in os.walk(repertoire):
        for filename in filenames:
            # if filename.rfind('synop') >= 0 :
            meteo = pd.concat([meteo,
                               pd.read_csv(os.path.join(dirname, filename),
                                           sep=';',
                                           usecols=['numer_sta','date','dd','ff','t','u','vv',
                                                    'pres','rr1','rr3','rr6','rr12','rr24'],
                                           na_values='mq',
                                           dtype={'numer_sta':str,'date':str},
                                          ).rename( columns={'numer_sta':'Station',
                                                              'date':'DateHeure',
                                                              'dd':'DirectionVent',
                                                              'ff':'VitesseVent',
                                                              't':'Temperature',
                                                              'u':'Humidite',
                                                              'vv':'Visibilite',
                                                              'pres':'Pression',
                                                              'rr1' :'Precipitation01',
                                                              'rr3' :'Precipitation03',
                                                              'rr6' :'Precipitation06',
                                                              'rr12':'Precipitation12',
                                                              'rr24':'Precipitation24'})],axis=0)
    meteo.reset_index().drop(columns='index',inplace=True)
    meteo["DateHeure"] = pd.to_datetime(meteo["DateHeure"], format='%Y%m%d%H%M%S')
    meteo.Temperature  = meteo.Temperature - 273.15
    meteo.Pression     = meteo.Pression / 100
    meteo.Visibilite   = meteo.Visibilite / 1000
    meteo['Mois']      = meteo.DateHeure.dt.month
    meteo['Annee']     = meteo.DateHeure.dt.year
    meteo['AnneeMois'] = meteo.DateHeure.dt.year*100 + meteo.DateHeure.dt.month
    meteo['Semaine']   = meteo.DateHeure.dt.isocalendar().week
    meteo['MoisJour']  = meteo.DateHeure.dt.month*100 + meteo.DateHeure.dt.day
    meteo['JourA']     = meteo.DateHeure.dt.dayofyear
    return meteo

In [25]:
_ = lectureFichiersMeteo(repertoire='../donnees/meteo')

exécution 3.53778200s
fonction lectureFichiersMeteo(repertoire='../donnees/meteo')


In [26]:
@tempsExecution
def lectureFichiersMeteo(repertoire='../donnees/meteo_test'):
    listeFichiers = []
    for dirname, dirnames, filenames in os.walk(repertoire):
        for filename in filenames:
            listeFichiers.append(os.path.join(dirname, filename))
        
    meteo = pd.concat([ pd.read_csv(fichier,
                           sep=';',
                           usecols=['numer_sta','date','dd','ff','t','u','vv',
                                    'pres','rr1','rr3','rr6','rr12','rr24'],
                           na_values='mq',
                           dtype={'numer_sta':str,'date':str},
                          ).rename( columns={'numer_sta':'Station',
                                              'date':'DateHeure',
                                              'dd':'DirectionVent',
                                              'ff':'VitesseVent',
                                              't':'Temperature',
                                              'u':'Humidite',
                                              'vv':'Visibilite',
                                              'pres':'Pression',
                                              'rr1' :'Precipitation01',
                                              'rr3' :'Precipitation03',
                                              'rr6' :'Precipitation06',
                                              'rr12':'Precipitation12',
                                              'rr24':'Precipitation24'}) 
                         for fichier in listeFichiers])
    meteo.reset_index().drop(columns='index',inplace=True)
    meteo["DateHeure"] = pd.to_datetime(meteo["DateHeure"], format='%Y%m%d%H%M%S')
    meteo.Temperature  = meteo.Temperature - 273.15
    meteo.Pression     = meteo.Pression / 100
    meteo.Visibilite   = meteo.Visibilite / 1000
    meteo['Mois']      = meteo.DateHeure.dt.month
    meteo['Annee']     = meteo.DateHeure.dt.year
    meteo['AnneeMois'] = meteo.DateHeure.dt.year*100 + meteo.DateHeure.dt.month
    meteo['Semaine']   = meteo.DateHeure.dt.isocalendar().week
    meteo['MoisJour']  = meteo.DateHeure.dt.month*100 + meteo.DateHeure.dt.day
    meteo['JourA']     = meteo.DateHeure.dt.dayofyear
    return meteo
    

In [27]:
_ = lectureFichiersMeteo(repertoire='../donnees/meteo')

exécution 2.84314090s
fonction lectureFichiersMeteo(repertoire='../donnees/meteo')


In [28]:
@tempsExecution
def lectureFichiersMeteo(repertoire='../donnees/meteo_test'):
    meteo = pd.concat([ pd.read_csv(fichier,
                           sep=';',
                           usecols=['numer_sta','date','dd','ff','t','u','vv',
                                    'pres','rr1','rr3','rr6','rr12','rr24'],
                           na_values='mq',
                           dtype={'numer_sta':str,'date':str},
                          ).rename( columns={'numer_sta':'Station',
                                              'date':'DateHeure',
                                              'dd':'DirectionVent',
                                              'ff':'VitesseVent',
                                              't':'Temperature',
                                              'u':'Humidite',
                                              'vv':'Visibilite',
                                              'pres':'Pression',
                                              'rr1' :'Precipitation01',
                                              'rr3' :'Precipitation03',
                                              'rr6' :'Precipitation06',
                                              'rr12':'Precipitation12',
                                              'rr24':'Precipitation24'}) 
                         for fichier in [ f'{dirname}{os.sep}{filename}' for dirname, dirnames, filenames in os.walk(repertoire) 
                                                                         for filename in filenames]])
    meteo.reset_index().drop(columns='index',inplace=True)
    meteo["DateHeure"] = pd.to_datetime(meteo["DateHeure"], format='%Y%m%d%H%M%S')
    meteo.Temperature  = meteo.Temperature - 273.15
    meteo.Pression     = meteo.Pression / 100
    meteo.Visibilite   = meteo.Visibilite / 1000
    meteo['Mois']      = meteo.DateHeure.dt.month
    meteo['Annee']     = meteo.DateHeure.dt.year
    meteo['AnneeMois'] = meteo.DateHeure.dt.year*100 + meteo.DateHeure.dt.month
    meteo['Semaine']   = meteo.DateHeure.dt.isocalendar().week
    meteo['MoisJour']  = meteo.DateHeure.dt.month*100 + meteo.DateHeure.dt.day
    meteo['JourA']     = meteo.DateHeure.dt.dayofyear
    return meteo
    

In [29]:
_ = lectureFichiersMeteo(repertoire='../donnees/meteo')

exécution 2.64951950s
fonction lectureFichiersMeteo(repertoire='../donnees/meteo')


## La lecture des fichiers d'un répértoire 

In [30]:
%%time
meteo = lectureFichiersMeteo(repertoire='../donnees/meteo')

exécution 2.80561460s
fonction lectureFichiersMeteo(repertoire='../donnees/meteo')
CPU times: total: 2.72 s
Wall time: 2.81 s


In [31]:
meteo.tail()

Unnamed: 0,Station,DateHeure,DirectionVent,VitesseVent,Temperature,Humidite,Visibilite,Pression,Precipitation01,Precipitation03,Precipitation06,Precipitation12,Precipitation24,Mois,Annee,AnneeMois,Semaine,MoisJour,JourA
14754,81401,2023-10-31 21:00:00,40.0,1.3,30.1,69.0,,1007.2,0.0,0.0,0.0,0.2,0.2,10,2023,202310,44,1031,304
14755,81405,2023-10-31 21:00:00,80.0,4.5,29.8,75.0,48.32,1007.5,0.0,0.0,0.0,0.0,0.4,10,2023,202310,44,1031,304
14756,81408,2023-10-31 21:00:00,60.0,3.0,30.5,67.0,,1006.9,0.0,0.0,0.0,0.0,0.0,10,2023,202310,44,1031,304
14757,81415,2023-10-31 21:00:00,50.0,2.8,32.5,57.0,,,0.0,0.0,0.0,0.0,0.0,10,2023,202310,44,1031,304
14758,89642,2023-10-31 21:00:00,90.0,3.1,-7.5,60.0,,949.0,,,,,,10,2023,202310,44,1031,304


In [32]:
meteo.shape

(1172478, 19)

In [33]:
postes = pd.read_csv('../donnees/postesSynop.csv',sep=';',dtype={'ID':str})
postes.Nom =postes.Nom.apply(lambda x : x if x in ['CLERMONT-FD','MONT-DE-MARSAN',
                                       'ST-PIERRE','ST-BARTHELEMY METEO'] 
                               else x[0:x.find('-')] 
                                    if x.find('-') != -1 else x).apply(lambda x : str(x).title())
postes.Altitude = postes.Altitude.astype('int16')
postes = postes[postes.ID < '08000']

In [34]:
postes.loc[postes.Latitude  < postes.Latitude.mean(),'Zone'] = 'S'
postes.loc[postes.Latitude  > postes.Latitude.mean(),'Zone'] = 'N'
postes.loc[postes.Longitude < postes.Longitude.mean(),'Zone'] += 'O'
postes.loc[postes.Longitude > postes.Longitude.mean(),'Zone'] += 'E'

In [37]:
postes.head()

Unnamed: 0,ID,Nom,Latitude,Longitude,Altitude,Zone
0,7005,Abbeville,50.136,1.834,69,NO
1,7015,Lille,50.57,3.0975,47,NE
2,7020,Pte De La Hague,49.725167,-1.939833,6,NO
3,7027,Caen,49.18,-0.456167,67,NO
4,7037,Rouen,49.383,1.181667,151,NO


In [38]:
meteo = postes.merge(meteo, how = "inner", 
                     left_on = "ID", right_on = "Station").drop(["ID","Station"], axis = "columns")

In [39]:
meteo['Precipitation'] =  meteo['Precipitation03'].combine_first(meteo['Precipitation06']/2)\
                                                  .combine_first(meteo['Precipitation12']/4)\
                                                  .combine_first(meteo['Precipitation24']/8)\
                                                  .combine_first(meteo['Precipitation01']*3)

In [40]:
meteo.drop(columns=['Precipitation06',
                    'Precipitation12',
                    'Precipitation24',
                    'Precipitation01',
                    'Precipitation03'], inplace=True)

In [42]:
meteo.to_parquet('../donnees/meteo.gzip',compression='gzip', engine='pyarrow')

In [43]:
!ls -al ../donnees/meteo.gzip

'ls' n’est pas reconnu en tant que commande interne
ou externe, un programme exécutable ou un fichier de commandes.


In [44]:
meteo.to_csv('../donnees/meteo.csv')

In [45]:
%%time
meteo = pd.read_parquet('../donnees/meteo.gzip', engine='pyarrow')
meteo.dtypes

CPU times: total: 328 ms
Wall time: 3.1 s


Nom                      object
Latitude                float64
Longitude               float64
Altitude                  int16
Zone                     object
DateHeure        datetime64[ns]
DirectionVent           float64
VitesseVent             float64
Temperature             float64
Humidite                float64
Visibilite              float64
Pression                float64
Mois                      int32
Annee                     int32
AnneeMois                 int32
Semaine                  UInt32
MoisJour                  int32
JourA                     int32
Precipitation           float64
dtype: object

In [None]:
%%time
pd.read_csv('../donnees/meteo.csv').dtypes

In [46]:
meteo.isna().sum()

Nom                  0
Latitude             0
Longitude            0
Altitude             0
Zone                 0
DateHeure            0
DirectionVent     2698
VitesseVent       2600
Temperature      11888
Humidite         12486
Visibilite       47161
Pression         10886
Mois                 0
Annee                0
AnneeMois            0
Semaine              0
MoisJour             0
JourA                0
Precipitation    13016
dtype: int64

In [47]:
meteo.head()

Unnamed: 0,Nom,Latitude,Longitude,Altitude,Zone,DateHeure,DirectionVent,VitesseVent,Temperature,Humidite,Visibilite,Pression,Mois,Annee,AnneeMois,Semaine,MoisJour,JourA,Precipitation
0,Abbeville,50.136,1.834,69,NO,2017-01-01 00:00:00,0.0,0.0,-3.9,96.0,1.74,1018.0,1,2017,201701,52,101,1,0.0
1,Abbeville,50.136,1.834,69,NO,2017-01-01 03:00:00,0.0,0.0,-5.1,94.0,7.36,1015.8,1,2017,201701,52,101,1,0.0
2,Abbeville,50.136,1.834,69,NO,2017-01-01 06:00:00,0.0,0.0,-4.1,96.0,3.5,1013.1,1,2017,201701,52,101,1,0.0
3,Abbeville,50.136,1.834,69,NO,2017-01-01 09:00:00,0.0,0.0,-2.2,97.0,4.0,1011.7,1,2017,201701,52,101,1,0.0
4,Abbeville,50.136,1.834,69,NO,2017-01-01 12:00:00,0.0,0.0,-0.9,98.0,4.0,1010.3,1,2017,201701,52,101,1,0.0


In [48]:
meteo.columns

Index(['Nom', 'Latitude', 'Longitude', 'Altitude', 'Zone', 'DateHeure',
       'DirectionVent', 'VitesseVent', 'Temperature', 'Humidite', 'Visibilite',
       'Pression', 'Mois', 'Annee', 'AnneeMois', 'Semaine', 'MoisJour',
       'JourA', 'Precipitation'],
      dtype='object')

# La temperature mensuelle

In [None]:
temperatures = meteo[['Nom', 'Latitude', 'Longitude', 'Altitude','Zone', 
                'Temperature','Mois']].pivot_table(index=['Nom', 'Latitude', 'Longitude', 'Altitude','Zone'],
                           columns='Mois')
temperatures.columns = ['janv.','févr.','mars','avr.','mai','juin','juill.','août','sept.','oct.','nov.','déc.']
temperatures.head()

In [None]:
temperatures.to_parquet('../donnees/temperatures.gzip',compression='gzip', engine='pyarrow')

In [None]:
temperatures.to_csv('../donnees/temperatures.csv')

In [None]:
%%time
temperatures = pd.read_parquet('../donnees/temperatures.gzip', engine='pyarrow')
temperatures.dtypes

In [None]:
temperatures.head()

# La météo mensuelle

In [None]:
meteoM = meteo[['Nom', 'Latitude', 'Longitude', 'Altitude','Zone', 
                'VitesseVent', 'Temperature', 'Humidite', 
                'Visibilite','Pression','Precipitation', 
                'Mois']].pivot_table(index=['Nom', 'Latitude', 'Longitude', 'Altitude','Zone'],
                           columns='Mois')
meteoM.columns = [f'{x[1]:02d}{x[0]}' for x in meteoM.columns]
meteoM.head()

In [None]:
meteoM.to_parquet('../donnees/meteoM.gzip',compression='gzip', engine='pyarrow')

In [None]:
%%time
meteoM = pd.read_parquet('../donnees/meteoM.gzip', engine='pyarrow')
meteoM.dtypes

In [None]:
meteoM.head()

# La météo 	hebdomadaire

In [None]:
meteoH = meteo[['Nom', 'Latitude', 'Longitude', 'Altitude','Zone', 
                'VitesseVent', 'Temperature', 'Humidite', 
                'Visibilite','Pression','Precipitation', 
                'Semaine']].pivot_table(index=['Nom', 'Latitude', 'Longitude', 'Altitude','Zone'],
                           columns='Semaine')
meteoH.columns = [f'{x[1]:02d}{x[0]}' for x in meteoH.columns]
meteoH.head()

In [None]:
meteoH.isna().sum()[meteoH.isna().sum() > 0]

In [None]:
meteoH.to_parquet('../donnees/meteoH.gzip',compression='gzip', engine='pyarrow')

In [None]:
%%time
meteoH = pd.read_parquet('../donnees/meteoH.gzip', engine='pyarrow')
meteoH.dtypes

In [None]:
meteoH.head()

# La météo 	journalière

In [None]:
meteoJ = meteo[['Nom', 'Latitude', 'Longitude', 'Altitude','Zone', 
                'VitesseVent', 'Temperature', 'Humidite', 
                'Visibilite','Pression','Precipitation', 
                'MoisJour']].pivot_table(index=['Nom', 'Latitude', 'Longitude', 'Altitude','Zone'],
                           columns='MoisJour')
meteoJ.columns = [f'{x[1]:04d}{x[0]}' for x in meteoJ.columns]
meteoJ.head()

In [None]:
meteoJ.to_parquet('../donnees/meteoJ.gzip',compression='gzip', engine='pyarrow')

In [None]:
%%time
meteoJ = pd.read_parquet('../donnees/meteoJ.gzip', engine='pyarrow')
meteoJ.dtypes

In [None]:
meteoJ.head()

In [None]:
meteoJ.isna().sum()[meteoJ.isna().sum() > 0]