In [3]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from datetime import datetime, timedelta
import glob
import plotly.graph_objects as go
from meteofrance_api import MeteoFranceClient

# Render matplotlib figures directly in the notebook output
%matplotlib inline
# Default Plotly renderer: the combined 'iframe+pdf' renderer
# (intended to let figures display offline)
pio.renderers.default = 'iframe+pdf'

## Données météo

In [1]:

# Text file holding one "latitude,longitude" pair per line; column names are
# supplied explicitly via `names`.
file_path = "../coord_stations_exploitables.txt"
df = pd.read_csv(file_path, names=['Latitude', 'Longitude'], sep=',')

# Single MeteoFrance client reused for every forecast request
client = MeteoFranceClient()

def get_precipitation(lat, lon, client):
    """Fetch the rain forecast for one location.

    Parameters
    ----------
    lat, lon : coordinates forwarded unchanged to ``client.get_rain``.
    client : object exposing ``get_rain(lat, lon)`` whose result has a
        ``forecast`` attribute (a sequence of dicts with a ``'rain'`` key).

    Returns
    -------
    tuple
        ``(forecast, first_rain)`` where ``first_rain`` is the ``'rain'`` value
        of the first forecast entry. ``([], 0)`` is returned when the forecast
        is empty/missing or when any error occurs while fetching it.
    """
    try:
        forecast = client.get_rain(lat, lon).forecast
        if not forecast:
            # Previously an empty forecast only reached the fallback through a
            # ZeroDivisionError raised by an average that was never used.
            print(f"Erreur lors de la récupération des données météorologiques : aucune prévision pour ({lat}, {lon})")
            return [], 0
        return forecast, forecast[0]['rain']

    except Exception as e:
        # Best-effort: one failing station must not abort the whole batch.
        print(f"Erreur lors de la récupération des données météorologiques : {e}")
        return [], 0

# One (forecast, first rain value) pair per station, in row order
previsions_par_station = [
    get_precipitation(latitude, longitude, client)
    for latitude, longitude in zip(df['Latitude'], df['Longitude'])
]
# Transpose the list of pairs into the two new columns
df['Forecast'], df['Precipitation'] = zip(*previsions_par_station)


In [2]:
# Display the stations together with their forecast and first rain value
df

Unnamed: 0,Latitude,Longitude,Forecast,Precipitation
0,47.564377,7.528982,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
1,48.221808,7.647875,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
2,48.296112,7.654202,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
3,48.592540,7.802871,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
4,47.488084,7.392949,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
...,...,...,...,...
2406,42.229485,9.437795,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
2407,42.103244,9.260356,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
2408,41.863776,9.370376,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1
2409,41.706231,9.333949,"[{'dt': 1700840400, 'rain': 1, 'desc': 'Temps ...",1


## Données Stations

In [4]:
url_stations = "http://hubeau.eaufrance.fr/api/v1/hydrometrie/referentiel/stations"

response_stations = requests.get(url_stations, params={"size": 6000})
# `status_code` is an int. The former test `== '200' or '206'` was always true
# (a non-empty string is truthy), so failures were never detected.
if response_stations.status_code in (200, 206): # status codes are detailed on the website
    data_stations = response_stations.json()
    data_stations = pd.DataFrame(data_stations["data"])
else:
    # Nothing below can run without the stations, so fail loudly here instead
    # of hitting a NameError on `data_stations` further down.
    raise RuntimeError(f"La requête a échoué avec le code d'état {response_stations.status_code}")

# Number of null values per column (not displayed: it is not the last
# expression of the cell)
data_stations.isnull().sum()

# Keep only the columns that have at least 50% of non-null values
data_stations = data_stations.dropna(axis=1, thresh=int(0.5*len(data_stations)))

# Keep only the rows located in metropolitan France
regions_in_metropolitan_france = ['GRAND EST', 'HAUTS-DE-FRANCE', 'BOURGOGNE-FRANCHE-COMTE', 'ILE-DE-FRANCE', 'CENTRE-VAL DE LOIRE', 'NORMANDIE', 'BRETAGNE', 'PAYS DE LA LOIRE', 'AUVERGNE-RHONE-ALPES', 'OCCITANIE', 'NOUVELLE-AQUITAINE', "PROVENCE-ALPES-COTE D'AZUR", 'CORSE']
data_stations = data_stations[data_stations["libelle_region"].isin(regions_in_metropolitan_france)]

# Keep only the stations that are currently in service
data_stations = data_stations[data_stations["en_service"] == True]

# Keep only the rows whose longitude is strictly between -15 and 40
data_stations = data_stations[(data_stations["longitude_station"] > -15) & (data_stations["longitude_station"] < 40)]

print("Il y a",len(data_stations),"stations en fonctionnement.")

Il y a 3466 stations en fonctionnement.


In [5]:
# Folder holding one "data*" file per station with known historical averages
dossier = '../data_limites'
fichiers_csv = glob.glob(dossier + '/data*')

# The station code is taken as whatever follows the last underscore of the path
codes_sites_A = [fichier.split("_")[-1] for fichier in fichiers_csv]

print("Il y a",len(codes_sites_A)," stations pour lesquelles on connait la moyenne des observations de débit des 25 dernières années.")

Il y a 2588  stations pour lesquelles on connait la moyenne des observations de débit des 25 dernières années.


In [6]:
# Load the maximum height thresholds: one "key:value" entry per line.
# Keys are later matched against station codes; values are kept as strings.
seuils_H = {}
with open("../limites_H.txt",'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # used to crash the unpacking below with a ValueError.
            continue
        cle, valeur = line.split(':')
        seuils_H[cle] = valeur

code_sites_H = list(seuils_H.keys())
print("Il y a",len(seuils_H),"stations pour lesquelles on a le seuil limite maximum (hauteur)")

Il y a 2778 stations pour lesquelles on a le seuil limite maximum (hauteur)


In [7]:
# Load the maximum flow thresholds: one "key:value" entry per line.
# Keys are later matched against station codes; values are kept as strings.
seuils_Q = {}
with open("../limites_Q.txt",'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # used to crash the unpacking below with a ValueError.
            continue
        cle, valeur = line.split(':')
        seuils_Q[cle] = valeur

code_sites_Q = list(seuils_Q.keys())
print("Il y a",len(seuils_Q),"stations pour lesquelles on a le seuil limite maximum (débit)")

Il y a 2655 stations pour lesquelles on a le seuil limite maximum (débit)


In [119]:
# A station is usable only when it appears in all three reference sets
codes_stations = data_stations['code_station']
est_exploitable = (
    codes_stations.isin(codes_sites_A)    # past observations available
    & codes_stations.isin(code_sites_H)   # height threshold available
    & codes_stations.isin(code_sites_Q)   # flow threshold available
)
data_stations_final = data_stations.loc[est_exploitable]
print("Il y a",len(data_stations_final)," stations exploitables")

Il y a 2411  stations exploitables


In [120]:
# Keep only the descriptive columns used downstream. The explicit .copy()
# makes the result an independent frame, so that adding columns to it later
# does not raise SettingWithCopyWarning.
data_stations_final = data_stations_final[["code_site", "code_station", "longitude_station", "latitude_station", 
                                           "influence_locale_station", "code_commune_station", "code_departement", 
                                           "code_region", "code_cours_eau", "code_regime_station", 
                                           "qualification_donnees_station", "code_finalite_station"]].copy()

#### On encode les variables catégorielles

In [121]:
from sklearn.preprocessing import LabelEncoder
# A single LabelEncoder instance, refitted on each column in turn
label_encoder = LabelEncoder()

# Source column -> name of the integer-encoded column to create
colonnes_a_encoder = {
    'code_site': 'code_site_encoded',
    'code_station': 'code_station_encoded',
    'code_cours_eau': 'code_cours_eau_encoded',
}
for colonne_source, colonne_encodee in colonnes_a_encoder.items():
    data_stations_final[colonne_encodee] = label_encoder.fit_transform(data_stations_final[colonne_source])

# Quick visual check of the encoding on the first rows
data_stations_final[['code_site','code_site_encoded']].head()

Unnamed: 0,code_site,code_site_encoded
263,A0220200,0
275,A0410300,1
283,A0530742,2
288,A0610050,3
289,A1000030,4


In [122]:
# Column dtypes and non-null counts after encoding
data_stations_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5862
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site                      2411 non-null   object 
 1   code_station                   2411 non-null   object 
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2258 non-null   float64
 5   code_commune_station           2411 non-null   object 
 6   code_departement               2411 non-null   object 
 7   code_region                    2411 non-null   object 
 8   code_cours_eau                 2411 non-null   object 
 9   code_regime_station            2411 non-null   int64  
 10  qualification_donnees_station  2411 non-null   int64  
 11  code_finalite_station          2284 non-null   float64
 12  code_site_encoded              2411 non-null   int3

In [123]:
# Numeric columns kept as the modelling table
colonnes_modele = [
    "code_site_encoded",
    "code_station_encoded",
    "longitude_station",
    "latitude_station",
    "influence_locale_station",
    "code_cours_eau_encoded",
    "qualification_donnees_station",
    "code_finalite_station",
]
data_model_v0 = data_stations_final[colonnes_modele]

In [124]:
# Check which modelling columns still contain missing values
data_model_v0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5862
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site_encoded              2411 non-null   int32  
 1   code_station_encoded           2411 non-null   int32  
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2258 non-null   float64
 5   code_cours_eau_encoded         2411 non-null   int32  
 6   qualification_donnees_station  2411 non-null   int64  
 7   code_finalite_station          2284 non-null   float64
dtypes: float64(4), int32(3), int64(1)
memory usage: 141.3 KB


#### Pour combler les valeurs manquantes, on applique un Random Forest Classifier

In [127]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Fill the missing 'influence_locale_station' values with a Random Forest
# classifier trained on the rows where the value is known.
# 'code_finalite_station' is excluded from the features.
data_model_temp = data_model_v0.drop(columns='code_finalite_station').reset_index(drop=True)

cible_manquante = data_model_temp['influence_locale_station'].isna()
nul_rows = data_model_temp[cible_manquante]       # rows to impute
other_rows = data_model_temp[~cible_manquante]    # rows with a known target

X_train = other_rows.drop(columns='influence_locale_station').reset_index(drop=True)
y_train = other_rows['influence_locale_station']
X_test = nul_rows.drop(columns='influence_locale_station').reset_index(drop=True)

# Standardise the features; the statistics are learnt on the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the classifier, then predict the missing targets (no evaluation is done)
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)


In [128]:
# Work on a copy, then write the predictions into the rows that were missing
data_model_v1 = data_model_v0.copy()
influence_manquante = data_model_v1['influence_locale_station'].isna()
data_model_v1.loc[influence_manquante, 'influence_locale_station'] = y_pred

In [135]:

# Same imputation approach, this time for 'code_finalite_station'; every
# other column of data_model_v1 is used as a feature.
finalite_manquante = data_model_v1['code_finalite_station'].isna()
nul_rows = data_model_v1[finalite_manquante]       # rows to impute
other_rows = data_model_v1[~finalite_manquante]    # rows with a known target

X_train = other_rows.drop(columns='code_finalite_station').reset_index(drop=True)
y_train = other_rows['code_finalite_station']
X_test = nul_rows.drop(columns='code_finalite_station').reset_index(drop=True)

# Standardise the features; the statistics are learnt on the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the classifier, then predict the missing targets (no evaluation is done)
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [131]:
# Work on a copy, then write the predictions into the rows that were missing
data_model_v2 = data_model_v1.copy()
finalite_a_remplir = data_model_v2['code_finalite_station'].isna()
data_model_v2.loc[finalite_a_remplir, 'code_finalite_station'] = y_pred

In [136]:
# Confirm that no missing values remain after both imputations
data_model_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5862
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site_encoded              2411 non-null   int32  
 1   code_station_encoded           2411 non-null   int32  
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2411 non-null   float64
 5   code_cours_eau_encoded         2411 non-null   int32  
 6   qualification_donnees_station  2411 non-null   int64  
 7   code_finalite_station          2411 non-null   float64
dtypes: float64(4), int32(3), int64(1)
memory usage: 205.8 KB


### Données Hydrométriques

In [145]:
import json

url = "http://hubeau.eaufrance.fr/api/v1/hydrometrie/observations_tr"
stations = data_stations_final['code_station'].tolist()

# NOTE: `date2` is not used by the requests below; only observations from the
# last 12 hours are queried (via `date`).
date2 = datetime(2018, 11, 24, 0, 0, 0)
date = datetime.now()-timedelta(hours=12)
date = date.strftime("%Y-%m-%dT%H:%M:%S")

data_hydro = pd.DataFrame()
nombre_requetes = 0

# Load the observations already saved on disk once, instead of re-parsing the
# whole file for every batch. A missing or unreadable file starts a new history
# (opening with 'r+' used to raise FileNotFoundError on the first run).
try:
    with open('historic_data_hydro.json', 'r') as file:
        file_data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    file_data = []

# Stations are queried in batches of 75 codes per request
for i in range(0,len(stations),75):
    params = {
        "code_entite": stations[i:i+75], # station code    
        "date_debut_obs": date,        
        "grandeur_hydro": ['Q','H'], # hydrometric variables choosen
        "size": 20000        
    }
    
    response = requests.get(url, params=params)

    nombre_requetes+=1
    print(f"Requete {nombre_requetes} : status code:",response.status_code)
    
    # `(200 or 206)` evaluates to 200, so 206 responses used to be rejected.
    # NOTE(review): 206 presumably means more pages are available and they
    # are not fetched here — confirm against the API documentation.
    if response.status_code in (200, 206):
        data = response.json()['data']
        file_data.extend(data)

        # Rewrite the file after each successful batch so that an interrupted
        # run keeps what was fetched; 'w' truncates, so no stale bytes remain.
        with open('historic_data_hydro.json', 'w') as file:
            json.dump(file_data, file, indent=4)
            
    else:
        print(f"La requête {nombre_requetes} a échoué avec le code d'état {response.status_code}")

Requete 1 : status code: 200
Requete 2 : status code: 200


KeyboardInterrupt: 