In [11]:
import requests
import pandas as pd
from pandas import json_normalize
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from datetime import datetime, timedelta
import glob
import plotly.graph_objects as go
from meteofrance_api import MeteoFranceClient
pd.set_option('display.max_columns', None)

%matplotlib inline
# Set notebook mode to work in offline
pio.renderers.default = 'notebook'

## Données Méteos

In [2]:

file_path = "../coord_stations_exploitables.txt"

df = pd.read_csv(file_path, sep=',', names=['Latitude', 'Longitude'])

# Initialisation du client MeteoFrance
client = MeteoFranceClient()

def get_precipitation(lat, lon, client):
    try:
        obs = client.get_rain(lat, lon)
        forecast = obs.forecast
        total_rain = sum(item['rain'] for item in forecast)
        avg = total_rain / len(forecast)
        return forecast, forecast[0]['rain']        

    except Exception as e:
        print(f"Erreur lors de la récupération des données météorologiques : {e}")
        return [], 0

df['Forecast'], df['Precipitation'] = zip(
    *df.apply(lambda row: get_precipitation(row['Latitude'], row['Longitude'], client), axis=1))


In [3]:
df

Unnamed: 0,Latitude,Longitude,Forecast,Precipitation
0,47.564377,7.528982,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
1,48.221808,7.647875,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
2,48.296112,7.654202,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
3,48.592540,7.802871,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
4,47.488084,7.392949,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
...,...,...,...,...
2406,42.229485,9.437795,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
2407,42.103244,9.260356,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
2408,41.863776,9.370376,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1
2409,41.706231,9.333949,"[{'dt': 1701040200, 'rain': 1, 'desc': 'Temps ...",1


In [12]:
# Remplacez ceci par votre clé API réelle de Meteo Concept
api_key = 'a7be77d473eb61eaa1f17fc9f6b3a1797ae5f6d1cd030b8cf658ba2ab74f7ea5'

# Liste des points géographiques (latitude, longitude)
points = [
    (48.8566, 2.3522),  # Paris
    (43.6045, 1.4442),  # Toulouse
    (45.7640, 4.8357),  # Lyon
    # Ajoutez d'autres points au besoin
]

# Fonction pour récupérer la météo pour un point donné
def get_weather(lat, lon):
    url = f"https://api.meteo-concept.com/api/forecast/daily?token={api_key}&latlng={lat},{lon}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Récupérer la météo pour chaque point
weather_data = [get_weather(lat, lon) for lat, lon in points]

dfs = []
def add_to_dataframe(data):
    forecast_data = data['forecast']
    df = json_normalize(forecast_data)
    df['city_name'] = data['city']['name']  # Ajoute le nom de la ville
    dfs.append(df)

# Afficher les données météorologiques
for data in weather_data:
    if data:
        add_to_dataframe(data)        
    else:
        print("Erreur lors de la récupération des données météorologiques")

final_df = pd.concat(dfs, ignore_index=True)
final_df


Unnamed: 0,insee,cp,latitude,longitude,day,datetime,wind10m,gust10m,dirwind10m,rr10,rr1,probarain,weather,tmin,tmax,sun_hours,etp,probafrost,probafog,probawind70,probawind100,gustx,city_name
0,75107,75007,48.8561,2.3124,0,2023-12-04T01:00:00+0100,20,50,177,7.4,14.1,100,210,4,9,0,1,10,0,0,0,64,Paris 7e Arrondissement
1,75107,75007,48.8561,2.3124,1,2023-12-05T01:00:00+0100,20,51,242,5.8,11.0,100,40,6,7,1,0,0,0,0,0,67,Paris 7e Arrondissement
2,75107,75007,48.8561,2.3124,2,2023-12-06T01:00:00+0100,10,23,295,0.5,0.5,50,4,6,7,2,0,30,20,0,0,33,Paris 7e Arrondissement
3,75107,75007,48.8561,2.3124,3,2023-12-07T01:00:00+0100,20,32,183,2.5,2.5,60,10,4,8,2,0,30,0,10,0,32,Paris 7e Arrondissement
4,75107,75007,48.8561,2.3124,4,2023-12-08T01:00:00+0100,15,33,200,4.2,11.0,70,10,7,10,2,1,10,0,10,0,48,Paris 7e Arrondissement
5,75107,75007,48.8561,2.3124,5,2023-12-09T01:00:00+0100,15,29,209,3.4,7.2,60,41,6,9,2,0,10,0,10,0,42,Paris 7e Arrondissement
6,75107,75007,48.8561,2.3124,6,2023-12-10T01:00:00+0100,25,40,220,9.0,18.0,60,41,8,10,1,1,0,0,10,0,54,Paris 7e Arrondissement
7,75107,75007,48.8561,2.3124,7,2023-12-11T01:00:00+0100,20,36,219,7.2,16.2,60,41,8,10,2,1,0,0,0,0,51,Paris 7e Arrondissement
8,75107,75007,48.8561,2.3124,8,2023-12-12T01:00:00+0100,20,33,212,4.4,6.4,60,40,8,10,2,0,0,0,0,0,40,Paris 7e Arrondissement
9,75107,75007,48.8561,2.3124,9,2023-12-13T01:00:00+0100,15,29,204,7.2,7.2,60,10,7,9,2,0,10,0,0,0,39,Paris 7e Arrondissement


## Données Stations

In [14]:
url_stations = "http://hubeau.eaufrance.fr/api/v1/hydrometrie/referentiel/stations"

response_stations = requests.get(url_stations, params={"size": 6000})
if response_stations.status_code == '200' or '206': # status codes are detailed on the website
    data_stations = response_stations.json()
    data_stations = pd.DataFrame(data_stations["data"])
else:
    print(f"La requête a échoué avec le code d'état {response_stations.status_code}")

# We check the names of the columns that contain at least 50% of null values
data_stations.isnull().sum()

# We remove the columns that contain at least 50% of null values
data_stations = data_stations.dropna(axis=1, thresh=int(0.5*len(data_stations)))

# We remove every row that is not in metropolitan france
regions_in_metropolitan_france = ['GRAND EST', 'HAUTS-DE-FRANCE', 'BOURGOGNE-FRANCHE-COMTE', 'ILE-DE-FRANCE', 'CENTRE-VAL DE LOIRE', 'NORMANDIE', 'BRETAGNE', 'PAYS DE LA LOIRE', 'AUVERGNE-RHONE-ALPES', 'OCCITANIE', 'NOUVELLE-AQUITAINE', "PROVENCE-ALPES-COTE D'AZUR", 'CORSE']
data_stations = data_stations[data_stations["libelle_region"].isin(regions_in_metropolitan_france)]

# Next, we remove every row where the corresponding station is not active
data_stations = data_stations[data_stations["en_service"] == True]

# We remove every row where the longitude is between -15 and 40
data_stations = data_stations[(data_stations["longitude_station"] > -15) & (data_stations["longitude_station"] < 40)]

print("Il y a",len(data_stations),"stations en fonctionnement.")

Il y a 3462 stations en fonctionnement.


In [15]:
dossier = '../data_limites'

fichiers_csv = glob.glob(dossier + '/data*')

codes_sites_A = []
for fichier in fichiers_csv:
    codes_sites_A.append(fichier.split("_")[-1])

print("Il y a",len(codes_sites_A)," stations pour lesquelles on connait la moyenne des observations de débit des 25 dernières années.")

Il y a 2588  stations pour lesquelles on connait la moyenne des observations de débit des 25 dernières années.


In [16]:
# On récupère les seuils limites de hauteur

with open("../limites_H.txt",'r') as file:
    seuils_H = {}
    for line in file:
        cle, valeur = line.strip().split(':')        
        seuils_H[cle] = valeur

        
code_sites_H = list(seuils_H.keys())
print("Il y a",len(seuils_H),"stations pour lesquelles on a le seuil limite maximum (hauteur)")

Il y a 2778 stations pour lesquelles on a le seuil limite maximum (hauteur)


In [17]:
# On récupère les seuils limites de débit

with open("../limites_Q.txt",'r') as file:
    seuils_Q = {}
    for line in file:
        cle, valeur = line.strip().split(':')        
        seuils_Q[cle] = valeur

        
code_sites_Q = list(seuils_Q.keys())
print("Il y a",len(seuils_Q),"stations pour lesquelles on a le seuil limite maximum (débit)")

Il y a 2655 stations pour lesquelles on a le seuil limite maximum (débit)


In [18]:
data_stations_final = data_stations.loc[data_stations['code_station'].isin(codes_sites_A)] # Stations dont on a les observations passées
data_stations_final = data_stations_final.loc[data_stations_final['code_station'].isin(code_sites_H)] # Stations dont on a les seuils_H
data_stations_final = data_stations_final.loc[data_stations_final['code_station'].isin(code_sites_Q)] # Stations dont on a les seuils_Q
print("Il y a",len(data_stations_final)," stations exploitables")

Il y a 2407  stations exploitables


In [19]:
data_stations_final = data_stations_final[["code_site", "code_station", "longitude_station", "latitude_station", 
                                           "influence_locale_station", "code_commune_station", "code_departement", 
                                           "code_region", "code_cours_eau", "code_regime_station", 
                                           "qualification_donnees_station", "code_finalite_station"]]

In [25]:
data_1_out_of_5 = data_stations_final.iloc[::5,:]
fig = px.scatter_mapbox(data_1_out_of_5,lat='latitude_station',lon='longitude_station', zoom=4, center={'lat': 46.5, 'lon': 2.7274})
fig.update_layout(title='Stations exploitables', title_x=0.5, autosize=True, height=800,
                  mapbox_style='carto-positron')
fig.show()

#### On encode les variables catégoriques

In [14]:
from sklearn.preprocessing import LabelEncoder
# Créer une instance de LabelEncoder
label_encoder = LabelEncoder()

# Encoder la colonne 'code_site'
data_stations_final['code_site_encoded'] = label_encoder.fit_transform(data_stations_final['code_site'])

# Encoder la colonne 'code_station'
data_stations_final['code_station_encoded'] = label_encoder.fit_transform(data_stations_final['code_station'])

# Ecoder 'code_commune_station'
data_stations_final['code_cours_eau_encoded'] = label_encoder.fit_transform(data_stations_final['code_cours_eau'])

data_stations_final[['code_site','code_site_encoded']].head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,code_site,code_site_encoded
263,A0220200,0
275,A0410300,1
283,A0530742,2
288,A0610050,3
289,A1000030,4


In [15]:
data_stations_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5863
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site                      2411 non-null   object 
 1   code_station                   2411 non-null   object 
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2258 non-null   float64
 5   code_commune_station           2411 non-null   object 
 6   code_departement               2411 non-null   object 
 7   code_region                    2411 non-null   object 
 8   code_cours_eau                 2411 non-null   object 
 9   code_regime_station            2411 non-null   int64  
 10  qualification_donnees_station  2411 non-null   int64  
 11  code_finalite_station          2284 non-null   float64
 12  code_site_encoded              2411 non-null   int3

In [16]:
data_model_v0 = data_stations_final[["code_site_encoded", "code_station_encoded", "longitude_station", "latitude_station", 
                                           "influence_locale_station", "code_cours_eau_encoded",                                           
                                           "qualification_donnees_station", "code_finalite_station"]]

In [17]:
data_model_v0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5863
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site_encoded              2411 non-null   int32  
 1   code_station_encoded           2411 non-null   int32  
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2258 non-null   float64
 5   code_cours_eau_encoded         2411 non-null   int32  
 6   qualification_donnees_station  2411 non-null   int64  
 7   code_finalite_station          2284 non-null   float64
dtypes: float64(4), int32(3), int64(1)
memory usage: 141.3 KB


#### Pour combler les valeurs manquantes on applique un Random Forest Classifier

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

data_model_temp = data_model_v0.drop('code_finalite_station',axis=1).reset_index(drop=True)

nul_rows = data_model_temp[data_model_temp['influence_locale_station'].isna()]
other_rows = data_model_temp.drop(nul_rows.index)

X_train = other_rows.drop('influence_locale_station',axis=1).reset_index(drop=True)
y_train = other_rows['influence_locale_station']

X_test = nul_rows.drop('influence_locale_station',axis=1).reset_index(drop=True)

# Standardiser les caractéristiques
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Créer et entraîner le modèle
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Faire des prédictions et évaluer le modèle
y_pred = model.predict(X_test_scaled)


In [19]:
data_model_v1 = data_model_v0.copy()
data_model_v1.loc[data_model_v1['influence_locale_station'].isna(), 'influence_locale_station'] = y_pred

In [20]:

nul_rows = data_model_v1[data_model_v1['code_finalite_station'].isna()]
other_rows = data_model_v1.drop(nul_rows.index)

X_train = other_rows.drop('code_finalite_station',axis=1).reset_index(drop=True)
y_train = other_rows['code_finalite_station']

X_test = nul_rows.drop('code_finalite_station',axis=1).reset_index(drop=True)

# Standardiser les caractéristiques
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Créer et entraîner le modèle
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Faire des prédictions et évaluer le modèle
y_pred = model.predict(X_test_scaled)

In [21]:
data_model_v2 = data_model_v1.copy()
data_model_v2.loc[data_model_v2['code_finalite_station'].isna(), 'code_finalite_station'] = y_pred

In [22]:
data_model_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2411 entries, 263 to 5863
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   code_site_encoded              2411 non-null   int32  
 1   code_station_encoded           2411 non-null   int32  
 2   longitude_station              2411 non-null   float64
 3   latitude_station               2411 non-null   float64
 4   influence_locale_station       2411 non-null   float64
 5   code_cours_eau_encoded         2411 non-null   int32  
 6   qualification_donnees_station  2411 non-null   int64  
 7   code_finalite_station          2411 non-null   float64
dtypes: float64(4), int32(3), int64(1)
memory usage: 205.8 KB


### Données Hydrométriques

In [45]:
date = datetime.now() - timedelta(hours=12)
date = date.strftime("%Y-%m-%dT%H:%M:%S")
date

'2023-11-26T12:26:29'

In [44]:
date2 = datetime(2023, 11, 26, 12, 0, 0)
date2 = date2.strftime("%Y-%m-%dT%H:%M:%S")
date2

'2023-11-27T01:00:00'

In [64]:
import json
import os

stations = data_stations_final['code_station'].tolist()
url = "http://hubeau.eaufrance.fr/api/v1/hydrometrie/observations_tr"

date = datetime.now() - timedelta(hours=12)
date = date.strftime("%Y-%m-%dT%H:%M:%S")

date2 = datetime.now()
date2 = date2.replace(year=date2.year - 1)
date2 = date2.strftime("%Y-%m-%dT%H:%M:%S")

nombre_requetes = 0

for i in range(0, len(stations), 25):
    params = {
        "code_entite": stations[i:i+25],
        "date_debut_obs": date2,
        "grandeur_hydro": ['Q', 'H'],
        "size": 20000
    }

    response = requests.get(url, params=params)
    nombre_requetes += 1
    print(f"Requete {nombre_requetes} : status code:", response.status_code)

    if response.status_code in [200, 206]:
        data = response.json()['data']

        for entry in data:
            # Adjust the timestamp to remove 'Z' if present
            timestamp_str = entry['date_obs'].rstrip('Z')
            
            # Parse the timestamp from the entry to get the year and month
            timestamp = datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S")
            year = timestamp.year
            month = timestamp.month

            # Create directory for the year if it doesn't exist
            year_dir = f'./data/{year}'
            if not os.path.exists(year_dir):
                os.makedirs(year_dir)

            # Write data to the corresponding file
            file_path = f'{year_dir}/{month}.json'
            with open(file_path, 'a') as file:
                json.dump(entry, file)
                file.write('\n')  # Add a newline to separate entries
    else:
        print(f"La requête {nombre_requetes} a échoué avec le code d'état {response.status_code}")


Requete 1 : status code: 400
La requête 1 a échoué avec le code d'état 400
Requete 2 : status code: 400
La requête 2 a échoué avec le code d'état 400
Requete 3 : status code: 400
La requête 3 a échoué avec le code d'état 400
Requete 4 : status code: 400
La requête 4 a échoué avec le code d'état 400
Requete 5 : status code: 400
La requête 5 a échoué avec le code d'état 400
Requete 6 : status code: 400
La requête 6 a échoué avec le code d'état 400
Requete 7 : status code: 400
La requête 7 a échoué avec le code d'état 400
Requete 8 : status code: 400
La requête 8 a échoué avec le code d'état 400
Requete 9 : status code: 400
La requête 9 a échoué avec le code d'état 400
Requete 10 : status code: 400
La requête 10 a échoué avec le code d'état 400
Requete 11 : status code: 400
La requête 11 a échoué avec le code d'état 400


KeyboardInterrupt: 