#### The purpose of this notebook is to extract IDs, metadata, and simplified info (such as locations) for currently active hydrology stations.

In [2]:
from dotenv import load_dotenv
import pandas as pd
from pandas.errors import EmptyDataError
import numpy as np
import matplotlib.pyplot as plt

from service.s3_storage_service import S3StorageService
from storage.s3_client import S3Client
import requests

load_dotenv() # Load environment variable with the bucket name. 

s3_client = S3Client().get_client()
s3_service = S3StorageService(s3_client)

In [3]:

# Root URL of the Hydrology API used by the helpers below. Resource names are
# appended directly (e.g. f"{base_url}stations"), so the trailing slash matters.
base_url = "http://environment.data.gov.uk/hydrology/id/"

def get_active_stations(observed_property):
    """
    Fetch the listing of active monitoring stations for one observed property.

    Parameters:
    - observed_property (str): Value sent as the `observedProperty` filter
      (e.g. 'waterFlow' or 'waterLevel').

    Returns:
    - The decoded JSON body of the response. The HTTP status is not checked here.
    """
    query = {
        'status.label': 'Active',
        'observedProperty': observed_property,
        '_limit': 10000,  # Raise this if the number of stations ever exceeds it
    }
    return requests.get(f"{base_url}stations", params=query).json()


def get_station_measures_by_guid(station_guid):
    """
    Fetch the measures published for a single station.

    Parameters:
    - station_guid (str): Station GUID sent as the `station` query parameter.

    Returns:
    - The decoded JSON body of the response. The HTTP status is not checked here.
    """
    response = requests.get(f"{base_url}measures", params={'station': station_guid})
    return response.json()

In [4]:
def get_active_stations_count(observed_property):
    """
    Count active stations for an observed property, split by whether they carry an NRFA ID.

    Parameters:
    - observed_property (str): Value sent as the `observedProperty` filter.

    Returns:
    - dict: Keys 'total_stations', 'stations_with_nrfa' and 'stations_without_nrfa'
      when the request returns HTTP 200.
    - str: "Error: <status code>" for any other status.
    """
    response = requests.get(
        f"{base_url}stations",
        params={
            'status.label': 'Active',
            'observedProperty': observed_property,
            '_limit': 10000,  # Assuming a high limit to fetch all relevant stations
        },
    )
    if response.status_code != 200:
        return f"Error: {response.status_code}"

    stations = response.json()['items']
    # A station counts as "with NRFA" when the 'nrfaStationID' key is present at all.
    with_nrfa = len([station for station in stations if 'nrfaStationID' in station])
    return {
        "total_stations": len(stations),
        "stations_with_nrfa": with_nrfa,
        "stations_without_nrfa": len(stations) - with_nrfa,
    }


# Sanity check: count active stations for one observed property and print the
# with/without-NRFA split returned by get_active_stations_count.
observed_property = "waterLevel"  # or "waterFlow", etc.
station_counts = get_active_stations_count(observed_property)
print(station_counts)


{'total_stations': 2654, 'stations_with_nrfa': 672, 'stations_without_nrfa': 1982}


In [5]:
def get_station_info(id):
    """
    Fetch the record of a single station.

    Parameters:
    - id (str): Station identifier appended to the `stations/` endpoint.

    Returns:
    - The decoded JSON body when the request returns HTTP 200.
    - str: "Error: <status code>" for any other status, so callers must check
      the type of the result before indexing into it.
    """
    response = requests.get(f"{base_url}stations/{id}")
    return response.json() if response.status_code == 200 else f"Error: {response.status_code}"

In [6]:
def create_id_map(data):
    """
    Reduce a station listing to a list of ID records.

    Parameters:
    - data (dict): Station listing with an 'items' list of station dicts.

    Returns:
    - list: One {'guid', 'nrfa', 'wiski'} dict per station whose 'stationGuid' is a
      plain string. Missing NRFA / Wiski IDs are recorded as 'N/A'. Stations whose
      'stationGuid' is absent or not a string (e.g. a list) are printed and skipped.
    """
    id_records = []
    for station in data['items']:
        if 'stationGuid' not in station or not isinstance(station['stationGuid'], str):
            print(f"Skipping item due to non-standard 'stationGuid': {station}")
            continue
        id_records.append({
            "guid": station['stationGuid'],
            "nrfa": station.get('nrfaStationID', 'N/A'),
            "wiski": station.get('wiskiID', 'N/A'),
        })
    return id_records



def get_guid_by_nrfa_id(nrfa_id, mapping):
    """
    Look up the GUID recorded against an NRFA ID.

    Parameters:
    - nrfa_id (str | int): The NRFA ID to search for; it is converted to a string
      before comparison.
    - mapping (list): ID records, each a dict with at least 'guid' and 'nrfa' keys.

    Returns:
    - str: The GUID of the first record whose 'nrfa' equals the ID, or None if
      no record matches.
    """
    wanted = str(nrfa_id)
    return next((record['guid'] for record in mapping if record['nrfa'] == wanted), None)

In [7]:
# Same count, this time for stations that measure water flow.
observed_property = "waterFlow" 
station_counts_water_flow = get_active_stations_count(observed_property)
print(station_counts_water_flow)

{'total_stations': 1077, 'stations_with_nrfa': 796, 'stations_without_nrfa': 281}


In [8]:
# Same count for rainfall stations.
# NOTE(review): the result is stored in `station_counts_water_flow` although it
# holds rainfall counts here; the variable name is carried over from the water-flow cell.
observed_property = "rainfall"
station_counts_water_flow = get_active_stations_count(observed_property)
print(station_counts_water_flow)

{'total_stations': 970, 'stations_with_nrfa': 0, 'stations_without_nrfa': 970}


In [9]:
# Same count for groundwater-level stations.
# NOTE(review): the result is stored in `station_counts_water_flow` although it
# holds groundwater counts here; the variable name is carried over from the water-flow cell.
observed_property = "groundwaterLevel"
station_counts_water_flow = get_active_stations_count(observed_property)
print(station_counts_water_flow)

{'total_stations': 3387, 'stations_with_nrfa': 0, 'stations_without_nrfa': 3387}


In [11]:
import json
# Fetch the active-station listing for each observed property and reduce each
# listing to its list of {guid, nrfa, wiski} ID records.
active_stations_wl = get_active_stations('waterLevel')
station_ids_wl = create_id_map(active_stations_wl)

active_stations_flow = get_active_stations('waterFlow')
station_ids_wf = create_id_map(active_stations_flow)

active_stations_gw = get_active_stations('groundwaterLevel')
station_ids_gl = create_id_map(active_stations_gw)

active_stations_rf = get_active_stations('rainfall')
# Fixed: this used to map active_stations_gw, which made the rainfall ID list a
# copy of the groundwater one and skewed every rainfall intersection below.
station_ids_rf = create_id_map(active_stations_rf)

Skipping item due to non-standard 'stationGuid': {'@id': 'http://environment.data.gov.uk/hydrology/id/stations/6c6c174e-519f-4b02-9e3e-1737f2139417_445508', 'label': 'Loudsmill Combined', 'notation': '6c6c174e-519f-4b02-9e3e-1737f2139417_445508', 'easting': 370810, 'northing': 90350, 'lat': 50.712041, 'long': -2.414784, 'type': [{'@id': 'http://environment.data.gov.uk/flood-monitoring/def/core/Station'}, {'@id': 'http://environment.data.gov.uk/flood-monitoring/def/core/RiverStation'}, {'@id': 'http://environment.data.gov.uk/reference/def/core/SamplingLocation'}, {'@id': 'http://environment.data.gov.uk/flood-monitoring/def/core/RiverFlow'}, {'@id': 'http://environment.data.gov.uk/flood-monitoring/def/core/RiverLevel'}], 'riverName': 'River Frome', 'stationGuid': ['6c6c174e-519f-4b02-9e3e-1737f2139417_445508', '6c6c174e-519f-4b02-9e3e-1737f2139417'], 'wiskiID': '445508', 'RLOIid': '3277', 'rloiStationLink': {'@id': 'https://check-for-flooding.service.gov.uk/station/3277'}, 'dateOpened': 

In [64]:
# NOTE(review): this prints the entire rainfall response (meta block plus every
# station), which produces a very large cell output.
print(active_stations_rf)
# Save the raw rainfall and groundwater listings through the S3 storage service.
s3_service.save_json_to_s3('flood_stations_metadata/rainfall_stations.json', active_stations_rf)
s3_service.save_json_to_s3('flood_stations_metadata/groundwater_station.json', active_stations_gw)

# Debugging checks on the water-level ID list produced by create_id_map.
print(station_ids_wl[:5])  # Show the first five ID records
print(type(station_ids_wl))  # Confirm it's a list
print(type(station_ids_wl[0]))  # Confirm the first item is a dictionary

# Walk the whole list and report any entry that is not a dict.
for station in station_ids_wl:
    if not isinstance(station, dict):
        print(f"Non-dict item found: {station}")  # Identify any non-dict items
    else:
        print(station.get('guid'))  # Otherwise, print the 'guid' to confirm access


{'meta': {'@id': 'http://environment.data.gov.uk/hydrology/id/stations?status.label=Active&observedProperty=rainfall&_limit=10000', 'publisher': 'Environment Agency', 'license': 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/', 'licenseName': 'OGL 3', 'comment': 'Hydrology API for sub-daily data', 'version': '2.0.0', 'hasFormat': ['http://environment.data.gov.uk/hydrology/id/stations.ttl?_limit=10000&observedProperty=rainfall&status.label=Active', 'http://environment.data.gov.uk/hydrology/id/stations.rdf?_limit=10000&observedProperty=rainfall&status.label=Active', 'http://environment.data.gov.uk/hydrology/id/stations.geojson?_limit=10000&observedProperty=rainfall&status.label=Active', 'http://environment.data.gov.uk/hydrology/id/stations.html?_limit=10000&observedProperty=rainfall&status.label=Active', 'http://environment.data.gov.uk/hydrology/id/stations.json?_limit=10000&observedProperty=rainfall&status.label=Active', 'http://environment.data.gov.uk/hydrolo

In [14]:


# Reduce each ID list to the set of station GUIDs it contains
guids_wl = set(record['guid'] for record in station_ids_wl)
guids_wf = set(record['guid'] for record in station_ids_wf)
guids_gl = set(record['guid'] for record in station_ids_gl)
guids_rf = set(record['guid'] for record in station_ids_rf)

In [15]:
# Pairwise overlap between water-level stations and each other category
common_wl_wf = guids_wl & guids_wf
common_wl_gl = guids_wl & guids_gl
common_wl_rf = guids_wl & guids_rf

print(f"Number of stations with both water level and water flow data: {len(common_wl_wf)}")
print(f"Number of stations with both water level and groundwater level data: {len(common_wl_gl)}")
print(f"Number of stations with both water level and rainfall data: {len(common_wl_rf)}")

Number of stations with both water level and water flow data: 733
Number of stations with both water level and groundwater level data: 7
Number of stations with both water level and rainfall data: 45


In [16]:
# Three-way overlap: water level together with each pair of the other categories
common_wl_wf_gl = guids_wl & guids_wf & guids_gl
common_wl_wf_rf = guids_wl & guids_wf & guids_rf
common_wl_gl_rf = guids_wl & guids_gl & guids_rf

print(f"Number of stations with water level, water flow, and groundwater level data: {len(common_wl_wf_gl)}")
print(f"Number of stations with water level, water flow, and rainfall data: {len(common_wl_wf_rf)}")
print(f"Number of stations with water level, groundwater level, and rainfall data: {len(common_wl_gl_rf)}")

Number of stations with water level, water flow, and groundwater level data: 5
Number of stations with water level, water flow, and rainfall data: 22
Number of stations with water level, groundwater level, and rainfall data: 0


In [17]:
# Stations present in all four categories at once
common_guids_all = guids_wl & guids_wf & guids_gl & guids_rf

print(f"Number of stations with water level, water flow, groundwater level, and rainfall data: {len(common_guids_all)}")


Number of stations with water level, water flow, groundwater level, and rainfall data: 0


### Extract detailed and simplified metadata for each station and save it to the S3 bucket
#### The most important metadata to extract are the latitude, the longitude, and the typical minimum and maximum water levels, which give the lower and upper bounds of the normal water-level range. However, not every API call returns this information, so we calculate the lower/upper bounds ourselves when necessary.

In [18]:
def generate_csv_urls(measures_data):
    """
    Build the readings-CSV URLs for a station's daily min/max level measures.

    Parameters:
    - measures_data (dict): Measures listing with an 'items' list of measure dicts.

    Returns:
    - list: (csv_url, column_name) tuples, one per measure whose 'period' is 86400,
      whose 'parameterName' contains 'level' (case-insensitive) and whose
      'valueType' contains 'min' or 'max'. column_name is 'l' followed by the
      lower-cased value type, e.g. 'lmin' or 'lmax'.
    """
    selected = []
    for measure in measures_data['items']:
        if measure.get('period') != 86400:  # keep daily series only
            continue
        if 'level' not in measure.get('parameterName', '').lower():
            continue
        value_type = measure.get('valueType', '').lower()
        if not ('min' in value_type or 'max' in value_type):
            continue
        selected.append((f"{measure['@id']}/readings.csv", f"l{value_type}"))
    return selected


In [19]:
def load_csv_data_to_dataframe(csv_urls):
    """
    Read several readings CSVs and combine them into one date-indexed DataFrame.

    Parameters:
    - csv_urls (list): (url, column_name) tuples; each CSV must provide 'date'
      and 'value' columns. 'value' is renamed to column_name (e.g. 'lmin').

    Returns:
    - pandas.DataFrame: Outer join of all readable CSVs on 'date', sorted by date
      and indexed by it; or None when no CSV could be read.

    Only pandas' EmptyDataError is handled (the source is reported and skipped);
    any other read error propagates to the caller.
    """
    frames = []
    for source, column_name in csv_urls:
        try:
            readings = pd.read_csv(source, usecols=['date', 'value'])
        except pd.errors.EmptyDataError:
            print(f"Skipping {source} as it appears to be empty or inaccessible.")
            continue
        else:
            readings = readings.rename(columns={'value': column_name})
            readings['date'] = pd.to_datetime(readings['date'])
            frames.append(readings)

    if len(frames) == 0:  # nothing usable for this station
        return None

    # Outer join keeps dates that appear in only some of the series.
    merged = frames[0]
    for extra in frames[1:]:
        merged = merged.merge(extra, on='date', how='outer')

    return merged.sort_values(by='date').reset_index(drop=True).set_index('date')


In [20]:
def calculate_typical_ranges(station_guid, lower_percentile=10, upper_percentile=90):
    """
    Estimate a station's typical water-level range from its daily min/max readings.

    The station's measures are fetched, the daily min/max level CSVs are loaded,
    and the bounds are taken as percentiles of those series.

    Parameters:
    - station_guid (str): GUID of the station whose readings are analysed.
    - lower_percentile (float): Percentile of the daily minimum ('lmin') series
      used as the lower bound. Defaults to 10.
    - upper_percentile (float): Percentile of the daily maximum ('lmax') series
      used as the upper bound. Defaults to 90, the symmetric counterpart of the
      lower bound (the percentile argument was previously missing, so the call
      raised TypeError).

    Returns:
    - tuple: (lower_bound, upper_bound) as floats, or (None, None) when no
      readings could be loaded, when either series is missing, or when no date
      has both a minimum and a maximum value.
    """
    station_measures = get_station_measures_by_guid(station_guid)
    csv_urls = generate_csv_urls(station_measures)
    daily_data = load_csv_data_to_dataframe(csv_urls)
    if daily_data is None:
        return None, None

    # Both series are required; a station exposing only one cannot yield a range.
    if 'lmin' not in daily_data.columns or 'lmax' not in daily_data.columns:
        return None, None

    # Keep only dates on which both values were recorded.
    complete_days = daily_data[['lmin', 'lmax']].dropna()
    if complete_days.empty:
        return None, None

    # Plain floats keep the result JSON-serialisable for the callers that store it.
    lower_bound_normal = float(np.nanpercentile(complete_days['lmin'], lower_percentile))
    upper_bound_normal = float(np.nanpercentile(complete_days['lmax'], upper_percentile))

    return lower_bound_normal, upper_bound_normal

In [21]:
def get_station_detailed_info_from_ref(station_id):
    """
    Build a simplified metadata record for a station by combining two APIs.

    The hydrology API is asked for the station's 'stationReference'; that
    reference is then used to fetch the station from the flood-monitoring API.

    Parameters:
    - station_id (str): Station identifier used in the hydrology API URL.

    Returns:
    - dict: Keys 'dateOpened', 'eaAreaName', 'eaRegionName', 'label', 'lat',
      'long', 'riverName', 'town', 'typicalRangeHigh' and 'typicalRangeLow'.
      The typical range comes from the station's 'stageScale' entry when it
      holds both values, otherwise from calculate_typical_ranges(station_id).
    - str: An error message when either request fails or the reference is missing.
    """
    hydrology_stations_url = "http://environment.data.gov.uk/hydrology/id/stations/"

    # Step 1: resolve the station reference through the hydrology API.
    hydrology_response = requests.get(f"{hydrology_stations_url}{station_id}.json")
    if hydrology_response.status_code != 200:
        return f"Error fetching initial info: {hydrology_response.status_code}"

    hydrology_payload = hydrology_response.json()
    try:
        station_reference = hydrology_payload['items'][0]['stationReference']
    except (IndexError, KeyError):
        return "Station reference not found in initial response."

    # Step 2: fetch the station record from the flood-monitoring API.
    flood_response = requests.get(
        f"https://environment.data.gov.uk/flood-monitoring/id/stations/{station_reference}"
    )
    if flood_response.status_code != 200:
        return f"Error fetching detailed info: {flood_response.status_code}"

    # 'items' is used as a single record here (it is read with .get below).
    details = flood_response.json()['items']

    copied_fields = (
        "dateOpened", "eaAreaName", "eaRegionName", "label",
        "lat", "long", "riverName", "town",
    )
    station_info = {field: details.get(field) for field in copied_fields}

    has_stage_scale_range = (
        'stageScale' in details
        and 'typicalRangeHigh' in details['stageScale']
        and 'typicalRangeLow' in details['stageScale']
    )
    if has_stage_scale_range:
        range_high = details['stageScale']['typicalRangeHigh']
        range_low = details['stageScale']['typicalRangeLow']
    else:
        # No published range: derive one from the station's daily readings.
        range_low, range_high = calculate_typical_ranges(station_id)

    station_info["typicalRangeHigh"] = range_high
    station_info["typicalRangeLow"] = range_low
    return station_info

In [22]:
def fetch_and_save_all_stations_metadata(file_path='all_stations_metadata.json'):
    """
    Download the flood-monitoring station list and store its 'items' via s3_service.

    Parameters:
    - file_path (str): File name placed under the 'flood_stations_metadata/' prefix.

    Returns:
    - None. On a non-200 response a failure message is printed and nothing is saved.
    """
    response = requests.get("https://environment.data.gov.uk/flood-monitoring/id/stations/")
    if response.status_code != 200:
        print(f"Failed to fetch stations metadata: HTTP Status Code {response.status_code}")
        return

    stations = response.json()['items']
    s3_service.save_json_to_s3(f"flood_stations_metadata/{file_path}", stations)
        
        

In [23]:
def save_station_reference_ids(data, file_path='flood_stations_ids.json'):
    """
    Store the station reference IDs found in a station list via s3_service.

    Parameters:
    - data (iterable): Station dicts; entries without 'stationReference' are ignored.
    - file_path (str): File name placed under the 'flood_stations_metadata/' prefix.

    Note: spaces inside each reference are replaced with underscores before
    saving, so stored IDs can differ from the raw API values.
    """
    station_references = []
    for station in data:
        if 'stationReference' in station:
            station_references.append(station['stationReference'].replace(" ", "_"))

    s3_service.save_json_to_s3(f"flood_stations_metadata/{file_path}", station_references)


In [24]:
def extract_station_metadata(station_guid, all_stations_metadata):
    """
    Build a simplified metadata record for one station.

    The station's 'stationReference' is read from get_station_info(station_guid)
    and used to find the matching record in all_stations_metadata.

    Parameters:
    - station_guid (str): GUID of the station to describe.
    - all_stations_metadata (dict | list): Either a mapping from station
      reference to its metadata dict, or a list of metadata dicts that each
      carry a 'stationReference' key.

    Returns:
    - dict: Keys 'dateOpened', 'eaAreaName', 'eaRegionName', 'label', 'lat',
      'long', 'riverName', 'town', 'typicalRangeHigh' and 'typicalRangeLow'.
      The typical range is taken from the record's 'stageScale' dict when it
      holds both values, otherwise from calculate_typical_ranges(station_guid).
    - None: when the station info cannot be fetched, no metadata record matches
      the reference, or no typical range can be determined.
    """
    print(f"Fetching general info for station GUID: {station_guid}")
    general_info = get_station_info(station_guid)

    # get_station_info returns an "Error: ..." string on failure, so insist on a dict.
    if not (isinstance(general_info, dict) and general_info.get('items')):
        print(f"No general info found for station GUID: {station_guid}")
        return None

    station_reference = general_info['items'][0].get('stationReference')
    print(f"Station reference: {station_reference}")

    detailed_data = None
    if station_reference:
        if isinstance(all_stations_metadata, dict):
            detailed_data = all_stations_metadata.get(station_reference)
        else:
            # List form: scan for the record carrying this reference.
            detailed_data = next(
                (
                    record for record in all_stations_metadata
                    if isinstance(record, dict) and record.get('stationReference') == station_reference
                ),
                None,
            )

    if detailed_data is None:
        print(f"No detailed data found for station reference: {station_reference}")
        return None

    print(f"Found detailed data for station reference: {station_reference}")

    station_info = {
        "dateOpened": detailed_data.get("dateOpened"),
        "eaAreaName": detailed_data.get("eaAreaName"),
        "eaRegionName": detailed_data.get("eaRegionName"),
        "label": detailed_data.get("label"),
        "lat": detailed_data.get("lat"),
        "long": detailed_data.get("long"),
        "riverName": detailed_data.get("riverName"),
        "town": detailed_data.get("town"),
        "typicalRangeHigh": None,
        "typicalRangeLow": None
    }

    # 'stageScale' is only usable when it is a dict holding both bounds.
    stage_scale = detailed_data.get('stageScale')
    if isinstance(stage_scale, dict) and 'typicalRangeHigh' in stage_scale and 'typicalRangeLow' in stage_scale:
        station_info["typicalRangeHigh"] = stage_scale['typicalRangeHigh']
        station_info["typicalRangeLow"] = stage_scale['typicalRangeLow']
    else:
        # Fixed: pass this function's own station_guid (the global `guid` was used before).
        typical_range_low, typical_range_high = calculate_typical_ranges(station_guid)
        # Fixed: test each bound against None; the old `a or b is None` check
        # discarded every station that had a valid upper bound.
        if typical_range_low is None or typical_range_high is None:
            return None
        station_info["typicalRangeLow"] = typical_range_low
        station_info["typicalRangeHigh"] = typical_range_high

    return station_info




In [25]:

# Download the full flood-monitoring station list and store it under flood_stations_metadata/.
fetch_and_save_all_stations_metadata()

# Read the stored list back so the following steps work from the saved copy.
all_stations_metadata = s3_service.load_json_from_s3('flood_stations_metadata/all_stations_metadata.json')

# Store just the station reference IDs (spaces replaced with underscores by the helper).
save_station_reference_ids(all_stations_metadata)

In [26]:
# Keep only the water-level + water-flow stations whose stationReference is also
# known to the flood-monitoring API (via the stored reference-ID list).
station_guids = list(common_wl_wf)
reference_ids = set(s3_service.load_json_from_s3('flood_stations_metadata/flood_stations_ids.json'))
confirmed_station_guids = []
confirmed_reference_ids = []

# Iterate over each guid and fetch station info
for guid in station_guids:
    station_info = get_station_info(guid)
    # get_station_info returns an "Error: <status>" string on failure; only a dict carries 'items'.
    if not isinstance(station_info, dict) or not station_info.get('items'):
        print(f"Skipping {guid}: no station info returned")
        continue

    station_reference = station_info['items'][0].get('stationReference')
    if not isinstance(station_reference, str) or not station_reference:
        print(f"Skipping {guid}: no usable stationReference")
        continue

    # The stored ID list was written with spaces replaced by underscores, so apply
    # the same normalisation for the lookup; the raw reference is what gets kept.
    if station_reference.replace(" ", "_") in reference_ids:
        print("found", guid, station_reference)
        confirmed_station_guids.append(guid)
        confirmed_reference_ids.append(station_reference)
    else:
        print(f"Removing {guid} as its reference ID {station_reference} is not found")


print("Checking for any discrepancies in possesed IDs from both APIs: ")
print(f"Initial number of station GUIDs: {len(station_guids)}")
print(f"Number of confirmed station GUIDs: {len(confirmed_station_guids)}")

print(f"Initial number of reference IDs: {len(reference_ids)}")
print(f"Number of confirmed reference IDs: {len(set(confirmed_reference_ids))}")

found 63b8893e-1cef-40bc-8755-c0c4fe3e69dd 4074
found 70e18a89-96dd-4a04-8163-fc685e92bb09 3079TH
found 08a56e86-d214-4288-8726-8de31eb55766 1420TH
found f44bf96d-3953-4fec-88bd-30ef4e12e523 2200TH
found 6078ba41-0d4f-4053-aaea-25d628b1d97f 684027
found 29c364fa-5f64-4622-b309-54f6243aaa83 43116
found e9c72be8-dea1-4a5d-8af7-05dce5f419ee E12442
found b2023a27-1c95-4b64-98d2-90bdf4c224ce 690160
found 29434d59-1597-4783-b0b6-9f9a821d30d8 53120
found a99b0223-fd15-4a76-8c45-4958137fa2ed 3040TH
found c5b13f08-cdf0-41ab-8bbf-7b8587744459 F2760
found c8dd48fe-9e31-4096-986a-bf4c67648c7a F25110
found fcdd7420-c000-4db7-99fa-3835a65372fe E45951
found f516b38c-16cc-4cda-94e7-6a85ee01b7b3 48128
found daca0d8d-b3de-4945-81fb-b01f249210f6 5329TH
found f3904234-3063-44ac-b93a-0f52861ddbb4 711610
found 60a090a3-f93b-4065-a56e-ff64e6eb5f5e 693435
found f7fe310d-d263-486b-ab70-a4663975f7bd 2117
found fc8999c2-d9ad-4a1a-887b-2f9c10f11689 2264TH
found 85aefe46-dde4-4338-9d3c-338754cbc0d6 023009
found 12

In [None]:
# Pair each confirmed GUID with the reference ID found for it (the two lists are parallel)
paired_guids_and_refs = [
    dict(guid=station_guid, referenceId=reference_id)
    for station_guid, reference_id in zip(confirmed_station_guids, confirmed_reference_ids)
]

# Destination of the pairs file
file_path = "flood_stations_metadata/confirmed_guids_refs.json"

# Store the pairs as JSON through the S3 service
s3_service.save_json_to_s3(file_path, paired_guids_and_refs)

print(f"Confirmed GUIDs and reference IDs have been saved to {file_path}")


In [None]:
# Load the confirmed GUID / reference-ID pairs stored as confirmed_guids_refs.json.
station_reference_guid_pairs = s3_service.load_json_from_s3("flood_stations_metadata/confirmed_guids_refs.json")


def fetch_and_save_stage_scale_data_with_fallback(station_reference_id_guid_pairs, s3_service, calculate_typical_ranges_func):
    """
    Save stage-scale data for each station, calculating it when the fetch fails.

    Parameters:
    - station_reference_id_guid_pairs (list): Dicts with 'guid' and 'referenceId' keys.
    - s3_service: Object providing save_json_to_s3(path, data).
    - calculate_typical_ranges_func (callable): Takes a GUID and returns
      (lower_bound, upper_bound); either may be None.

    For every pair the '<referenceId>/stageScale' resource is requested and the
    'items' part of the response is saved to 'flood_stations/<guid>/stage_scale.json'.
    If fetching or saving raises, the typical range is calculated instead and
    saved to the same path. All errors are printed rather than raised.

    NOTE(review): the two paths write different JSON shapes. The fetched file
    holds the bare 'items' content, while the calculated one wraps the range in
    {"@context": ..., "items": {...}}. Confirm that readers handle both.
    """
    base_url = "http://environment.data.gov.uk/flood-monitoring/id/stations/"
    stage_scale_path = "/stageScale"

    def store_calculated_ranges(guid, reference_id, file_path):
        """Fallback: calculate the typical range for guid and save it to file_path."""
        try:
            lower_bound, upper_bound = calculate_typical_ranges_func(guid)
            if lower_bound is None or upper_bound is None:
                print(f"No data available to calculate typical ranges for {guid}.")
                return
            s3_service.save_json_to_s3(file_path, {
                "@context": "http://environment.data.gov.uk/flood-monitoring/meta/context.jsonld",
                "items": {
                    "typicalRangeHigh": upper_bound,
                    "typicalRangeLow": lower_bound
                }
            })
            print(f"Calculated stage scale data for {reference_id} ({guid}) saved successfully.")
        except Exception as e:
            print(f"An error occurred while calculating stage scale data for {reference_id} ({guid}): {e}")

    for pair in station_reference_id_guid_pairs:
        guid, reference_id = pair["guid"], pair["referenceId"]
        file_path = f"flood_stations/{guid}/stage_scale.json"

        try:
            response = requests.get(f"{base_url}{reference_id}{stage_scale_path}")
            response.raise_for_status()  # turn an HTTP error status into an exception
            s3_service.save_json_to_s3(file_path, response.json().get('items', {}))
            print(f"Stage scale data for {reference_id} ({guid}) saved successfully.")
        except Exception as e:
            print(f"Failed to fetch or save stage scale data for {reference_id} ({guid}): {e}")
            print("Attempting to calculate typical ranges...")
            store_calculated_ranges(guid, reference_id, file_path)

           
# Fetch (or, when that fails, calculate) stage-scale data for every confirmed station.
fetch_and_save_stage_scale_data_with_fallback(station_reference_guid_pairs, s3_service, calculate_typical_ranges)

In [None]:
def filter_station_metadata(confirmed_guids_refs_path, all_stations_metadata_path, output_path):
    """
    Save the subset of station metadata that belongs to confirmed stations.

    Parameters:
    - confirmed_guids_refs_path (str): Location of the JSON list of
      {'guid', 'referenceId'} pairs.
    - all_stations_metadata_path (str): Location of the JSON list of station
      metadata dicts.
    - output_path (str): Where the filtered list is saved.

    A station is kept when its 'stationReference' equals the 'referenceId' of
    any confirmed pair. All three locations are accessed through s3_service.
    """
    wanted_references = set()
    for pair in s3_service.load_json_from_s3(confirmed_guids_refs_path):
        wanted_references.add(pair['referenceId'])

    kept_stations = []
    for station in s3_service.load_json_from_s3(all_stations_metadata_path):
        if station.get("stationReference") in wanted_references:
            kept_stations.append(station)

    s3_service.save_json_to_s3(output_path, kept_stations)

# Inputs: the confirmed GUID/reference pairs and the full station metadata list.
# Output: the metadata restricted to confirmed stations.
confirmed_guids_refs_path = 'flood_stations_metadata/confirmed_guids_refs.json'
all_stations_metadata_path = 'flood_stations_metadata/all_stations_metadata.json'
output_path = 'flood_stations_metadata/project_stations_metadata.json'

# Filter and save the metadata
filter_station_metadata(confirmed_guids_refs_path, all_stations_metadata_path, output_path)

In [None]:
def get_stations_detailed_metadata(station_reference_guid_pairs):
    """
    Fetch each station's hydrology record and save it via s3_service.

    Parameters:
    - station_reference_guid_pairs (list): Dicts with a 'guid' key.

    For every pair the 'items' part of the station response is saved to
    'flood_stations/<guid>/detailed_metadata.json'. Unsuccessful responses and
    any exception are printed; nothing is raised or returned.
    """
    base_url = "https://environment.data.gov.uk/hydrology/id/stations/"

    for pair in station_reference_guid_pairs:
        guid = pair["guid"]
        file_path = f"flood_stations/{guid}/detailed_metadata.json"
        try:
            response = requests.get(f"{base_url}{guid}")
            if not response.ok:  # `ok` is the library's success flag, not only status 200
                print(f"Failed to fetch detailed metadata for ({guid}). Status code: {response.status_code}")
                continue
            s3_service.save_json_to_s3(file_path, response.json().get('items', {}))
            print(f"Detailed metadata for ({guid}) saved successfully.")
        except Exception as e:
            print(f"Failed to fetch or save detailed data for ({guid}): {e}")

        

In [None]:
# Fetch and save the detailed hydrology record of every confirmed station (one request per pair).
get_stations_detailed_metadata(station_reference_guid_pairs)

In [56]:
def separate_and_save_measurement_links(station_reference_guid_pairs, s3_service):
    """
    Split each station's measures into 15-minute and daily readings links and save them.

    Parameters:
    - station_reference_guid_pairs (list): Dicts with a 'guid' key.
    - s3_service: Object providing load_json_from_s3(path) and save_json_to_s3(path, data).

    For every station, 'flood_stations/<guid>/detailed_metadata.json' is loaded
    and each entry's 'measures' are turned into {'link', 'type'} dicts, where
    'link' is the measure '@id' plus '/readings.csv' and 'type' is the first
    letter of the parameter name (lower-cased) followed by the first four
    characters of the value type ('u' when the parameter name is missing).
    Measures with period 900 go to '15min_measurements_links.json' and those
    with period 86400 to '1d_measurements_links.json'; other periods are
    dropped, and a file is only written when its list is non-empty.
    """
    readings_csv_extension = "/readings.csv"

    for pair in station_reference_guid_pairs:
        guid = pair["guid"]
        station_records = s3_service.load_json_from_s3(f"flood_stations/{guid}/detailed_metadata.json")

        quarter_hour_links = []
        daily_links = []

        for record in station_records:
            for measure in record.get('measures', []):
                period = measure.get("period")
                link = measure["@id"] + readings_csv_extension
                parameter_name = measure.get("parameterName", "unknown")
                value_type = measure.get("valueType", "unknown")

                # Short type tag, e.g. parameter 'Level' + value type 'min' -> 'lmin'.
                if parameter_name != "unknown":
                    type_name = f"{parameter_name[0].lower()}{value_type[:4]}"
                else:
                    type_name = "u"

                entry = {"link": link, "type": type_name}
                if period == 86400:
                    daily_links.append(entry)
                elif period == 900:
                    quarter_hour_links.append(entry)

        if quarter_hour_links:
            s3_service.save_json_to_s3(f"flood_stations/{guid}/15min_measurements_links.json", quarter_hour_links)

        if daily_links:
            s3_service.save_json_to_s3(f"flood_stations/{guid}/1d_measurements_links.json", daily_links)

        print(f"Measurement links for station {guid} separated and saved successfully.")





In [18]:
# Load the confirmed GUID / reference-ID pairs from confirmed_guids_refs.json.
station_reference_guid_pairs = s3_service.load_json_from_s3("flood_stations_metadata/confirmed_guids_refs.json")


In [57]:
# Build the per-station 15-minute and daily measurement link files from the saved detailed metadata.
separate_and_save_measurement_links(station_reference_guid_pairs, s3_service)

Measurement links for station 65b8dffb-ee80-4b9b-b2ba-4670d2ba06c2 separated and saved successfully.
Measurement links for station b6cc814c-4b13-4645-89ee-25e8e3d79dc0 separated and saved successfully.
Measurement links for station 2d88723b-45f6-46e2-b662-6b2573a7fba7 separated and saved successfully.
Measurement links for station a5bf03e9-e63a-4cd5-8bc9-ab64c03808b9 separated and saved successfully.
Measurement links for station c7e88659-7744-4aa0-b62e-93cb6c25ef2b separated and saved successfully.
Measurement links for station fc8999c2-d9ad-4a1a-887b-2f9c10f11689 separated and saved successfully.
Measurement links for station df30fe33-6290-4159-a181-953de9fc9f9b separated and saved successfully.
Measurement links for station 6f727c14-422c-4b7f-922d-b430d9aa07a2 separated and saved successfully.
Measurement links for station 3df68add-46f9-42e7-8439-290f63e195cd separated and saved successfully.
Measurement links for station fce4a0d2-2406-4c67-8bf4-35cd091e6545 separated and saved succ

In [35]:
import math

def calculate_distance(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points, using the haversine formula.

    Parameters:
    - lat1, lon1 (float): Latitude and longitude of the first point, in degrees.
    - lat2, lon2 (float): Latitude and longitude of the second point, in degrees.

    Returns:
    - float: Distance in kilometres, for an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    delta_lat = math.radians(lat2 - lat1)
    delta_lon = math.radians(lon2 - lon1)
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)

    haversine = (
        math.sin(delta_lat/2) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon/2) ** 2
    )
    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1-haversine))
    return earth_radius_km * central_angle

def find_nearest_station(target_lat, target_long, stations):
    """
    Pick the station closest to a target coordinate.

    Parameters:
    - target_lat, target_long (float): Target position in degrees.
    - stations (dict): Listing with an 'items' list; every item must provide
      'lat' and 'long'.

    Returns:
    - dict: The item with the smallest calculate_distance to the target (the
      first one on ties), or None when 'items' is empty.
    """
    best_station, best_distance = None, float('inf')
    for candidate in stations['items']:
        candidate_distance = calculate_distance(
            target_lat, target_long, candidate['lat'], candidate['long']
        )
        if candidate_distance < best_distance:
            best_station, best_distance = candidate, candidate_distance
    return best_station



In [17]:
def extract_measurement_links(station):
    """
    Group a station's measure IDs by sampling period.

    Parameters:
    - station (dict): Station record; its optional 'measures' list holds dicts
      that must each provide 'period'.

    Returns:
    - dict: {'15min': [...], 'daily': [...]} holding the '@id' of every measure
      with period 900 and 86400 respectively. Other periods are ignored.
    """
    quarter_hourly = []
    daily = []
    for measure in station.get('measures', []):
        period = measure['period']
        if period == 900:  # 15 minutes, in seconds
            quarter_hourly.append(measure['@id'])
        elif period == 86400:  # one day, in seconds
            daily.append(measure['@id'])

    return {'15min': quarter_hourly, 'daily': daily}


In [50]:
# Load the station metadata stored at project_stations_metadata.json (the output path used by filter_station_metadata).
project_stations_metadata = s3_service.load_json_from_s3('flood_stations_metadata/project_stations_metadata.json')

In [67]:
def update_station_measurement_links(guid, rainfall_links):
    """
    Insert or replace the 'rain' entry in a station's measurement link files.

    Parameters:
    - guid (str): Station GUID; selects the 'flood_stations/<guid>/' files.
    - rainfall_links (dict): {'15min': [...], 'daily': [...]} lists of measure
      IDs; only the first ID of each list is used.

    Both link files are loaded through s3_service, the first entry of type
    'rain' is replaced (or a new one appended) when a rainfall ID is available
    for that period, and both files are saved again even if nothing changed.
    """
    daily_key = f"flood_stations/{guid}/1d_measurements_links.json"
    quarter_hour_key = f"flood_stations/{guid}/15min_measurements_links.json"

    one_day_links = s3_service.load_json_from_s3(daily_key)
    fifteen_minute_links = s3_service.load_json_from_s3(quarter_hour_key)

    def position_of_rain(links):
        """Index of the first entry whose type is 'rain', or None."""
        for position, entry in enumerate(links):
            if entry["type"] == "rain":
                return position
        return None

    def rain_entry(measure_ids):
        """Link entry for the first measure ID, or None when there is none."""
        if measure_ids:
            return {'link': measure_ids[0] + '/readings.csv', 'type': 'rain'}
        return None

    quarter_hour_position = position_of_rain(fifteen_minute_links)
    daily_position = position_of_rain(one_day_links)

    quarter_hour_entry = rain_entry(rainfall_links['15min'])
    daily_entry = rain_entry(rainfall_links['daily'])

    updates = (
        (fifteen_minute_links, quarter_hour_position, quarter_hour_entry),
        (one_day_links, daily_position, daily_entry),
    )
    for links, position, entry in updates:
        if not entry:
            continue
        if position is None:
            links.append(entry)
        else:
            links[position] = entry

    s3_service.save_json_to_s3(quarter_hour_key, fifteen_minute_links)
    s3_service.save_json_to_s3(daily_key, one_day_links)





In [42]:
def get_guid_for_station_reference(station_reference, station_reference_guid_pairs):
    """
    Look up the GUID paired with a station reference.

    Parameters:
    - station_reference (str): The reference ID of the station.
    - station_reference_guid_pairs (list): Dicts that each hold a 'guid' and a 'referenceId'.

    Returns:
    - str: The 'guid' of the first pair whose 'referenceId' equals the
      reference, or None if no pair matches.
    """
    return next(
        (
            pair['guid']
            for pair in station_reference_guid_pairs
            if pair['referenceId'] == station_reference
        ),
        None,
    )

def add_nearest_measurement_links(project_stations_metadata, rainfall_stations, station_reference_guid_pairs):
    """
    Attach the nearest rainfall station's measurement links to every project station.

    Parameters:
    - project_stations_metadata (list): Station dicts providing 'stationReference',
      'lat' and 'long'.
    - rainfall_stations (dict): Rainfall station listing handed to find_nearest_station.
    - station_reference_guid_pairs (list): Dicts with 'guid' and 'referenceId'
      keys, used to translate a station reference into its GUID.

    Stations whose reference has no GUID are reported and skipped. For the
    others, the links of the nearest rainfall station are passed to
    update_station_measurement_links.
    """
    for station in project_stations_metadata:
        station_reference = station.get('stationReference')
        guid = get_guid_for_station_reference(station_reference, station_reference_guid_pairs)

        if not guid:
            print(f"No GUID found for station reference: {station_reference}")
            continue

        nearest_rainfall_station = find_nearest_station(station['lat'], station['long'], rainfall_stations)
        update_station_measurement_links(guid, extract_measurement_links(nearest_rainfall_station))


In [68]:
# For each project station, attach the measurement links of the nearest active rainfall station.
add_nearest_measurement_links(project_stations_metadata, active_stations_rf, station_reference_guid_pairs)

{'15min': ['http://environment.data.gov.uk/hydrology/id/measures/bf217aec-f418-42a6-9b0f-a3de46d7c592-rainfall-t-900-mm-qualified'], 'daily': ['http://environment.data.gov.uk/hydrology/id/measures/bf217aec-f418-42a6-9b0f-a3de46d7c592-rainfall-t-86400-mm-qualified']}
{'15min': ['http://environment.data.gov.uk/hydrology/id/measures/5d800943-7e89-4c43-ae19-0791e533df0a-rainfall-t-900-mm-qualified'], 'daily': ['http://environment.data.gov.uk/hydrology/id/measures/5d800943-7e89-4c43-ae19-0791e533df0a-rainfall-t-86400-mm-qualified']}
{'15min': ['http://environment.data.gov.uk/hydrology/id/measures/5e7efda0-4f88-448b-a0a0-860a2d79c7f7-rainfall-t-900-mm-qualified'], 'daily': ['http://environment.data.gov.uk/hydrology/id/measures/5e7efda0-4f88-448b-a0a0-860a2d79c7f7-rainfall-t-86400-mm-qualified']}
{'15min': ['http://environment.data.gov.uk/hydrology/id/measures/52a0e101-be9f-4680-aa58-529b7e728d84-rainfall-t-900-mm-qualified'], 'daily': ['http://environment.data.gov.uk/hydrology/id/measures/52