In [2]:
!pip install bs4

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL
BASE_URL = "https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/"
LOCAL_DIR = "data/indian_ocean/2025/01/"

# Ensure local directory exists
os.makedirs(LOCAL_DIR, exist_ok=True)

def get_nc_files(url):
    """Scrape the directory listing to get .nc file URLs"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.nc')]
    return [urljoin(url, link) for link in links]

def download_file(file_url, local_path):
    """Download a file if not already present"""
    if os.path.exists(local_path):
        print(f"✅ Skipping (exists): {local_path}")
        return
    print(f"⬇️ Downloading: {file_url}")
    r = requests.get(file_url, stream=True)
    with open(local_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"✅ Saved: {local_path}")

def main():
    file_urls = get_nc_files(BASE_URL)
    for file_url in file_urls:
        filename = os.path.basename(file_url)
        local_path = os.path.join(LOCAL_DIR, filename)
        download_file(file_url, local_path)

if __name__ == "__main__":
    main()


✅ Skipping (exists): data/indian_ocean/2025/01/20250101_prof.nc
✅ Skipping (exists): data/indian_ocean/2025/01/20250102_prof.nc
✅ Skipping (exists): data/indian_ocean/2025/01/20250103_prof.nc
✅ Skipping (exists): data/indian_ocean/2025/01/20250104_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250105_prof.nc
✅ Saved: data/indian_ocean/2025/01/20250105_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250106_prof.nc
✅ Saved: data/indian_ocean/2025/01/20250106_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250107_prof.nc
✅ Saved: data/indian_ocean/2025/01/20250107_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250108_prof.nc
✅ Saved: data/indian_ocean/2025/01/20250108_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250109_prof.nc
✅ Saved: data/indian_ocean/2025/01/20250109_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/

In [5]:
print("Summary of N_PROF and N_LEVELS for each file:")
for entry in dimensions_data:
    print(f"File: {entry['filename']}, N_PROF: {entry['N_PROF']}, N_LEVELS: {entry['N_LEVELS']}")

Summary of N_PROF and N_LEVELS for each file:


NameError: name 'dimensions_data' is not defined

## Final Task

### Subtask:
Summarize the extracted N_PROF and N_LEVELS information for all files in the directory.


## Summary:

### Data Analysis Key Findings

*   A total of 31 NetCDF files were identified in the directory `/content/data/indian_ocean/2025/01/`.
*   For each of these files, the `N_PROF` and `N_LEVELS` dimension values were successfully extracted using `xarray`.
*   The extraction process was refined to use `ds.sizes.get()` for retrieving dimension lengths, which addressed `FutureWarning` messages related to `Dataset.dims` and ensures compatibility with future `xarray` versions.
*   The `N_PROF` and `N_LEVELS` values varied across files; for example:
    *   `20250118_prof.nc` had `N_PROF`: 68 and `N_LEVELS`: 1165.
    *   `20250125_prof.nc` had `N_PROF`: 89 and `N_LEVELS`: 1218.
    *   `20250101_prof.nc` had `N_PROF`: 77 and `N_LEVELS`: 1334.

### Insights or Next Steps

*   This summary provides a foundational understanding of the profile and level counts within each NetCDF file, which is crucial for subsequent data processing and analysis, such as ensuring consistent data structures or identifying outliers.
*   Further analysis could involve calculating descriptive statistics (e.g., minimum, maximum, average, standard deviation) for `N_PROF` and `N_LEVELS` across all files to understand the overall characteristics and variability of the dataset.


In [None]:
total_n_prof = sum([entry['N_PROF'] for entry in dimensions_data])
total_n_levels = sum([entry['N_LEVELS'] for entry in dimensions_data])
num_files = len(dimensions_data)

average_n_prof = total_n_prof / num_files
average_n_levels = total_n_levels / num_files

print(f"Average N_PROF across all files: {average_n_prof:.2f}")
print(f"Average N_LEVELS across all files: {average_n_levels:.2f}")

Average N_PROF across all files: 78.52
Average N_LEVELS across all files: 1385.55


In [None]:
import xarray as xr
import pandas as pd
import numpy as np

def extract_profiles(nc_file):
    ds = xr.open_dataset(nc_file)

    # --- Identify dimensions safely ---
    n_prof = ds.sizes.get("N_PROF", 0)
    n_levels = ds.sizes.get("N_LEVELS", 0)
    if n_prof == 0 or n_levels == 0:
        print(f"⚠️ Skipping file (no valid dimensions): {nc_file}")
        ds.close()
        return pd.DataFrame(), pd.DataFrame()

    # --- Extract core variables ---
    pres = ds["PRES"].values if "PRES" in ds.variables else np.full((n_prof, n_levels), np.nan)
    temp = ds["TEMP"].values if "TEMP" in ds.variables else np.full((n_prof, n_levels), np.nan)
    psal = ds["PSAL"].values if "PSAL" in ds.variables else np.full((n_prof, n_levels), np.nan)
    lat = ds["LATITUDE"].values if "LATITUDE" in ds.variables else np.full(n_prof, np.nan)
    lon = ds["LONGITUDE"].values if "LONGITUDE" in ds.variables else np.full(n_prof, np.nan)

    # --- Build measurements table (normalized) ---
    measurements_records = []
    for i in range(n_prof):
        for j in range(n_levels):
            # Skip empty measurements
            if np.isnan(pres[i, j]) or np.isnan(temp[i, j]) or np.isnan(psal[i, j]):
                continue
            measurements_records.append({
                "profile_id": i,  # local profile index within file
                "level": j,
                "latitude": float(lat[i]) if not np.isnan(lat[i]) else None,
            "longitude": float(lon[i]) if not np.isnan(lon[i]) else None,
                "pressure": float(pres[i, j]),
                "temperature": float(temp[i, j]),
                "salinity": float(psal[i, j])
            })

    measurements_df = pd.DataFrame(measurements_records)

    # --- Build profiles table (metadata) ---
    profiles_records = []
    for i in range(n_prof):
        # Try to get datetime safely
        try:
            datetime_val = pd.to_datetime(ds["JULD"][i].values)
        except Exception:
            datetime_val = None

        # Handle float_id safely (can be array of chars)
        if "PLATFORM_NUMBER" in ds.variables:
            float_id = str(ds["PLATFORM_NUMBER"][i].values)
            float_id = float_id.strip("[]").replace(" ", "").replace("'", "")
        else:
            float_id = "unknown"

        record = {
            "profile_id": i,
            "float_id": float_id,
            "cycle_number": int(ds["CYCLE_NUMBER"][i].values) if "CYCLE_NUMBER" in ds.variables else None,
            "datetime": datetime_val,
            "latitude": float(lat[i]) if not np.isnan(lat[i]) else None,
            "longitude": float(lon[i]) if not np.isnan(lon[i]) else None,
            "project_name": str(ds.attrs.get("project_name", "ARGO")),
            "data_mode": str(ds.attrs.get("data_mode", "R")),
            "platform_type": str(ds.attrs.get("platform_type", "unknown")),
            "institution": str(ds.attrs.get("institution", "unknown")),
            "title": str(ds.attrs.get("title", "unknown")),
            "source": str(ds.attrs.get("source", "unknown")),
            "min_pressure": float(np.nanmin(pres[i])) if np.any(~np.isnan(pres[i])) else None,
            "max_pressure": float(np.nanmax(pres[i])) if np.any(~np.isnan(pres[i])) else None,
            "measured_parameters": ", ".join([v for v in ds.data_vars if v not in ["N_PROF", "N_LEVELS"]])
        }
        profiles_records.append(record)

    profiles_df = pd.DataFrame(profiles_records)

    ds.close()

    print(f"✅ Extracted {len(profiles_df)} profiles and {len(measurements_df)} measurements.")
    return profiles_df, measurements_df


# Example usage
if __name__ == "__main__":
    nc_file = "/content/data/indian_ocean/2025/01/20250101_prof.nc"
    profiles_df, measurements_df = extract_profiles(nc_file)

    print("\n--- Profiles Table ---")
    print(profiles_df.head())

    print("\n--- Measurements Table ---")
    print(measurements_df.head())


✅ Extracted 77 profiles and 49620 measurements.

--- Profiles Table ---
   profile_id  float_id  cycle_number            datetime   latitude  \
0           0  b5906527            94 2025-01-01 23:10:46 -43.513000   
1           1  b5905521            96 2025-01-01 22:30:06  -8.248500   
2           2  b5906256           168 2025-01-01 22:23:17 -43.034000   
3           3  b1902259           164 2025-01-01 22:06:27 -33.786720   
4           4  b7900576           146 2025-01-01 21:39:20 -45.424212   

   longitude project_name data_mode platform_type institution  \
0   35.46800         ARGO         R       unknown     FR GDAC   
1  105.79380         ARGO         R       unknown     FR GDAC   
2  134.93100         ARGO         R       unknown     FR GDAC   
3   55.46375         ARGO         R       unknown     FR GDAC   
4   43.18347         ARGO         R       unknown     FR GDAC   

                         title      source  min_pressure  max_pressure  \
0  Argo float vertical profile

## profiles_df → Vector DB

This DataFrame has all the contextual metadata about each profile:

float_id, cycle_number, datetime, latitude, longitude,
project_name, data_mode, platform_type, institution, title,
source, measured_parameters, min_pressure, max_pressure


LLMs can use this to answer questions like:

“Which Argo floats were deployed by INCOIS in January 2025?”
“Find the float that measured deepest salinity profiles near equator.”

These don’t need precise numeric filtering — instead, they’re semantic / contextual.

## measurements_df → PostgreSQL

Contains the actual depth-wise numeric data:

profile_id, level, pressure, temperature, salinity


This is what your system queries when the user asks:

“What’s the temperature 30 m off the coast of Tamil Nadu?”
“Show me salinity at 50 dbar for profile 102.”

PostgreSQL can handle these fast and precisely using indexes on pressure, latitude, longitude.