In [2]:
!pip install netCDF4

Collecting netCDF4
  Downloading netcdf4-1.7.3-cp311-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.9 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.7 kB)
Downloading netcdf4-1.7.3-cp311-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.5 netCDF4-1.7.3


In [None]:
import xarray as xr
import netCDF4

file_path = "/content/20250808_prof.nc"

ds = xr.open_dataset(file_path)
print(ds)

<xarray.Dataset> Size: 9MB
Dimensions:                       (N_PROF: 70, N_PARAM: 3, N_LEVELS: 1485,
                                   N_CALIB: 2, N_HISTORY: 0)
Dimensions without coordinates: N_PROF, N_PARAM, N_LEVELS, N_CALIB, N_HISTORY
Data variables: (12/64)
    DATA_TYPE                     object 8B ...
    FORMAT_VERSION                object 8B ...
    HANDBOOK_VERSION              object 8B ...
    REFERENCE_DATE_TIME           object 8B ...
    DATE_CREATION                 object 8B ...
    DATE_UPDATE                   object 8B ...
    ...                            ...
    HISTORY_ACTION                (N_HISTORY, N_PROF) object 0B ...
    HISTORY_PARAMETER             (N_HISTORY, N_PROF) object 0B ...
    HISTORY_START_PRES            (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_STOP_PRES             (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_PREVIOUS_VALUE        (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_QCTEST                (N_HISTORY, N_PROF) object 0B .

In [None]:
import xarray as xr
import pandas as pd

# Extract variables
temp = ds["TEMP"].values
pres = ds["PRES"].values
sal  = ds["PSAL"].values
lat  = ds["LATITUDE"].values
lon  = ds["LONGITUDE"].values
time = ds["JULD"].values

# Flatten the data to align profiles with levels
nprof, nlevels = temp.shape
df = pd.DataFrame({
    "Profile": [i for i in range(nprof) for _ in range(nlevels)],
    "Latitude": [lat[i] for i in range(nprof) for _ in range(nlevels)],
    "Longitude": [lon[i] for i in range(nprof) for _ in range(nlevels)],
    "Time": [time[i] for i in range(nprof) for _ in range(nlevels)],
    "Pressure": pres.flatten(),
    "Temperature": temp.flatten(),
    "Salinity": sal.flatten()
})

df.head(50)


Unnamed: 0,Profile,Latitude,Longitude,Time,Pressure,Temperature,Salinity
0,0,-54.880965,88.561498,2025-08-08 23:58:30,3.9,1.913,29.812
1,0,-54.880965,88.561498,2025-08-08 23:58:30,5.0,1.911,29.815001
2,0,-54.880965,88.561498,2025-08-08 23:58:30,6.2,1.919,29.813
3,0,-54.880965,88.561498,2025-08-08 23:58:30,7.1,1.92,29.815001
4,0,-54.880965,88.561498,2025-08-08 23:58:30,8.0,1.921,29.815001
5,0,-54.880965,88.561498,2025-08-08 23:58:30,8.8,1.92,29.815001
6,0,-54.880965,88.561498,2025-08-08 23:58:30,9.7,1.921,29.815001
7,0,-54.880965,88.561498,2025-08-08 23:58:30,10.8,1.92,29.813999
8,0,-54.880965,88.561498,2025-08-08 23:58:30,11.9,1.921,29.813999
9,0,-54.880965,88.561498,2025-08-08 23:58:30,12.7,1.92,29.815001


In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL
BASE_URL = "https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/"
LOCAL_DIR = "/content/data/indian_ocean/2025/01/"

# Ensure local directory exists
os.makedirs(LOCAL_DIR, exist_ok=True)

def get_nc_files(url):
    """Scrape the directory listing to get .nc file URLs"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.nc')]
    return [urljoin(url, link) for link in links]

def download_file(file_url, local_path):
    """Download a file if not already present"""
    if os.path.exists(local_path):
        print(f"✅ Skipping (exists): {local_path}")
        return
    print(f"⬇️ Downloading: {file_url}")
    r = requests.get(file_url, stream=True)
    with open(local_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"✅ Saved: {local_path}")

def main():
    file_urls = get_nc_files(BASE_URL)
    for file_url in file_urls:
        filename = os.path.basename(file_url)
        local_path = os.path.join(LOCAL_DIR, filename)
        download_file(file_url, local_path)

if __name__ == "__main__":
    main()


⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250101_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250101_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250102_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250102_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250103_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250103_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250104_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250104_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250105_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250105_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250106_prof.nc
✅ Saved: /content/data/indian_ocean/2025/01/20250106_prof.nc
⬇️ Downloading: https://data-argo.ifremer.fr/geo/indian_ocean/2025/01/20250107_prof.nc
✅ Saved: /content/data/in

In [3]:
import xarray as xr
import netCDF4

file_path = "/content/data/indian_ocean/2025/01/20250101_prof.nc"

ds = xr.open_dataset(file_path)
print(ds)

<xarray.Dataset> Size: 9MB
Dimensions:                       (N_PROF: 77, N_PARAM: 3, N_LEVELS: 1334,
                                   N_CALIB: 3, N_HISTORY: 0)
Dimensions without coordinates: N_PROF, N_PARAM, N_LEVELS, N_CALIB, N_HISTORY
Data variables: (12/64)
    DATA_TYPE                     object 8B ...
    FORMAT_VERSION                object 8B ...
    HANDBOOK_VERSION              object 8B ...
    REFERENCE_DATE_TIME           object 8B ...
    DATE_CREATION                 object 8B ...
    DATE_UPDATE                   object 8B ...
    ...                            ...
    HISTORY_ACTION                (N_HISTORY, N_PROF) object 0B ...
    HISTORY_PARAMETER             (N_HISTORY, N_PROF) object 0B ...
    HISTORY_START_PRES            (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_STOP_PRES             (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_PREVIOUS_VALUE        (N_HISTORY, N_PROF) float32 0B ...
    HISTORY_QCTEST                (N_HISTORY, N_PROF) object 0B .

# Task
Summarize the extracted `N_PROF` and `N_LEVELS` information for all NetCDF files in the directory `/content/data/indian_ocean/2025/01/`.

## List NetCDF Files

### Subtask:
Identify and list all `.nc` files present in the directory `/content/data/indian_ocean/2025/01/`.


**Reasoning**:
The first instruction is to import the `os` module, which is necessary for interacting with the file system to list files in a directory.



In [4]:
import os

print("os module imported.")

os module imported.


**Reasoning**:
Now that the `os` module is imported, I will define the directory path, list all items in it using `os.listdir()`, and filter the list to include only files ending with `.nc`, storing them in a variable as instructed.



In [5]:
directory_path = "/content/data/indian_ocean/2025/01/"
all_files = os.listdir(directory_path)
nc_files = [file for file in all_files if file.endswith('.nc')]

print(f"Found {len(nc_files)} NetCDF files:")
for file in nc_files:
    print(file)

Found 31 NetCDF files:
20250118_prof.nc
20250125_prof.nc
20250126_prof.nc
20250101_prof.nc
20250130_prof.nc
20250121_prof.nc
20250105_prof.nc
20250115_prof.nc
20250123_prof.nc
20250122_prof.nc
20250116_prof.nc
20250102_prof.nc
20250103_prof.nc
20250120_prof.nc
20250113_prof.nc
20250108_prof.nc
20250104_prof.nc
20250129_prof.nc
20250117_prof.nc
20250131_prof.nc
20250109_prof.nc
20250112_prof.nc
20250127_prof.nc
20250107_prof.nc
20250111_prof.nc
20250106_prof.nc
20250110_prof.nc
20250114_prof.nc
20250128_prof.nc
20250119_prof.nc
20250124_prof.nc


## Extract Dimensions from Each File

### Subtask:
For each .nc file, open it using xarray, extract the values for N_PROF and N_LEVELS dimensions, and store this information.


**Reasoning**:
I need to import the `xarray` library to open and process the NetCDF files. This is the first step as per the instructions.



In [6]:
import xarray as xr

print("xarray imported as xr.")

xarray imported as xr.


**Reasoning**:
Now that `xarray` is imported, I will create an empty list `dimensions_data`, iterate through the `nc_files`, construct the full path for each file, open it with `xr.open_dataset()`, extract `N_PROF` and `N_LEVELS` dimensions using `.get()` to handle missing dimensions gracefully, append the information to `dimensions_data`, and then close each dataset to manage resources, as per the instructions.



In [7]:
dimensions_data = []

for filename in nc_files:
    full_path = os.path.join(directory_path, filename)
    with xr.open_dataset(full_path) as ds:
        n_prof_val = ds.dims.get('N_PROF', None)
        n_levels_val = ds.dims.get('N_LEVELS', None)
        dimensions_data.append({
            'filename': filename,
            'N_PROF': n_prof_val,
            'N_LEVELS': n_levels_val
        })

print(f"Extracted dimensions for {len(dimensions_data)} files.")
# Optionally, display the first few entries
# print(dimensions_data[:5])

  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_v

Extracted dimensions for 31 files.


  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)
  n_prof_val = ds.dims.get('N_PROF', None)
  n_levels_val = ds.dims.get('N_LEVELS', None)


**Reasoning**:
To address the `FutureWarning` about `Dataset.dims` being deprecated for dimension lengths, I will update the code to use `Dataset.sizes` which is the recommended way to access dimension lengths. This will remove the warnings while keeping the functionality the same.



In [8]:
dimensions_data = []

for filename in nc_files:
    full_path = os.path.join(directory_path, filename)
    with xr.open_dataset(full_path) as ds:
        n_prof_val = ds.sizes.get('N_PROF', None)
        n_levels_val = ds.sizes.get('N_LEVELS', None)
        dimensions_data.append({
            'filename': filename,
            'N_PROF': n_prof_val,
            'N_LEVELS': n_levels_val
        })

print(f"Extracted dimensions for {len(dimensions_data)} files.")
# Optionally, display the first few entries
# print(dimensions_data[:5])

Extracted dimensions for 31 files.


## Display Dimensions

### Subtask:
Print the filename along with its corresponding N_PROF and N_LEVELS values for each processed file.


**Reasoning**:
I need to iterate through the `dimensions_data` list and print the filename, N_PROF, and N_LEVELS for each entry as instructed.



In [9]:
print("Summary of N_PROF and N_LEVELS for each file:")
for entry in dimensions_data:
    print(f"File: {entry['filename']}, N_PROF: {entry['N_PROF']}, N_LEVELS: {entry['N_LEVELS']}")

Summary of N_PROF and N_LEVELS for each file:
File: 20250118_prof.nc, N_PROF: 68, N_LEVELS: 1165
File: 20250125_prof.nc, N_PROF: 89, N_LEVELS: 1218
File: 20250126_prof.nc, N_PROF: 91, N_LEVELS: 1411
File: 20250101_prof.nc, N_PROF: 77, N_LEVELS: 1334
File: 20250130_prof.nc, N_PROF: 69, N_LEVELS: 1361
File: 20250121_prof.nc, N_PROF: 81, N_LEVELS: 1294
File: 20250105_prof.nc, N_PROF: 87, N_LEVELS: 1021
File: 20250115_prof.nc, N_PROF: 96, N_LEVELS: 1193
File: 20250123_prof.nc, N_PROF: 98, N_LEVELS: 1394
File: 20250122_prof.nc, N_PROF: 72, N_LEVELS: 1544
File: 20250116_prof.nc, N_PROF: 91, N_LEVELS: 1200
File: 20250102_prof.nc, N_PROF: 86, N_LEVELS: 1323
File: 20250103_prof.nc, N_PROF: 88, N_LEVELS: 1521
File: 20250120_prof.nc, N_PROF: 74, N_LEVELS: 1409
File: 20250113_prof.nc, N_PROF: 91, N_LEVELS: 1486
File: 20250108_prof.nc, N_PROF: 76, N_LEVELS: 2000
File: 20250104_prof.nc, N_PROF: 63, N_LEVELS: 1117
File: 20250129_prof.nc, N_PROF: 79, N_LEVELS: 1354
File: 20250117_prof.nc, N_PROF: 75, 

## Final Task

### Subtask:
Summarize the extracted N_PROF and N_LEVELS information for all files in the directory.


## Summary:

### Data Analysis Key Findings

*   A total of 31 NetCDF files were identified in the directory `/content/data/indian_ocean/2025/01/`.
*   For each of these files, the `N_PROF` and `N_LEVELS` dimension values were successfully extracted using `xarray`.
*   The extraction process was refined to use `ds.sizes.get()` for retrieving dimension lengths, which addressed `FutureWarning` messages related to `Dataset.dims` and ensures compatibility with future `xarray` versions.
*   The `N_PROF` and `N_LEVELS` values varied across files; for example:
    *   `20250118_prof.nc` had `N_PROF`: 68 and `N_LEVELS`: 1165.
    *   `20250125_prof.nc` had `N_PROF`: 89 and `N_LEVELS`: 1218.
    *   `20250101_prof.nc` had `N_PROF`: 77 and `N_LEVELS`: 1334.

### Insights or Next Steps

*   This summary provides a foundational understanding of the profile and level counts within each NetCDF file, which is crucial for subsequent data processing and analysis, such as ensuring consistent data structures or identifying outliers.
*   Further analysis could involve calculating descriptive statistics (e.g., minimum, maximum, average, standard deviation) for `N_PROF` and `N_LEVELS` across all files to understand the overall characteristics and variability of the dataset.


In [10]:
total_n_prof = sum([entry['N_PROF'] for entry in dimensions_data])
total_n_levels = sum([entry['N_LEVELS'] for entry in dimensions_data])
num_files = len(dimensions_data)

average_n_prof = total_n_prof / num_files
average_n_levels = total_n_levels / num_files

print(f"Average N_PROF across all files: {average_n_prof:.2f}")
print(f"Average N_LEVELS across all files: {average_n_levels:.2f}")

Average N_PROF across all files: 78.52
Average N_LEVELS across all files: 1385.55


In [50]:
import xarray as xr
import pandas as pd
import numpy as np


def extract_profiles(nc_file):
    ds = xr.open_dataset(nc_file)

    # --- Identify dimensions safely ---
    n_prof = ds.sizes.get("N_PROF", 0)
    n_levels = ds.sizes.get("N_LEVELS", 0)
    if n_prof == 0 or n_levels == 0:
        print(f"⚠️ Skipping file (no valid dimensions): {nc_file}")
        ds.close()
        return pd.DataFrame(), pd.DataFrame()

    # --- Extract core variables ---
    pres = ds["PRES"].values if "PRES" in ds.variables else np.full((n_prof, n_levels), np.nan)
    temp = ds["TEMP"].values if "TEMP" in ds.variables else np.full((n_prof, n_levels), np.nan)
    psal = ds["PSAL"].values if "PSAL" in ds.variables else np.full((n_prof, n_levels), np.nan)
    lat = ds["LATITUDE"].values if "LATITUDE" in ds.variables else np.full(n_prof, np.nan)
    lon = ds["LONGITUDE"].values if "LONGITUDE" in ds.variables else np.full(n_prof, np.nan)

    # --- Build measurements table (normalized) ---
    measurements_records = []
    for i in range(n_prof):
        for j in range(n_levels):
            # Skip empty measurements
            if np.isnan(pres[i, j]) or np.isnan(temp[i, j]) or np.isnan(psal[i, j]):
                continue
            measurements_records.append({
                "profile_id": i,  # local profile index within file
                "level": j,
                "pressure": float(pres[i, j]),
                "temperature": float(temp[i, j]),
                "salinity": float(psal[i, j])
            })

    measurements_df = pd.DataFrame(measurements_records)

    # --- Build profiles table (metadata) ---
    profiles_records = []
    for i in range(n_prof):
        # Try to get datetime safely
        try:
            datetime_val = pd.to_datetime(ds["JULD"][i].values)
        except Exception:
            datetime_val = None

        # Handle float_id safely (can be array of chars)
        if "PLATFORM_NUMBER" in ds.variables:
            float_id = str(ds["PLATFORM_NUMBER"][i].values)
            float_id = float_id.strip("[]").replace(" ", "").replace("'", "")
        else:
            float_id = "unknown"

        record = {
            "profile_id": i,
            "float_id": float_id,
            "cycle_number": int(ds["CYCLE_NUMBER"][i].values) if "CYCLE_NUMBER" in ds.variables else None,
            "datetime": datetime_val,
            "latitude": float(lat[i]) if not np.isnan(lat[i]) else None,
            "longitude": float(lon[i]) if not np.isnan(lon[i]) else None,
            "project_name": str(ds.attrs.get("project_name", "ARGO")),
            "data_mode": str(ds.attrs.get("data_mode", "R")),
            "platform_type": str(ds.attrs.get("platform_type", "unknown")),
            "institution": str(ds.attrs.get("institution", "unknown")),
            "title": str(ds.attrs.get("title", "unknown")),
            "source": str(ds.attrs.get("source", "unknown")),
            "min_pressure": float(np.nanmin(pres[i])) if np.any(~np.isnan(pres[i])) else None,
            "max_pressure": float(np.nanmax(pres[i])) if np.any(~np.isnan(pres[i])) else None,
            "measured_parameters": ", ".join([v for v in ds.data_vars if v not in ["N_PROF", "N_LEVELS"]])
        }
        profiles_records.append(record)

    profiles_df = pd.DataFrame(profiles_records)

    ds.close()

    print(f"✅ Extracted {len(profiles_df)} profiles and {len(measurements_df)} measurements.")
    return profiles_df, measurements_df


# Example usage
if __name__ == "__main__":
    nc_file = "/content/data/indian_ocean/2025/01/20250101_prof.nc"
    profiles_df, measurements_df = extract_profiles(nc_file)

    print("\n--- Profiles Table ---")
    print(profiles_df.head())

    print("\n--- Measurements Table ---")
    print(measurements_df.head())


✅ Extracted 77 profiles and 49620 measurements.

--- Profiles Table ---
   profile_id  float_id  cycle_number            datetime   latitude  \
0           0  b5906527            94 2025-01-01 23:10:46 -43.513000   
1           1  b5905521            96 2025-01-01 22:30:06  -8.248500   
2           2  b5906256           168 2025-01-01 22:23:17 -43.034000   
3           3  b1902259           164 2025-01-01 22:06:27 -33.786720   
4           4  b7900576           146 2025-01-01 21:39:20 -45.424212   

   longitude project_name data_mode platform_type institution  \
0   35.46800         ARGO         R       unknown     FR GDAC   
1  105.79380         ARGO         R       unknown     FR GDAC   
2  134.93100         ARGO         R       unknown     FR GDAC   
3   55.46375         ARGO         R       unknown     FR GDAC   
4   43.18347         ARGO         R       unknown     FR GDAC   

                         title      source  min_pressure  max_pressure  \
0  Argo float vertical profile

## profiles_df → Vector DB

This DataFrame has all the contextual metadata about each profile:

float_id, cycle_number, datetime, latitude, longitude,
project_name, data_mode, platform_type, institution, title,
source, measured_parameters, min_pressure, max_pressure


LLMs can use this to answer questions like:

“Which Argo floats were deployed by INCOIS in January 2025?”
“Find the float that measured deepest salinity profiles near equator.”

These don’t need precise numeric filtering — instead, they’re semantic / contextual.

## measurements_df → PostgreSQL

Contains the actual depth-wise numeric data:

profile_id, level, pressure, temperature, salinity


This is what your system queries when the user asks:

“What’s the temperature 30 m off the coast of Tamil Nadu?”
“Show me salinity at 50 dbar for profile 102.”

PostgreSQL can handle these fast and precisely using indexes on pressure, latitude, longitude.