In [2]:
!pip install pydantic
!pip install netCDF4
!pip install xarray

from netCDF4 import Dataset
from pydantic import BaseModel, Field
from typing import List, Optional
import os
import xarray as xr

# Display when done
print('Libraries imported')

Collecting netCDF4
  Downloading netCDF4-1.7.1.post2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.1.post2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.4 netCDF4-1.7.1.post2
Libraries imported


In [3]:
class NetCDFMetadata(BaseModel):
    dimensions: dict = Field(..., description="Dimensions of the NetCDF file.")
    variables: List[str] = Field(..., description="Variables available in the NetCDF file.")
    attributes: dict = Field(..., description="Global attributes of the NetCDF file.")
    file_name: str = Field(..., description="The name of the NetCDF file.")

# Display when done
print('NetCDFMetadata model created')

NetCDFMetadata model created


In [4]:
def extract_netcdf_metadata(file_path: str) -> NetCDFMetadata:
    with Dataset(file_path, 'r') as nc:
        dimensions = {dim: len(nc.dimensions[dim]) for dim in nc.dimensions}
        variables = list(nc.variables.keys())
        attributes = {attr: nc.getncattr(attr) for attr in nc.ncattrs()}
        file_name = os.path.basename(file_path)
        return NetCDFMetadata(
            dimensions=dimensions,
            variables=variables,
            attributes=attributes,
            file_name=file_name
        )

# Display when done
print('Metadata extraction function created')

Metadata extraction function created


In [5]:
# List your NetCDF files
netcdf_files = ['/content/gom_t008.nc']

# Extract metadata for each file
all_metadata = [extract_netcdf_metadata(f) for f in netcdf_files]

# Display the extracted metadata
for metadata in all_metadata:
    print(metadata)

# Display when done
print('Metadata extraction completed')

dimensions={'lat': 346, 'lon': 541, 'depth': 40, 'time': 1} variables=['time', 'tau', 'depth', 'lat', 'lon', 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el'] attributes={'classification_level': 'UNCLASSIFIED', 'distribution_statement': 'Approved for public release; distribution unlimited.', 'downgrade_date': 'not applicable', 'classification_authority': 'not applicable', 'institution': 'Naval Oceanographic Office', 'source': 'HYCOM archive file', 'history': 'archv2ncdf3z', 'field_type': 'instantaneous', 'Conventions': 'CF-1.6 NAVO_netcdf_v1.1'} file_name='gom_t008.nc'
Metadata extraction completed


In [14]:
#!pip install llama-index
#!pip install chromadb
#!pip install openai
#!pip install llama-index-vector-stores-chroma

from llama_index.core import StorageContext, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import os
from llama_index.llms.openai import OpenAI

os.environ['OPENAI_API_KEY'] = 'sk-RlyK9UahfWCbcQJcz7iEHqkxvCNj9vVHREOhSrRuS0T3BlbkFJIMns3sTSMPRlgx4f1x-k-XGkmmXO3EQFSejYlcD2YA'

# Set up LlamaIndex Settings
Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.1)
Settings.embed_model = OpenAIEmbedding()

# Chroma settings
chroma_path = './chroma_db'
chroma_collection_name = 'chrm'

# Display when done
print('LlamaIndex components loaded')


LlamaIndex components loaded


In [15]:
# Load or create vector store
if os.path.exists(chroma_path):
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_vector_store(vector_store)
    print('Vector store loaded')
else:
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    # Ensure documents are properly formatted
    documents = [{'text': doc.text, 'metadata': doc.metadata} for doc in all_metadata]

    # Create the index
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    print('Vector store created')

Vector store loaded


In [16]:
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo

# Prepare metadata schema
all_metadata_info = []
for field_name, field_info in NetCDFMetadata.__fields__.items():
    all_metadata_info.append(
        MetadataInfo(
            name=field_name,
            type=str(field_info.annotation),
            description=field_info.description,
        )
    )

vector_store_info = VectorStoreInfo(
    content_info="list of NetCDF files metadata",
    metadata_info=all_metadata_info,
)

retriever = VectorIndexAutoRetriever(index, vector_store_info, verbose=True)
print('Metadata schema prepared')

Metadata schema prepared


In [18]:
import time
from llama_index.core.query_engine import RetrieverQueryEngine

def retry_request(func, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return func()
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            time.sleep(delay)
    raise RuntimeError("All retry attempts failed.")

# Set up the query engine
query_engine = RetrieverQueryEngine.from_args(retriever=retriever, streaming=True)

def make_query():
    return query_engine.query('What variables are in the first NetCDF file?')

# Retry the query request
try:
    resp = retry_request(make_query)
    for token in resp.response_gen:
        print(token, end="")
except Exception as e:
    print(f"Query failed: {e}")

print('Query executed')

Using query str: variables in the first NetCDF file
Using filters: []
Empty ResponseQuery executed


In [19]:
import netCDF4

# Check variables in the first NetCDF file
file_path = '/content/gom_t008.nc'
dataset = netCDF4.Dataset(file_path, 'r')

print("Variables in the NetCDF file:")
for var in dataset.variables:
    print(var)


Variables in the NetCDF file:
time
tau
depth
lat
lon
water_u
water_v
water_temp
salinity
surf_el


In [21]:
import netCDF4 as nc

def extract_metadata(netcdf_file):
    dataset = nc.Dataset(netcdf_file)
    metadata = {
        'variables': list(dataset.variables.keys()),
        'dimensions': list(dataset.dimensions.keys()),
        'attributes': {attr: getattr(dataset, attr) for attr in dataset.ncattrs()}
    }
    dataset.close()
    return metadata

# Example of extracting metadata
metadata = extract_metadata('/content/gom_t008.nc')
print(metadata)

{'variables': ['time', 'tau', 'depth', 'lat', 'lon', 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el'], 'dimensions': ['lat', 'lon', 'depth', 'time'], 'attributes': {'classification_level': 'UNCLASSIFIED', 'distribution_statement': 'Approved for public release; distribution unlimited.', 'downgrade_date': 'not applicable', 'classification_authority': 'not applicable', 'institution': 'Naval Oceanographic Office', 'source': 'HYCOM archive file', 'history': 'archv2ncdf3z', 'field_type': 'instantaneous', 'Conventions': 'CF-1.6 NAVO_netcdf_v1.1'}}


In [22]:
import json

# Store metadata as JSON
metadata_json = json.dumps(metadata, indent=4)
with open('metadata.json', 'w') as f:
    f.write(metadata_json)

In [25]:
!pip install langchain openai langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Downloading langchain_community-0.2.16-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_community
Successfully installed langchain_community-0.2.16


In [27]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Initialize OpenAI chat-based LLM (using gpt-3.5-turbo)
chat_llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Create a prompt template to query the metadata
template = """
You are an assistant that helps users explore metadata of NetCDF files. Below is the metadata:

{metadata}

Answer the following question: {question}
"""

prompt = PromptTemplate(
    input_variables=["metadata", "question"],
    template=template
)

# Create a chain to query the metadata
metadata_chain = LLMChain(llm=chat_llm, prompt=prompt)

# Example of querying metadata
query = "What are the variables in the NetCDF file?"
response = metadata_chain.run({
    "metadata": metadata_json,
    "question": query
})

print(response)

  chat_llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")


The variables in the NetCDF file are: 
1. time
2. tau
3. depth
4. lat
5. lon
6. water_u
7. water_v
8. water_temp
9. salinity
10. surf_el


In [28]:
# List of queries you want to test
queries = [
    "What are the variables in the NetCDF file?",
    "What is the time range of the data in the NetCDF file?",
    "What is the spatial resolution of the data in the file?",
    "Are there any missing values in the NetCDF file?",
    "What is the depth range covered in this dataset?"
]

# Loop through each question and query the metadata
for query in queries:
    response = metadata_chain.run({
        "metadata": metadata_json,
        "question": query
    })

    print(f"Question: {query}")
    print(f"Response: {response}\n")

Question: What are the variables in the NetCDF file?
Response: The variables in the NetCDF file are:
1. time
2. tau
3. depth
4. lat
5. lon
6. water_u
7. water_v
8. water_temp
9. salinity
10. surf_el

Question: What is the time range of the data in the NetCDF file?
Response: The time variable in the NetCDF file represents the time dimension. Without specific values provided, we cannot determine the exact time range of the data in the file.

Question: What is the spatial resolution of the data in the file?
Response: The spatial resolution of the data in the file can be determined by looking at the dimensions "lat" and "lon". In this case, the dimensions are lat and lon, which typically represent latitude and longitude coordinates. The resolution of the data would depend on the spacing between the latitude and longitude values in the dataset. Unfortunately, the specific values for the latitude and longitude spacing are not provided in the metadata, so the exact spatial resolution cannot b

In [29]:
import netCDF4 as nc

# Function to extract metadata from the NetCDF file
def extract_metadata(nc_file):
    # Open the NetCDF file
    dataset = nc.Dataset(nc_file)

    # Extract basic information
    variables = list(dataset.variables.keys())
    time_range = None
    if 'time' in dataset.variables:
        time_range = (dataset.variables['time'][:].min(), dataset.variables['time'][:].max())

    latitudes = dataset.variables['lat'][:] if 'lat' in dataset.variables else None
    longitudes = dataset.variables['lon'][:] if 'lon' in dataset.variables else None
    depth_range = None
    if 'depth' in dataset.variables:
        depth_range = (dataset.variables['depth'][:].min(), dataset.variables['depth'][:].max())

    # Spatial resolution (assuming lat and lon are 1D arrays)
    lat_resolution = latitudes[1] - latitudes[0] if latitudes is not None and len(latitudes) > 1 else None
    lon_resolution = longitudes[1] - longitudes[0] if longitudes is not None and len(longitudes) > 1 else None

    # Check for missing values in each variable
    missing_value_info = {}
    for var in variables:
        if hasattr(dataset.variables[var], '_FillValue'):
            missing_value_info[var] = dataset.variables[var].__dict__.get('_FillValue', None)

    # Create a metadata dictionary
    metadata = {
        "variables": variables,
        "time_range": time_range,
        "lat_resolution": lat_resolution,
        "lon_resolution": lon_resolution,
        "depth_range": depth_range,
        "missing_value_info": missing_value_info
    }

    return metadata

# Function to handle specific queries based on extracted metadata
def handle_query(query, metadata):
    if "variables" in query.lower():
        return f"The variables in the NetCDF file are: {', '.join(metadata['variables'])}"

    elif "time range" in query.lower():
        if metadata['time_range']:
            return f"The time range of the data is from {metadata['time_range'][0]} to {metadata['time_range'][1]}."
        else:
            return "No time data available."

    elif "spatial resolution" in query.lower():
        if metadata['lat_resolution'] and metadata['lon_resolution']:
            return f"The spatial resolution is approximately {metadata['lat_resolution']} degrees in latitude and {metadata['lon_resolution']} degrees in longitude."
        else:
            return "No spatial resolution data available."

    elif "missing values" in query.lower():
        if metadata['missing_value_info']:
            missing_info = ', '.join([f"{var}: {val}" for var, val in metadata['missing_value_info'].items()])
            return f"The following variables have missing values: {missing_info}."
        else:
            return "There are no missing values in the variables."

    elif "depth range" in query.lower():
        if metadata['depth_range']:
            return f"The depth range in this dataset is from {metadata['depth_range'][0]} to {metadata['depth_range'][1]} meters."
        else:
            return "No depth data available."

    else:
        return "Query not recognized or supported."

# Example of running multiple queries on the metadata
def run_queries(nc_file, queries):
    # Extract metadata from the NetCDF file
    metadata = extract_metadata(nc_file)

    # Handle each query and print the response
    for query in queries:
        print(f"Question: {query}")
        response = handle_query(query, metadata)
        print(f"Response: {response}\n")

# Define your queries
queries = [
    "What are the variables in the NetCDF file?",
    "What is the time range of the data in the NetCDF file?",
    "What is the spatial resolution of the data in the file?",
    "Are there any missing values in the NetCDF file?",
    "What is the depth range covered in this dataset?"
]

# Path to your NetCDF file
nc_file = '/content/gom_t008.nc'

# Run the queries
run_queries(nc_file, queries)

Question: What are the variables in the NetCDF file?
Response: The variables in the NetCDF file are: time, tau, depth, lat, lon, water_u, water_v, water_temp, salinity, surf_el

Question: What is the time range of the data in the NetCDF file?
Response: The time range of the data is from 192884.00000000006 to 192884.00000000006.

Question: What is the spatial resolution of the data in the file?
Response: The spatial resolution is approximately 0.03999900817871094 degrees in latitude and 0.03997802734375 degrees in longitude.

Question: Are there any missing values in the NetCDF file?
Response: The following variables have missing values: water_u: -30000, water_v: -30000, water_temp: -30000, salinity: -30000, surf_el: -30000.

Question: What is the depth range covered in this dataset?
Response: The depth range in this dataset is from 0.0 to 5000.0 meters.

