In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# --- INSTALL DEPENDENCIES ---
!pip install sentence-transformers chromadb plotly xarray pandas pyarrow fastparquet --quiet

import os
import pandas as pd
import xarray as xr
from sentence_transformers import SentenceTransformer
import chromadb
import plotly.express as px

# --- STEP 1: SETUP PATHS ---
drive_base = "/content/drive/MyDrive/ColabNotebooks/SIH2025/Data"
filtered_dir = os.path.join(drive_base, "filtered_argo_data")
os.makedirs(filtered_dir, exist_ok=True)

# --- STEP 2: LOAD CATALOG (metadata for vector store) ---
catalog_path = os.path.join(drive_base, "argo_metadata_catalog.csv")
catalog = pd.read_csv(catalog_path)
print("📑 Catalog loaded:", catalog.shape)

# --- STEP 3: INITIALIZE VECTOR STORE ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
collection_name = "argo_metadata"

# Delete existing collection if any
try:
    client.delete_collection(name=collection_name)
except:
    pass

collection = client.create_collection(name=collection_name)

# --- STEP 4: INSERT METADATA EMBEDDINGS ---
docs, metas = [], []
for idx, row in catalog.iterrows():
    text = (
        f"File {row['file_path']} contains {row['num_rows']} measurements "
        f"from {row['year']}-{row['month']} "
        f"in region lat[{row['lat_min']}, {row['lat_max']}] "
        f"lon[{row['lon_min']}, {row['lon_max']}] "
        f"with depth range {row['depth_min']}–{row['depth_max']} meters."
    )
    docs.append(text)
    metas.append(row.to_dict())

# Generate embeddings and add to Chroma
embeddings = embedder.encode(docs).tolist()
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metas,
    ids=[str(i) for i in range(len(docs))]
)
print(f"✅ Inserted {len(docs)} metadata entries into ChromaDB.")

# --- STEP 5: HYBRID RETRIEVAL FUNCTION ---
# --- STEP 5: FIXED HYBRID RETRIEVE FUNCTION ---
def hybrid_retrieve(query_text, year=None, lat_min=None, lat_max=None,
                    lon_min=None, lon_max=None, depth_min=None, depth_max=None, top_k=5):
    # Semantic retrieval
    query_emb = embedder.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)

    if not results['metadatas'] or len(results['metadatas'][0]) == 0:
        return pd.DataFrame()

    matched_files = pd.DataFrame(results['metadatas'][0])

    # Only apply filters if the column exists
    if 'year' in matched_files.columns and year is not None:
        matched_files = matched_files[matched_files['year'] == year]
    if 'lat_max' in matched_files.columns and lat_min is not None:
        matched_files = matched_files[matched_files['lat_max'] >= lat_min]
    if 'lat_min' in matched_files.columns and lat_max is not None:
        matched_files = matched_files[matched_files['lat_min'] <= lat_max]
    if 'lon_max' in matched_files.columns and lon_min is not None:
        matched_files = matched_files[matched_files['lon_max'] >= lon_min]
    if 'lon_min' in matched_files.columns and lon_max is not None:
        matched_files = matched_files[matched_files['lon_min'] <= lon_max]
    if 'depth_max' in matched_files.columns and depth_min is not None:
        matched_files = matched_files[matched_files['depth_max'] >= depth_min]
    if 'depth_min' in matched_files.columns and depth_max is not None:
        matched_files = matched_files[matched_files['depth_min'] <= depth_max]

    return matched_files


# --- STEP 6: LAZY DATA LOADER ---
def load_file_lazy(file_path):
    try:
        if file_path.endswith(".nc"):
            ds = xr.open_dataset(file_path, decode_times=False)
            df = ds.to_dataframe().reset_index()
        elif file_path.endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            return pd.DataFrame()
        df["source_file"] = file_path
        return df
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return pd.DataFrame()

# --- STEP 7: QUERY + PLOT FUNCTION ---
def get_data_and_plot(query_text, year=None, lat_min=None, lat_max=None,
                      lon_min=None, lon_max=None, depth_min=None, depth_max=None,
                      var="TEMP", max_rows=5000):
    files = hybrid_retrieve(query_text, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max)
    if files.empty:
        print("⚠️ No matching files found for this query")
        return pd.DataFrame()

    print(f"📂 Loading {len(files)} matching files...")
    all_data = []
    for f in files['file_path']:
        if os.path.exists(f):
            df = load_file_lazy(f)
            if not df.empty:
                all_data.append(df)
    if not all_data:
        print("⚠️ No data loaded")
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)

    # Sample for plotting
    df_sample = df.sample(n=min(len(df), max_rows), random_state=42)

    # Plot variable
    if var in df_sample.columns and "DEPTH_M" in df_sample.columns:
        fig = px.scatter(df_sample, x="LONGITUDE", y="DEPTH_M", color=var,
                         color_continuous_scale="Viridis", height=600,
                         title=f"{var} vs DEPTH_M")
        fig.update_yaxes(autorange="reversed")
        fig.show()

    return df

# --- STEP 8: EXAMPLE QUERIES ---
# Query 1: Temperature profiles in 2023
df2023_temp = get_data_and_plot("temperature profiles", year=2023, lat_min=-20, lat_max=20, lon_min=30, lon_max=60, depth_min=0, depth_max=2000, var="TEMP")

# Query 2: Salinity profiles in 2022
df2022_psal = get_data_and_plot("salinity profiles", year=2022, lat_min=-20, lat_max=20, lon_min=30, lon_max=60, depth_min=0, depth_max=2000, var="PSAL")


📑 Catalog loaded: (36, 10)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Inserted 36 metadata entries into ChromaDB.
📂 Loading 1 matching files...


📂 Loading 3 matching files...


In [None]:
# --- INSTALL DEPENDENCIES ---
!pip install sentence-transformers chromadb plotly xarray pandas pyarrow fastparquet --quiet

import os
import pandas as pd
import xarray as xr
from sentence_transformers import SentenceTransformer
import chromadb
import plotly.express as px

# --- STEP 1: SETUP PATHS ---
drive_base = "/content/drive/MyDrive/ColabNotebooks/SIH2025/Data"
filtered_dir = os.path.join(drive_base, "filtered_argo_data")
os.makedirs(filtered_dir, exist_ok=True)

# --- STEP 2: LOAD CATALOG (metadata for vector store) ---
catalog_path = os.path.join(drive_base, "argo_metadata_catalog.csv")
catalog = pd.read_csv(catalog_path)
print("📑 Catalog loaded:", catalog.shape)

# --- STEP 3: INITIALIZE VECTOR STORE ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
collection_name = "argo_metadata"

# Delete existing collection if any
try:
    client.delete_collection(name=collection_name)
except:
    pass

collection = client.create_collection(name=collection_name)

# --- STEP 4: INSERT METADATA EMBEDDINGS ---
docs, metas = [], []
for idx, row in catalog.iterrows():
    text = (
        f"File {row['file_path']} contains {row['num_rows']} measurements "
        f"from {row['year']}-{row['month']} "
        f"in region lat[{row['lat_min']}, {row['lat_max']}] "
        f"lon[{row['lon_min']}, {row['lon_max']}] "
        f"with depth range {row['depth_min']}–{row['depth_max']} meters."
    )
    docs.append(text)
    metas.append(row.to_dict())

# Generate embeddings and add to Chroma
embeddings = embedder.encode(docs).tolist()
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metas,
    ids=[str(i) for i in range(len(docs))]
)
print(f"✅ Inserted {len(docs)} metadata entries into ChromaDB.")

# --- STEP 5: HYBRID RETRIEVAL FUNCTION ---
# --- STEP 5: FIXED HYBRID RETRIEVE FUNCTION ---
def hybrid_retrieve(query_text, year=None, lat_min=None, lat_max=None,
                    lon_min=None, lon_max=None, depth_min=None, depth_max=None, top_k=5):
    # Semantic retrieval
    query_emb = embedder.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)

    if not results['metadatas'] or len(results['metadatas'][0]) == 0:
        return pd.DataFrame()

    matched_files = pd.DataFrame(results['metadatas'][0])

    # Only apply filters if the column exists
    if 'year' in matched_files.columns and year is not None:
        matched_files = matched_files[matched_files['year'] == year]
    if 'lat_max' in matched_files.columns and lat_min is not None:
        matched_files = matched_files[matched_files['lat_max'] >= lat_min]
    if 'lat_min' in matched_files.columns and lat_max is not None:
        matched_files = matched_files[matched_files['lat_min'] <= lat_max]
    if 'lon_max' in matched_files.columns and lon_min is not None:
        matched_files = matched_files[matched_files['lon_max'] >= lon_min]
    if 'lon_min' in matched_files.columns and lon_max is not None:
        matched_files = matched_files[matched_files['lon_min'] <= lon_max]
    if 'depth_max' in matched_files.columns and depth_min is not None:
        matched_files = matched_files[matched_files['depth_max'] >= depth_min]
    if 'depth_min' in matched_files.columns and depth_max is not None:
        matched_files = matched_files[matched_files['depth_min'] <= depth_max]

    return matched_files


# --- STEP 6: LAZY DATA LOADER ---
def load_file_lazy(file_path):
    try:
        if file_path.endswith(".nc"):
            ds = xr.open_dataset(file_path, decode_times=False)
            df = ds.to_dataframe().reset_index()
        elif file_path.endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            return pd.DataFrame()
        df["source_file"] = file_path
        return df
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return pd.DataFrame()

# --- STEP 7: QUERY + PLOT FUNCTION ---
def get_data_and_plot(query_text, year=None, lat_min=None, lat_max=None,
                      lon_min=None, lon_max=None, depth_min=None, depth_max=None,
                      var="TEMP", max_rows=5000):
    files = hybrid_retrieve(query_text, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max)
    if files.empty:
        print("⚠️ No matching files found for this query")
        return pd.DataFrame()

    print(f"📂 Loading {len(files)} matching files...")
    all_data = []
    for f in files['file_path']:
        if os.path.exists(f):
            df = load_file_lazy(f)
            if not df.empty:
                all_data.append(df)
    if not all_data:
        print("⚠️ No data loaded")
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)

    # Sample for plotting
    df_sample = df.sample(n=min(len(df), max_rows), random_state=42)

    # Plot variable
    if var in df_sample.columns and "DEPTH_M" in df_sample.columns:
        fig = px.scatter(df_sample, x="LONGITUDE", y="DEPTH_M", color=var,
                         color_continuous_scale="Viridis", height=600,
                         title=f"{var} vs DEPTH_M")
        fig.update_yaxes(autorange="reversed")
        fig.show()

    return df

# --- STEP 8: EXAMPLE QUERIES ---
# Query 1: Temperature profiles in 2023
df2023_temp = get_data_and_plot("temperature profiles", year=2023, lat_min=-20, lat_max=20, lon_min=30, lon_max=60, depth_min=0, depth_max=2000, var="TEMP")

# Query 2: Salinity profiles in 2022
df2022_psal = get_data_and_plot("salinity profiles", year=2022, lat_min=-20, lat_max=20, lon_min=30, lon_max=60, depth_min=0, depth_max=2000, var="PSAL")


📑 Catalog loaded: (36, 10)
✅ Inserted 36 metadata entries into ChromaDB.
📂 Loading 1 matching files...


📂 Loading 3 matching files...


In [2]:
# --- STEP 9: CONVERSATIONAL LOOP ---

import re

def parse_query(query):
    """
    Basic query parser to extract:
    - variable (TEMP / PSAL)
    - year
    - latitude / longitude / depth ranges
    """
    query = query.lower()

    # Variable
    var = "TEMP" if "temp" in query or "temperature" in query else "PSAL" if "salinity" in query else "TEMP"

    # Year
    year_match = re.search(r"\b(20\d{2})\b", query)
    year = int(year_match.group(0)) if year_match else None

    # Depth
    depth_match = re.search(r"(\d{1,4})m", query)
    depth_min, depth_max = (0, int(depth_match.group(1))) if depth_match else (0, 2000)

    # Latitude / Longitude - simple placeholders
    lat_min, lat_max = -20, 20
    lon_min, lon_max = 30, 60

    return var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max

def conversational_loop():
    print("🌊 Argo Data Assistant is ready! Type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Parse query
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        # Retrieve & plot
        print(f"\n🔹 Processing your query: {user_input}")
        df = get_data_and_plot(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max,
            var=var
        )

        if not df.empty:
            print(f"📊 Retrieved {len(df)} rows (sampled for plotting).")
            # Summary
            if var in df.columns:
                mean_val = df[var].mean()
                min_val = df[var].min()
                max_val = df[var].max()
                print(f"📌 {var} summary -> mean: {mean_val:.2f}, min: {min_val:.2f}, max: {max_val:.2f}\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN THE CONVERSATIONAL LOOP ---
conversational_loop()


🌊 Argo Data Assistant is ready! Type 'exit' to quit.

You: temp 2023

🔹 Processing your query: temp 2023
📂 Loading 2 matching files...


📊 Retrieved 3508868 rows (sampled for plotting).
📌 TEMP summary -> mean: 9.75, min: 0.00, max: 58.63

You: salinity

🔹 Processing your query: salinity
📂 Loading 5 matching files...


📊 Retrieved 8182912 rows (sampled for plotting).
📌 PSAL summary -> mean: 34.33, min: 0.00, max: 131.75

You: exit
👋 Exiting Argo Data Assistant.


In [None]:
import os
import pandas as pd
import xarray as xr
import plotly.express as px
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb

# --- SETUP PATHS ---
drive_base = "/content/drive/MyDrive/ColabNotebooks/SIH2025/Data"
filtered_dir = os.path.join(drive_base, "filtered_argo_data")
os.makedirs(filtered_dir, exist_ok=True)

# --- LOAD CATALOG ---
catalog_path = os.path.join(drive_base, "argo_metadata_catalog.csv")
catalog = pd.read_csv(catalog_path)
print("📑 Catalog loaded:", catalog.shape)

# --- INITIALIZE VECTOR STORE ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
collection_name = "argo_metadata"

try:
    client.delete_collection(name=collection_name)
except:
    pass
collection = client.create_collection(name=collection_name)

# --- INSERT METADATA EMBEDDINGS ---
docs, metas = [], []
for idx, row in catalog.iterrows():
    text = (
        f"File {row['file_path']} contains {row['num_rows']} measurements "
        f"from {row['year']}-{row['month']} "
        f"in region lat[{row['lat_min']}, {row['lat_max']}] "
        f"lon[{row['lon_min']}, {row['lon_max']}] "
        f"with depth range {row['depth_min']}–{row['depth_max']} meters."
    )
    docs.append(text)
    metas.append(row.to_dict())

embeddings = embedder.encode(docs).tolist()
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metas,
    ids=[str(i) for i in range(len(docs))]
)
print(f"✅ Inserted {len(docs)} metadata entries into ChromaDB.")

# --- PLOTTING FUNCTION ---
def plot_variable(df, var="TEMP", depth_col="DEPTH_M", interactive=True):
    if df.empty or var not in df.columns or depth_col not in df.columns:
        print("⚠️ Cannot plot: missing data or columns")
        return

    if interactive:
        fig = px.scatter(df, x="LONGITUDE", y=depth_col, color=var,
                         color_continuous_scale="Viridis", height=600,
                         title=f"{var} vs {depth_col}")
        fig.update_yaxes(autorange="reversed")
        fig.show()
    else:
        import matplotlib.pyplot as plt
        plt.scatter(df["LONGITUDE"], df[depth_col], c=df[var], cmap="viridis", s=5)
        plt.gca().invert_yaxis()
        plt.xlabel("Longitude")
        plt.ylabel(depth_col)
        plt.title(f"{var} vs {depth_col}")
        plt.colorbar(label=var)
        plt.show()

# --- HYBRID RETRIEVAL ---
def hybrid_retrieve(query_text, year=None, lat_min=None, lat_max=None,
                    lon_min=None, lon_max=None, depth_min=None, depth_max=None, top_k=5):
    query_emb = embedder.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)
    matched_files = pd.DataFrame(results['metadatas'][0])

    # Apply filters safely
    if year is not None:
        matched_files = matched_files[matched_files['year'] == year]
    if lat_min is not None:
        matched_files = matched_files[matched_files['lat_max'] >= lat_min]
    if lat_max is not None:
        matched_files = matched_files[matched_files['lat_min'] <= lat_max]
    if lon_min is not None:
        matched_files = matched_files[matched_files['lon_max'] >= lon_min]
    if lon_max is not None:
        matched_files = matched_files[matched_files['lon_min'] <= lon_max]
    if depth_min is not None:
        matched_files = matched_files[matched_files['depth_max'] >= depth_min]
    if depth_max is not None:
        matched_files = matched_files[matched_files['depth_min'] <= depth_max]

    return matched_files

# --- LAZY FILE LOADER ---
def load_file_lazy(file_path):
    try:
        if file_path.endswith(".nc"):
            ds = xr.open_dataset(file_path, decode_times=False)
            df = ds.to_dataframe().reset_index()
        elif file_path.endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            return pd.DataFrame()
        df["source_file"] = file_path
        return df
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return pd.DataFrame()

# --- QUERY + PLOT FUNCTION ---
def get_data_for_query(query_text, year=None, lat_min=None, lat_max=None,
                       lon_min=None, lon_max=None, depth_min=None, depth_max=None,
                       var="TEMP", max_rows=5000):
    files = hybrid_retrieve(query_text, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max)
    if files.empty:
        print("⚠️ No matching files found for this query")
        return pd.DataFrame()

    all_data = []
    for f in files['file_path']:
        if os.path.exists(f):
            df = load_file_lazy(f)
            if not df.empty:
                all_data.append(df)
    if not all_data:
        print("⚠️ No data loaded")
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)
    return df

# --- AUTOMATED TREND INSIGHTS ---
def generate_insights(df, var, depth_col="DEPTH_M"):
    insights = []
    if df.empty or var not in df.columns or depth_col not in df.columns:
        return ["⚠️ No valid data to generate insights."]

    mean_val = df[var].mean()
    min_val = df[var].min()
    max_val = df[var].max()
    insights.append(f"📊 {var} summary: mean={mean_val:.2f}, min={min_val:.2f}, max={max_val:.2f}")

    correlation = np.corrcoef(df[depth_col], df[var])[0,1]
    if correlation > 0.1:
        insights.append(f"📈 {var} tends to increase with depth.")
    elif correlation < -0.1:
        insights.append(f"📉 {var} tends to decrease with depth.")
    else:
        insights.append(f"⚖️ {var} shows no strong trend with depth.")

    return insights

# --- CONVERSATIONAL LOOP ---
def conversational_loop_with_insights():
    print("🌊 Argo Data Assistant is ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Simple query parsing (example)
        # Extract variable and year from input like "temp vs depth 2023"
        parts = user_input.lower().split()
        var = "TEMP" if "temp" in parts else "PSAL" if "salinity" in parts else "TEMP"
        year = int([p for p in parts if p.isdigit()][0]) if any(p.isdigit() for p in parts) else None

        print(f"\n🔹 Processing your query: {user_input}")

        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=-20, lat_max=20,
            lon_min=30, lon_max=60,
            depth_min=0, depth_max=2000,
            var=var
        )

        if not df.empty:
            df_sample = df.sample(min(5000, len(df)), random_state=42)
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)
            insights = generate_insights(df, var, depth_col="DEPTH_M")
            for line in insights:
                print(line)
            print("\n✅ Query processed. You can enter a new query.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN LOOP ---
conversational_loop_with_insights()


📑 Catalog loaded: (36, 10)
✅ Inserted 36 metadata entries into ChromaDB.
🌊 Argo Data Assistant is ready! Type 'exit' to quit.

You: salinity 2022 depth 0-2000salinity 2022 depth 0-2000

🔹 Processing your query: salinity 2022 depth 0-2000salinity 2022 depth 0-2000


📊 PSAL summary: mean=34.21, min=0.00, max=176.95
⚖️ PSAL shows no strong trend with depth.

✅ Query processed. You can enter a new query.

You: Show me temperature profiles in 2023 between lat -20 and 20

🔹 Processing your query: Show me temperature profiles in 2023 between lat -20 and 20


📊 TEMP summary: mean=9.73, min=-1.43, max=58.63
⚖️ TEMP shows no strong trend with depth.

✅ Query processed. You can enter a new query.

You: exit
👋 Exiting Argo Data Assistant.


In [None]:
import numpy as np

# --- ADVANCED AUTOMATED INSIGHTS ---
def generate_advanced_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"):
    insights = []
    if df.empty:
        return ["⚠️ No data available for insights."]

    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()
        val_std = df[var].std()
        insights.append(f"📊 {var} stats: mean={val_mean:.2f}, min={val_min:.2f}, max={val_max:.2f}, std={val_std:.2f}")

        # Depth trend
        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"📈 {var} generally increases with depth.")
            elif corr < -0.1:
                insights.append(f"📉 {var} generally decreases with depth.")
            else:
                insights.append(f"⚖️ {var} shows no strong trend with depth.")

        # Extreme locations
        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(f"🏔 Highest {var}: {max_row[var]:.2f} at lat={max_row[lat_col]:.2f}, lon={max_row[lon_col]:.2f}, depth={max_row[depth_col]:.2f}")
            insights.append(f"🏞 Lowest {var}: {min_row[var]:.2f} at lat={min_row[lat_col]:.2f}, lon={min_row[lon_col]:.2f}, depth={min_row[depth_col]:.2f}")

    # Overall coverage
    insights.append(f"🌍 Data covers lat [{df[lat_col].min():.2f}, {df[lat_col].max():.2f}], lon [{df[lon_col].min():.2f}, {df[lon_col].max():.2f}], depth [{df[depth_col].min():.2f}, {df[depth_col].max():.2f}] meters.")

    return insights

# --- ENHANCED CONVERSATIONAL LOOP WITH ADVANCED INSIGHTS ---
def conversational_loop_advanced():
    print("🌊 Argo Data Assistant (Advanced) is ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Parse query
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")

        # Retrieve dataframe
        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max
        )

        if not df.empty:
            # Sample for faster plotting
            df_sample = df.sample(min(5000, len(df)), random_state=42)

            # Plot variable
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            # Generate advanced insights
            insights = generate_advanced_insights(df, variables=[var])
            print("\n📝 Insights:")
            for line in insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN LOOP ---
conversational_loop_advanced()


🌊 Argo Data Assistant (Advanced) is ready! Type 'exit' to quit.

You: salinity profile in 2023

🔹 Processing query: salinity profile in 2023



📝 Insights:
📊 PSAL stats: mean=34.44, min=0.00, max=131.75, std=3.31
⚖️ PSAL shows no strong trend with depth.
🏔 Highest PSAL: 131.75 at lat=2.17, lon=56.63, depth=nan
🏞 Lowest PSAL: 0.00 at lat=-32.50, lon=39.64, depth=0.00
🌍 Data covers lat [-39.98, 25.00], lon [30.10, 109.94], depth [-0.10, 5506.00] meters.

✅ Query processed. Enter another query or 'exit'.

You: exit
👋 Exiting Argo Data Assistant.


In [3]:
import numpy as np
import random

# --- FUNCTION: NATURAL LANGUAGE INSIGHTS ---
def generate_natural_language_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"):
    if df.empty:
        return ["⚠️ No data available for insights."]

    insights = []

    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()
        val_std = df[var].std()

        # Intro sentence
        insights.append(f"The average {var.lower()} in the selected region is {val_mean:.2f}, ranging from {val_min:.2f} to {val_max:.2f}.")

        # Depth trend
        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"{var} tends to increase with depth, suggesting deeper waters are generally higher in {var.lower()}.")
            elif corr < -0.1:
                insights.append(f"{var} tends to decrease with depth, indicating deeper waters are cooler/lower in {var.lower()}.")
            else:
                insights.append(f"There is no strong trend of {var} with depth in this dataset.")

        # Extreme values
        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(
                f"The highest {var.lower()} is {max_row[var]:.2f} observed at latitude {max_row[lat_col]:.2f}, longitude {max_row[lon_col]:.2f}, depth {max_row[depth_col]:.2f} meters."
            )
            insights.append(
                f"The lowest {var.lower()} is {min_row[var]:.2f} observed at latitude {min_row[lat_col]:.2f}, longitude {min_row[lon_col]:.2f}, depth {min_row[depth_col]:.2f} meters."
            )

    # Coverage sentence
    insights.append(f"Data covers latitudes from {df[lat_col].min():.2f} to {df[lat_col].max():.2f}, longitudes from {df[lon_col].min():.2f} to {df[lon_col].max():.2f}, and depths from {df[depth_col].min():.2f} to {df[depth_col].max():.2f} meters.")

    # Shuffle sentences slightly for natural feel
    random.shuffle(insights)
    return insights

# --- ENHANCED CONVERSATIONAL LOOP WITH NATURAL LANGUAGE ---
def conversational_loop_nl():
    print("🌊 Argo Data Assistant (Natural Language Insights) ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Parse query (you can reuse your existing parse_query function)
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")

        # Retrieve dataframe
        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max
        )

        if not df.empty:
            # Sample for faster plotting
            df_sample = df.sample(min(5000, len(df)), random_state=42)

            # Plot variable
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            # Generate natural language insights
            nl_insights = generate_natural_language_insights(df, variables=[var])
            print("\n📝 Natural Language Insights:")
            for line in nl_insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN THE CONVERSATIONAL LOOP ---
conversational_loop_nl()


🌊 Argo Data Assistant (Natural Language Insights) ready! Type 'exit' to quit.

You: salinity profile between 2023 and 2024

🔹 Processing query: salinity profile between 2023 and 2024


NameError: name 'get_data_for_query' is not defined

In [None]:
import plotly.express as px

# --- FUNCTION: MULTI-VARIABLE TREND INSIGHTS ---
def generate_multivariable_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M"):
    if df.empty:
        return ["⚠️ No data available for multi-variable insights."]

    insights = []

    # Pairwise correlations
    for var in variables:
        if var not in df.columns:
            continue

        # Correlation with depth
        corr = df[var].corr(df[depth_col])
        if corr > 0.1:
            insights.append(f"{var} generally increases with depth (correlation={corr:.2f}).")
        elif corr < -0.1:
            insights.append(f"{var} generally decreases with depth (correlation={corr:.2f}).")
        else:
            insights.append(f"{var} shows no clear trend with depth (correlation={corr:.2f}).")

    # Variable-to-variable correlations
    if len(variables) > 1:
        for i in range(len(variables)):
            for j in range(i+1, len(variables)):
                v1, v2 = variables[i], variables[j]
                if v1 in df.columns and v2 in df.columns:
                    corr_v = df[v1].corr(df[v2])
                    if corr_v > 0.1:
                        insights.append(f"{v1} and {v2} are positively correlated (corr={corr_v:.2f}).")
                    elif corr_v < -0.1:
                        insights.append(f"{v1} and {v2} are negatively correlated (corr={corr_v:.2f}).")
                    else:
                        insights.append(f"{v1} and {v2} show little correlation (corr={corr_v:.2f}).")

    return insights

# --- FUNCTION: PLOT MULTI-VARIABLE TRENDS ---
def plot_multivariable_trends(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M"):
    for var in variables:
        if var not in df.columns:
            continue
        fig = px.scatter(df, x=var, y=depth_col, color=var, color_continuous_scale="Viridis",
                         title=f"{var} vs {depth_col}", height=600)
        fig.update_yaxes(autorange="reversed")  # Depth increases downward
        fig.show()

    # Pairwise scatter plots
    if len(variables) > 1:
        for i in range(len(variables)):
            for j in range(i+1, len(variables)):
                v1, v2 = variables[i], variables[j]
                if v1 in df.columns and v2 in df.columns:
                    fig2 = px.scatter(df, x=v1, y=v2, color="DEPTH_M", color_continuous_scale="Viridis",
                                      title=f"{v1} vs {v2} colored by Depth", height=600)
                    fig2.show()

# --- ENHANCED CONVERSATIONAL LOOP WITH MULTI-VARIABLE INSIGHTS ---
def conversational_loop_multivariable():
    print("🌊 Argo Multi-Variable Assistant ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Multi-Variable Assistant.")
            break

        # Parse query
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)
        # Retrieve data
        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max
        )

        if not df.empty:
            df_sample = df.sample(min(5000, len(df)), random_state=42)

            # Plot single-variable trends
            plot_multivariable_trends(df_sample, variables=["TEMP", "PSAL"], depth_col="DEPTH_M")

            # Generate multi-variable insights
            insights = generate_multivariable_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M")
            print("\n📝 Multi-Variable Natural Language Insights:")
            for line in insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN THE LOOP ---
conversational_loop_multivariable()


🌊 Argo Multi-Variable Assistant ready! Type 'exit' to quit.

You: give temp data of 2024
⚠️ No matching files found for this query
⚠️ No data available for this query.

You: temp 2024
⚠️ No matching files found for this query
⚠️ No data available for this query.

You: depth 2024



📝 Multi-Variable Natural Language Insights:
TEMP generally decreases with depth (correlation=-0.81).
PSAL shows no clear trend with depth (correlation=0.01).
TEMP and PSAL show little correlation (corr=0.03).

✅ Query processed. Enter another query or 'exit'.

You: temp vs depth 2023



📝 Multi-Variable Natural Language Insights:
TEMP generally decreases with depth (correlation=-0.81).
PSAL shows no clear trend with depth (correlation=-0.01).
TEMP and PSAL show little correlation (corr=0.06).

✅ Query processed. Enter another query or 'exit'.

You: exit
👋 Exiting Argo Multi-Variable Assistant.


In [None]:
# --- INSTALL DEPENDENCIES ---
!pip install chromadb openai plotly pandas

# --- IMPORTS ---
import os
import json
import pandas as pd
import plotly.express as px
import chromadb
from chromadb.config import Settings
from openai import OpenAI

# --- API KEYS & ENDPOINTS ---
OPENROUTER_API_KEY = "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"  # 🔑 replace with your key
OPENROUTER_BASE = "https://openrouter.ai/api/v1"

# --- INIT OPENROUTER CLIENT ---
deepseek_client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url=OPENROUTER_BASE,
)

# --- INIT VECTOR DB (CHROMA) ---
chroma_client = chromadb.PersistentClient(path="chroma_store")
collection = chroma_client.get_or_create_collection("argo_data")

# --- LOAD ARGO DATA (EXAMPLE DATAFRAME) ---
# Replace with your parquet/CSV loader
df = pd.DataFrame({
    "year": [2023, 2023, 2024, 2024],
    "depth": [100, 500, 100, 500],
    "temperature": [15.2, 10.5, 15.6, 11.0],
    "salinity": [35.1, 34.7, 35.3, 34.9],
})

# --- INGEST DATA INTO VECTOR DB ---
for i, row in df.iterrows():
    doc = f"Year {row['year']}, Depth {row['depth']}m, Temp {row['temperature']}°C, Salinity {row['salinity']} PSU"
    collection.upsert(
        documents=[doc],
        ids=[f"row_{i}"],
        metadatas=[{
            "year": row["year"],
            "depth": row["depth"],
            "temperature": row["temperature"],
            "salinity": row["salinity"],
        }]
    )

# --- FUNCTION: RETRIEVE MATCHES ---
def retrieve_relevant_docs(query: str, n_results: int = 5):
    results = collection.query(query_texts=[query], n_results=n_results)
    return results["documents"][0] if "documents" in results else []

# --- FUNCTION: GENERATE SUMMARY WITH DEEPSEEK ---
def explain_data_nl(results, query):
    if not results:
        return "⚠️ No relevant data found."

    context = "\n".join(results)
    prompt = f"""
    You are an expert oceanographer.
    User asked: {query}
    Data context:
    {context}
    Provide a clear, concise summary.
    """

    resp = deepseek_client.chat.completions.create(
        model="deepseek/deepseek-r1-0528:free",
        messages=[{"role": "user", "content": prompt}],
    )

    return resp.choices[0].message.content

# --- FUNCTION: VISUALIZE DATA ---
def visualize_data(df, query):
    fig = px.scatter(
        df, x="depth", y="temperature", color="year",
        size="salinity", hover_data=["salinity"],
        title=f"Argo Data Visualization: {query}"
    )
    fig.show()

# --- MAIN LOOP ---
print("🌊 Welcome to the Argo Ocean Data Assistant (DeepSeek-powered) 🌊")
print("Type 'exit' to quit.\n")

while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("👋 Goodbye!")
        break

    print(f"\n🔹 Processing your query: {user_input}")

    matches = retrieve_relevant_docs(user_input)
    if not matches:
        print("⚠️ No matching files found for this query")
        continue

    # Visualization (using toy df here)
    visualize_data(df, user_input)

    # Natural Language Explanation
    explanation = explain_data_nl(matches, user_input)
    print("\n🤖 DeepSeek Summary:\n", explanation, "\n")


🌊 Welcome to the Argo Ocean Data Assistant (DeepSeek-powered) 🌊
Type 'exit' to quit.

You: salinity profile in 2024

🔹 Processing your query: salinity profile in 2024


KeyboardInterrupt: 

In [None]:
import numpy as np
import random
import plotly.express as px
import re

def get_data_for_query(query_text, year=None, lat_min=None, lat_max=None,
                       lon_min=None, lon_max=None, depth_min=None, depth_max=None):
    # Example: Load parquet subset
    df = pd.read_parquet("argo_data_subset.parquet")

    if year:
        df = df[df["YEAR"] == year]
    if depth_min is not None:
        df = df[df["DEPTH_M"] >= depth_min]
    if depth_max is not None:
        df = df[df["DEPTH_M"] <= depth_max]
    if lat_min is not None:
        df = df[df["LATITUDE"] >= lat_min]
    if lat_max is not None:
        df = df[df["LATITUDE"] <= lat_max]
    if lon_min is not None:
        df = df[df["LONGITUDE"] >= lon_min]
    if lon_max is not None:
        df = df[df["LONGITUDE"] <= lon_max]

    return df


# --- SIMPLE PARSER FUNCTION ---
def parse_query(query):
    """
    Extract variable, year, lat/lon/depth ranges from query text.
    Fallbacks are None if not found.
    """
    query = query.lower()

    # Variable detection
    if "salinity" in query or "psal" in query:
        var = "PSAL"
    elif "temperature" in query or "temp" in query:
        var = "TEMP"
    else:
        var = "TEMP"  # default

    # Year detection
    year_match = re.search(r"(19|20)\d{2}", query)
    year = int(year_match.group()) if year_match else None

    # Lat/Lon/Depth ranges (default None = full range)
    lat_min = lat_max = lon_min = lon_max = depth_min = depth_max = None

    return var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max

# --- FUNCTION: PLOT VARIABLE (accurate visualisation) ---
def plot_variable(df, var="TEMP", depth_col="DEPTH_M", interactive=True):
    if var not in df.columns or depth_col not in df.columns:
        print(f"⚠️ Variable {var} or depth column {depth_col} not found.")
        return

    if interactive:
        fig = px.scatter(
            df, x=var, y=depth_col, color=var,
            color_continuous_scale="Viridis",
            title=f"{var} vs Depth",
            height=600
        )
        fig.update_yaxes(autorange="reversed")  # Depth increases downward
        fig.show()
    else:
        df.plot(x=var, y=depth_col, kind="scatter")
        plt.gca().invert_yaxis()
        plt.title(f"{var} vs Depth")
        plt.show()

# --- FUNCTION: NATURAL LANGUAGE INSIGHTS ---
def generate_natural_language_insights(
    df, variables=["TEMP", "PSAL"],
    depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"
):
    if df.empty:
        return ["⚠️ No data available for insights."]

    insights = []

    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()

        insights.append(
            f"The average {var.lower()} is {val_mean:.2f}, ranging from {val_min:.2f} to {val_max:.2f}."
        )

        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"{var} tends to increase with depth (corr={corr:.2f}).")
            elif corr < -0.1:
                insights.append(f"{var} tends to decrease with depth (corr={corr:.2f}).")
            else:
                insights.append(f"No strong trend of {var} with depth (corr={corr:.2f}).")

        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(
                f"Highest {var.lower()} = {max_row[var]:.2f} at (lat={max_row[lat_col]:.2f}, lon={max_row[lon_col]:.2f}, depth={max_row[depth_col]:.2f}m)."
            )
            insights.append(
                f"Lowest {var.lower()} = {min_row[var]:.2f} at (lat={min_row[lat_col]:.2f}, lon={min_row[lon_col]:.2f}, depth={min_row[depth_col]:.2f}m)."
            )

    insights.append(
        f"Coverage: lat {df[lat_col].min():.2f}–{df[lat_col].max():.2f}, "
        f"lon {df[lon_col].min():.2f}–{df[lon_col].max():.2f}, "
        f"depth {df[depth_col].min():.2f}–{df[depth_col].max():.2f}m."
    )

    random.shuffle(insights)
    return insights

# --- CONVERSATIONAL LOOP ---
def conversational_loop():
    print("🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Parse query
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")

        # Retrieve dataframe (this depends on your existing data loader)
        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max
        )

        if not df.empty:
            df_sample = df.sample(min(5000, len(df)), random_state=42)

            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            nl_insights = generate_natural_language_insights(df, variables=[var])
            print("\n📝 Natural Language Insights:")
            for line in nl_insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN ---
conversational_loop()


🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.

You: depth 2023

🔹 Processing query: depth 2023


FileNotFoundError: [Errno 2] No such file or directory: 'argo_data_subset.parquet'

In [None]:
# --- INSTALL DEPENDENCIES ---
!pip install chromadb openai plotly pandas

# --- IMPORTS ---
import os
import pandas as pd
import plotly.express as px
import chromadb
from chromadb.config import Settings
from openai import OpenAI

# --- API KEYS & ENDPOINTS ---
OPENROUTER_API_KEY = "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"   # 🔑 replace with your key
OPENROUTER_BASE = "https://openrouter.ai/api/v1"

# --- INIT OPENROUTER CLIENT ---
deepseek_client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url=OPENROUTER_BASE,
)

# --- INIT VECTOR DB (CHROMA) ---
chroma_client = chromadb.PersistentClient(path="chroma_store")
collection = chroma_client.get_or_create_collection("argo_data")

# --- LOAD ARGO DATA (EXAMPLE DATAFRAME) ---
# Replace this with your parquet/CSV loader
df = pd.DataFrame({
    "year": [2023, 2023, 2024, 2024],
    "DEPTH_M": [100, 500, 100, 500],
    "TEMP": [15.2, 10.5, 15.6, 11.0],
    "PSAL": [35.1, 34.7, 35.3, 34.9],
})

# --- INGEST DATA INTO VECTOR DB ---
for i, row in df.iterrows():
    doc = f"Year {row['year']}, Depth {row['DEPTH_M']}m, Temp {row['TEMP']}°C, Salinity {row['PSAL']} PSU"
    collection.upsert(
        documents=[doc],
        ids=[f"row_{i}"],
        metadatas=[{
            "year": row["year"],
            "depth": row["DEPTH_M"],
            "temperature": row["TEMP"],
            "salinity": row["PSAL"],
        }]
    )

# --- FUNCTION: RETRIEVE MATCHES ---
def retrieve_relevant_docs(query: str, n_results: int = 5):
    results = collection.query(query_texts=[query], n_results=n_results)
    return results["documents"][0] if "documents" in results else []

# --- FUNCTION: GENERATE SUMMARY WITH DEEPSEEK ---
def explain_data_nl(results, query):
    if not results:
        return "⚠️ No relevant data found."

    context = "\n".join(results)
    prompt = f"""
    You are an expert oceanographer.
    User asked: {query}
    Data context:
    {context}
    Provide a clear, concise summary with insights.
    """

    resp = deepseek_client.chat.completions.create(
        model="deepseek/deepseek-r1-0528:free",
        messages=[{"role": "user", "content": prompt}],
    )

    return resp.choices[0].message.content

# --- FUNCTION: MULTI-VARIABLE INSIGHTS ---
def generate_multivariable_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M"):
    if df.empty:
        return ["⚠️ No data available for multi-variable insights."]

    insights = []

    for var in variables:
        if var not in df.columns:
            continue
        corr = df[var].corr(df[depth_col])
        if corr > 0.1:
            insights.append(f"{var} generally increases with depth (correlation={corr:.2f}).")
        elif corr < -0.1:
            insights.append(f"{var} generally decreases with depth (correlation={corr:.2f}).")
        else:
            insights.append(f"{var} shows no clear trend with depth (correlation={corr:.2f}).")

    if len(variables) > 1:
        for i in range(len(variables)):
            for j in range(i+1, len(variables)):
                v1, v2 = variables[i], variables[j]
                if v1 in df.columns and v2 in df.columns:
                    corr_v = df[v1].corr(df[v2])
                    if corr_v > 0.1:
                        insights.append(f"{v1} and {v2} are positively correlated (corr={corr_v:.2f}).")
                    elif corr_v < -0.1:
                        insights.append(f"{v1} and {v2} are negatively correlated (corr={corr_v:.2f}).")
                    else:
                        insights.append(f"{v1} and {v2} show little correlation (corr={corr_v:.2f}).")

    return insights

# --- FUNCTION: FIXED VISUALIZATIONS (ACCURATE) ---
def plot_multivariable_trends(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M"):
    for var in variables:
        if var not in df.columns:
            continue
        fig = px.line(df.sort_values(depth_col), x=depth_col, y=var, color="year",
                      markers=True, title=f"{var} Profile with Depth", height=600)
        fig.update_xaxes(title="Depth (m)", autorange="reversed")
        fig.update_yaxes(title=var)
        fig.show()

    if len(variables) > 1:
        for i in range(len(variables)):
            for j in range(i+1, len(variables)):
                v1, v2 = variables[i], variables[j]
                if v1 in df.columns and v2 in df.columns:
                    fig2 = px.scatter(df, x=v1, y=v2, color=depth_col,
                                      color_continuous_scale="Viridis",
                                      title=f"{v1} vs {v2} colored by Depth", height=600)
                    fig2.update_traces(marker=dict(size=10, opacity=0.7))
                    fig2.show()

# --- MAIN CONVERSATIONAL LOOP ---
print("🌊 Welcome to the Argo Ocean Data Assistant (DeepSeek + Accurate Visuals) 🌊")
print("Type 'exit' to quit.\n")

while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("👋 Goodbye!")
        break

    print(f"\n🔹 Processing your query: {user_input}")

    matches = retrieve_relevant_docs(user_input)
    if not matches:
        print("⚠️ No matching data found for this query")
        continue

    # Visualization (sampled for performance)
    df_sample = df.sample(min(5000, len(df)), random_state=42)
    plot_multivariable_trends(df_sample, variables=["TEMP", "PSAL"], depth_col="DEPTH_M")

    # Multi-variable insights
    insights = generate_multivariable_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M")
    print("\n📝 Multi-Variable Insights:")
    for line in insights:
        print(line)

    # Natural Language Explanation
    explanation = explain_data_nl(matches, user_input)
    print("\n🤖 DeepSeek Summary:\n", explanation, "\n")


🌊 Welcome to the Argo Ocean Data Assistant (DeepSeek + Accurate Visuals) 🌊
Type 'exit' to quit.

You: salinity profile in 2024

🔹 Processing your query: salinity profile in 2024






This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.





📝 Multi-Variable Insights:
TEMP generally decreases with depth (correlation=-1.00).
PSAL generally decreases with depth (correlation=-0.89).
TEMP and PSAL are positively correlated (corr=0.93).

🤖 DeepSeek Summary:
 ### Summary of Salinity Profile in 2024
Based on the provided data, the salinity measurements for 2024 are as follows:  
- **At 100m depth**: **35.3 PSU** (Practical Salinity Units)  
- **At 500m depth**: **34.9 PSU**  

#### Key Insights:  
1. **Vertical Salinity Gradient**:  
   - Salinity decreases with depth: **100m (35.3 PSU) → 500m (34.9 PSU)**.  
   - A difference of **0.4 PSU** suggests stratification, where fresher surface inputs (e.g., rainfall) mix with saltier subsurface waters.  

2. **Year-over-Year Changes (vs. 2023)**:  
   - **At 100m**: Salinity **increased** by **0.2 PSU** (from 35.1 PSU in 2023).  
   - **At 500m**: Salinity **increased** by **0.2 PSU** (from 34.7 PSU in 2023).  
   - **Trend**: Both depths show a **consistent salinity increase of 0.2 P

In [None]:
import numpy as np
import random
import pandas as pd
import plotly.express as px
import re
import os
import xarray as xr

# -----------------------
# --- HELPER FUNCTIONS ---
# -----------------------

def parse_query(query):
    """
    Extract variable, year, lat/lon/depth ranges from query text.
    Fallbacks are None if not found.
    """
    query = query.lower()

    # Variable detection
    if "salinity" in query or "psal" in query:
        var = "PSAL"
    elif "temperature" in query or "temp" in query:
        var = "TEMP"
    else:
        var = "TEMP"  # default

    # Year detection
    year_match = re.search(r"(19|20)\d{2}", query)
    year = int(year_match.group()) if year_match else None

    # Lat/Lon/Depth ranges (default None = full range)
    lat_min = lat_max = lon_min = lon_max = depth_min = depth_max = None

    return var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max


def plot_variable(df, var="TEMP", depth_col="DEPTH_M", interactive=True):
    if var not in df.columns or depth_col not in df.columns:
        print(f"⚠️ Variable {var} or depth column {depth_col} not found.")
        return

    if interactive:
        fig = px.scatter(
            df, x=var, y=depth_col, color=var,
            color_continuous_scale="Viridis",
            title=f"{var} vs Depth",
            height=600
        )
        fig.update_yaxes(autorange="reversed")  # Depth increases downward
        fig.show()
    else:
        df.plot(x=var, y=depth_col, kind="scatter")
        plt.gca().invert_yaxis()
        plt.title(f"{var} vs Depth")
        plt.show()


def generate_natural_language_insights(
    df, variables=["TEMP", "PSAL"],
    depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"
):
    if df.empty:
        return ["⚠️ No data available for insights."]

    insights = []

    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()

        insights.append(
            f"The average {var.lower()} is {val_mean:.2f}, ranging from {val_min:.2f} to {val_max:.2f}."
        )

        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"{var} tends to increase with depth (corr={corr:.2f}).")
            elif corr < -0.1:
                insights.append(f"{var} tends to decrease with depth (corr={corr:.2f}).")
            else:
                insights.append(f"No strong trend of {var} with depth (corr={corr:.2f}).")

        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(
                f"Highest {var.lower()} = {max_row[var]:.2f} at (lat={max_row[lat_col]:.2f}, lon={max_row[lon_col]:.2f}, depth={max_row[depth_col]:.2f}m)."
            )
            insights.append(
                f"Lowest {var.lower()} = {min_row[var]:.2f} at (lat={min_row[lat_col]:.2f}, lon={min_row[lon_col]:.2f}, depth={min_row[depth_col]:.2f}m)."
            )

    insights.append(
        f"Coverage: lat {df[lat_col].min():.2f}–{df[lat_col].max():.2f}, "
        f"lon {df[lon_col].min():.2f}–{df[lon_col].max():.2f}, "
        f"depth {df[depth_col].min():.2f}–{df[depth_col].max():.2f}m."
    )

    random.shuffle(insights)
    return insights


# -----------------------
# --- CONVERSATIONAL LOOP ---
# -----------------------

def conversational_loop_hybrid():
    print("🌊 Argo Data Assistant (Hybrid Retrieval + NL Insights) ready! Type 'exit' to quit.\n")

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        # Parse query
        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")

        # Retrieve data using your hybrid pipeline
        df = get_data_for_query(
            query_text=user_input,
            year=year,
            lat_min=lat_min,
            lat_max=lat_max,
            lon_min=lon_min,
            lon_max=lon_max,
            depth_min=depth_min,
            depth_max=depth_max
        )

        if not df.empty:
            # Sample for faster plotting
            df_sample = df.sample(min(5000, len(df)), random_state=42)

            # Plot
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            # Generate insights
            nl_insights = generate_natural_language_insights(df, variables=[var])
            print("\n📝 Natural Language Insights:")
            for line in nl_insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")

# --- RUN ---
conversational_loop_hybrid()


🌊 Argo Data Assistant (Hybrid Retrieval + NL Insights) ready! Type 'exit' to quit.

You: salinity 2023 lat=-10,10 lon=20,50 depth=0,2000

🔹 Processing query: salinity 2023 lat=-10,10 lon=20,50 depth=0,2000


FileNotFoundError: [Errno 2] No such file or directory: 'argo_data_subset.parquet'

In [None]:
import os
import random
import numpy as np
import pandas as pd
import xarray as xr
import plotly.express as px
from sentence_transformers import SentenceTransformer
import chromadb

# --- STEP 1: SETUP PATHS ---
drive_base = "/content/drive/MyDrive/ColabNotebooks/SIH2025/Data"
filtered_dir = os.path.join(drive_base, "filtered_argo_data")
os.makedirs(filtered_dir, exist_ok=True)

# --- STEP 2: LOAD CATALOG ---
catalog_path = os.path.join(drive_base, "argo_metadata_catalog.csv")
catalog = pd.read_csv(catalog_path)
print("📑 Catalog loaded:", catalog.shape)

# --- STEP 3: INITIALIZE VECTOR STORE ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
collection_name = "argo_metadata"

try:
    client.delete_collection(name=collection_name)
except:
    pass
collection = client.create_collection(name=collection_name)

# --- STEP 4: INSERT METADATA EMBEDDINGS ---
docs, metas = [], []
for idx, row in catalog.iterrows():
    text = (
        f"File {row['file_path']} contains {row['num_rows']} measurements "
        f"from {row['year']}-{row['month']} "
        f"in region lat[{row['lat_min']}, {row['lat_max']}] "
        f"lon[{row['lon_min']}, {row['lon_max']}] "
        f"with depth range {row['depth_min']}–{row['depth_max']} meters."
    )
    docs.append(text)
    metas.append(row.to_dict())

embeddings = embedder.encode(docs).tolist()
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metas,
    ids=[str(i) for i in range(len(docs))]
)
print(f"✅ Inserted {len(docs)} metadata entries into ChromaDB.")


# --- STEP 5: HYBRID RETRIEVE FUNCTION ---
def hybrid_retrieve(query_text, year=None, lat_min=None, lat_max=None,
                    lon_min=None, lon_max=None, depth_min=None, depth_max=None, top_k=5):
    query_emb = embedder.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)

    if not results['metadatas'] or len(results['metadatas'][0]) == 0:
        return pd.DataFrame()

    matched_files = pd.DataFrame(results['metadatas'][0])

    # Apply filters safely
    if 'year' in matched_files.columns and year is not None:
        matched_files = matched_files[matched_files['year'] == year]
    if 'lat_max' in matched_files.columns and lat_min is not None:
        matched_files = matched_files[matched_files['lat_max'] >= lat_min]
    if 'lat_min' in matched_files.columns and lat_max is not None:
        matched_files = matched_files[matched_files['lat_min'] <= lat_max]
    if 'lon_max' in matched_files.columns and lon_min is not None:
        matched_files = matched_files[matched_files['lon_max'] >= lon_min]
    if 'lon_min' in matched_files.columns and lon_max is not None:
        matched_files = matched_files[matched_files['lon_min'] <= lon_max]
    if 'depth_max' in matched_files.columns and depth_min is not None:
        matched_files = matched_files[matched_files['depth_max'] >= depth_min]
    if 'depth_min' in matched_files.columns and depth_max is not None:
        matched_files = matched_files[matched_files['depth_min'] <= depth_max]

    return matched_files


# --- STEP 6: LAZY DATA LOADER ---
def load_file_lazy(file_path):
    try:
        if file_path.endswith(".nc"):
            ds = xr.open_dataset(file_path, decode_times=False)
            df = ds.to_dataframe().reset_index()
        elif file_path.endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            return pd.DataFrame()
        df["source_file"] = file_path
        return df
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return pd.DataFrame()


# --- STEP 7: GET DATA FUNCTION ---
def get_data_for_query(query_text, year=None, lat_min=None, lat_max=None,
                      lon_min=None, lon_max=None, depth_min=None, depth_max=None,
                      var="TEMP", max_rows=5000):
    files = hybrid_retrieve(query_text, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max)
    if files.empty:
        print("⚠️ No matching files found for this query")
        return pd.DataFrame()

    print(f"📂 Loading {len(files)} matching files...")
    all_data = []
    for f in files['file_path']:
        if os.path.exists(f):
            df = load_file_lazy(f)
            if not df.empty:
                all_data.append(df)
    if not all_data:
        print("⚠️ No data loaded")
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)
    return df


# --- STEP 8: PLOT FUNCTION ---
def plot_variable(df, var="TEMP", depth_col="DEPTH_M", interactive=True):
    if var in df.columns and depth_col in df.columns:
        fig = px.scatter(
            df, x="LONGITUDE", y=depth_col, color=var,
            color_continuous_scale="Viridis", height=600,
            title=f"{var} vs {depth_col}"
        )
        fig.update_yaxes(autorange="reversed")
        if interactive:
            fig.show()


# --- STEP 9: NATURAL LANGUAGE INSIGHTS ---
def generate_natural_language_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"):
    if df.empty:
        return ["⚠️ No data available for insights."]

    insights = []
    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()
        val_std = df[var].std()

        insights.append(f"The average {var.lower()} is {val_mean:.2f}, ranging {val_min:.2f}–{val_max:.2f}.")

        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"{var} increases with depth → deeper waters higher in {var.lower()}.")
            elif corr < -0.1:
                insights.append(f"{var} decreases with depth → deeper waters lower in {var.lower()}.")
            else:
                insights.append(f"No strong trend of {var} with depth.")

        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(f"Highest {var.lower()} {max_row[var]:.2f} at ({max_row[lat_col]:.2f}, {max_row[lon_col]:.2f}), depth {max_row[depth_col]:.2f}m.")
            insights.append(f"Lowest {var.lower()} {min_row[var]:.2f} at ({min_row[lat_col]:.2f}, {min_row[lon_col]:.2f}), depth {min_row[depth_col]:.2f}m.")

    insights.append(f"Coverage: lat {df[lat_col].min():.2f}–{df[lat_col].max():.2f}, lon {df[lon_col].min():.2f}–{df[lon_col].max():.2f}, depth {df[depth_col].min():.2f}–{df[depth_col].max():.2f}m.")
    random.shuffle(insights)
    return insights


# --- STEP 10: QUERY PARSER (SIMPLE) ---
def parse_query(user_input):
    var = "TEMP" if "temp" in user_input.lower() else "PSAL" if "sal" in user_input.lower() else "TEMP"
    year = None
    for tok in user_input.split():
        if tok.isdigit() and len(tok) == 4:
            year = int(tok)
    return var, year, None, None, None, None, None, None


# --- STEP 11: CONVERSATIONAL LOOP ---
def conversational_loop_nl():
    print("🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")
        df = get_data_for_query(user_input, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max, var=var)

        if not df.empty:
            df_sample = df.sample(min(5000, len(df)), random_state=42)
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            nl_insights = generate_natural_language_insights(df, variables=[var])
            print("\n📝 Natural Language Insights:")
            for line in nl_insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")


# --- RUN LOOP ---
conversational_loop_nl()


📑 Catalog loaded: (36, 10)
✅ Inserted 36 metadata entries into ChromaDB.
🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.


🔹 Processing query: salinity in bay of bengal 2023
📂 Loading 2 matching files...



📝 Natural Language Insights:
Highest psal 131.75 at (2.17, 56.63), depth nanm.
The average psal is 34.42, ranging 0.00–131.75.
Lowest psal 0.00 at (-32.50, 39.64), depth 0.00m.
No strong trend of PSAL with depth.
Coverage: lat -39.98–24.64, lon 30.16–109.94, depth -0.10–5331.60m.

✅ Query processed. Enter another query or 'exit'.


🔹 Processing query: depth 2023
📂 Loading 1 matching files...



📝 Natural Language Insights:
Coverage: lat -39.98–24.64, lon 30.18–109.94, depth 0.00–5005.10m.
The average temp is 9.81, ranging 0.00–58.63.
Highest temp 58.63 at (-11.67, 57.06), depth 41.20m.
No strong trend of TEMP with depth.
Lowest temp 0.00 at (-32.50, 39.64), depth 0.00m.

✅ Query processed. Enter another query or 'exit'.


🔹 Processing query: compare salinity at surface and 1000m depth 2023
📂 Loading 3 matching files...



📝 Natural Language Insights:
No strong trend of PSAL with depth.
Lowest psal 0.00 at (-33.20, 38.02), depth 0.00m.
Coverage: lat -39.98–25.40, lon 30.01–109.94, depth -0.20–5331.60m.
Highest psal 131.75 at (2.17, 56.63), depth nanm.
The average psal is 34.43, ranging 0.00–131.75.

✅ Query processed. Enter another query or 'exit'.


🔹 Processing query: temp vs salinity
📂 Loading 5 matching files...



📝 Natural Language Insights:
Lowest temp -4.10 at (-12.14, 44.42), depth nanm.
Highest temp 61.17 at (-5.54, 90.80), depth -437.10m.
No strong trend of TEMP with depth.
The average temp is 9.47, ranging -4.10–61.17.
Coverage: lat -39.99–26.76, lon 30.04–109.89, depth -437.10–5506.00m.

✅ Query processed. Enter another query or 'exit'.


🔹 Processing query: temp
📂 Loading 5 matching files...



📝 Natural Language Insights:
Highest temp 61.17 at (-5.54, 90.80), depth -437.10m.
Lowest temp -4.10 at (-12.14, 44.42), depth nanm.
Coverage: lat -39.99–26.76, lon 30.03–109.93, depth -437.10–5506.00m.
The average temp is 9.59, ranging -4.10–61.17.
No strong trend of TEMP with depth.

✅ Query processed. Enter another query or 'exit'.

You: exit


In [None]:
import os
import random
import numpy as np
import pandas as pd
import xarray as xr
import plotly.express as px
from sentence_transformers import SentenceTransformer
import chromadb

# --- STEP 1: SETUP PATHS ---
drive_base = "/content/drive/MyDrive/ColabNotebooks/SIH2025/Data"
filtered_dir = os.path.join(drive_base, "filtered_argo_data")
os.makedirs(filtered_dir, exist_ok=True)

# --- STEP 2: LOAD CATALOG ---
catalog_path = os.path.join(drive_base, "argo_metadata_catalog.csv")
catalog = pd.read_csv(catalog_path)
print("📑 Catalog loaded:", catalog.shape)

# --- STEP 3: INITIALIZE VECTOR STORE ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.Client()
collection_name = "argo_metadata"

try:
    client.delete_collection(name=collection_name)
except:
    pass
collection = client.create_collection(name=collection_name)

# --- STEP 4: INSERT METADATA EMBEDDINGS ---
docs, metas = [], []
for idx, row in catalog.iterrows():
    text = (
        f"File {row['file_path']} contains {row['num_rows']} measurements "
        f"from {row['year']}-{row['month']} "
        f"in region lat[{row['lat_min']}, {row['lat_max']}] "
        f"lon[{row['lon_min']}, {row['lon_max']}] "
        f"with depth range {row['depth_min']}–{row['depth_max']} meters."
    )
    docs.append(text)
    metas.append(row.to_dict())

embeddings = embedder.encode(docs).tolist()
collection.add(
    documents=docs,
    embeddings=embeddings,
    metadatas=metas,
    ids=[str(i) for i in range(len(docs))]
)
print(f"✅ Inserted {len(docs)} metadata entries into ChromaDB.")


# --- STEP 5: HYBRID RETRIEVE FUNCTION ---
def hybrid_retrieve(query_text, year=None, lat_min=None, lat_max=None,
                    lon_min=None, lon_max=None, depth_min=None, depth_max=None, top_k=5):
    query_emb = embedder.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)

    if not results['metadatas'] or len(results['metadatas'][0]) == 0:
        return pd.DataFrame()

    matched_files = pd.DataFrame(results['metadatas'][0])

    # Apply filters safely
    if 'year' in matched_files.columns and year is not None:
        matched_files = matched_files[matched_files['year'] == year]
    if 'lat_max' in matched_files.columns and lat_min is not None:
        matched_files = matched_files[matched_files['lat_max'] >= lat_min]
    if 'lat_min' in matched_files.columns and lat_max is not None:
        matched_files = matched_files[matched_files['lat_min'] <= lat_max]
    if 'lon_max' in matched_files.columns and lon_min is not None:
        matched_files = matched_files[matched_files['lon_max'] >= lon_min]
    if 'lon_min' in matched_files.columns and lon_max is not None:
        matched_files = matched_files[matched_files['lon_min'] <= lon_max]
    if 'depth_max' in matched_files.columns and depth_min is not None:
        matched_files = matched_files[matched_files['depth_max'] >= depth_min]
    if 'depth_min' in matched_files.columns and depth_max is not None:
        matched_files = matched_files[matched_files['depth_min'] <= depth_max]

    return matched_files


# --- STEP 6: LAZY DATA LOADER ---
def load_file_lazy(file_path):
    try:
        if file_path.endswith(".nc"):
            ds = xr.open_dataset(file_path, decode_times=False)
            df = ds.to_dataframe().reset_index()
        elif file_path.endswith(".parquet"):
            df = pd.read_parquet(file_path)
        else:
            return pd.DataFrame()
        df["source_file"] = file_path
        return df
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return pd.DataFrame()


# --- STEP 7: GET DATA FUNCTION ---
def get_data_for_query(query_text, year=None, lat_min=None, lat_max=None,
                      lon_min=None, lon_max=None, depth_min=None, depth_max=None,
                      var="TEMP", max_rows=5000):
    files = hybrid_retrieve(query_text, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max)
    if files.empty:
        print("⚠️ No matching files found for this query")
        return pd.DataFrame()

    print(f"📂 Loading {len(files)} matching files...")
    all_data = []
    for f in files['file_path']:
        if os.path.exists(f):
            df = load_file_lazy(f)
            if not df.empty:
                all_data.append(df)
    if not all_data:
        print("⚠️ No data loaded")
        return pd.DataFrame()

    df = pd.concat(all_data, ignore_index=True)
    return df


# --- STEP 8: PLOT FUNCTION ---
def plot_variable(df, var="TEMP", depth_col="DEPTH_M", interactive=True):
    if var in df.columns and depth_col in df.columns:
        fig = px.scatter(
            df, x="LONGITUDE", y=depth_col, color=var,
            color_continuous_scale="Viridis", height=600,
            title=f"{var} vs {depth_col}"
        )
        fig.update_yaxes(autorange="reversed")
        if interactive:
            fig.show()


# --- STEP 9: NATURAL LANGUAGE INSIGHTS ---
def generate_natural_language_insights(df, variables=["TEMP", "PSAL"], depth_col="DEPTH_M", lat_col="LATITUDE", lon_col="LONGITUDE"):
    if df.empty:
        return ["⚠️ No data available for insights."]

    insights = []
    for var in variables:
        if var not in df.columns:
            continue

        val_mean = df[var].mean()
        val_min = df[var].min()
        val_max = df[var].max()
        val_std = df[var].std()

        insights.append(f"The average {var.lower()} is {val_mean:.2f}, ranging {val_min:.2f}–{val_max:.2f}.")

        if depth_col in df.columns:
            corr = np.corrcoef(df[depth_col], df[var])[0,1]
            if corr > 0.1:
                insights.append(f"{var} increases with depth → deeper waters higher in {var.lower()}.")
            elif corr < -0.1:
                insights.append(f"{var} decreases with depth → deeper waters lower in {var.lower()}.")
            else:
                insights.append(f"No strong trend of {var} with depth.")

        if lat_col in df.columns and lon_col in df.columns:
            max_row = df.loc[df[var].idxmax()]
            min_row = df.loc[df[var].idxmin()]
            insights.append(f"Highest {var.lower()} {max_row[var]:.2f} at ({max_row[lat_col]:.2f}, {max_row[lon_col]:.2f}), depth {max_row[depth_col]:.2f}m.")
            insights.append(f"Lowest {var.lower()} {min_row[var]:.2f} at ({min_row[lat_col]:.2f}, {min_row[lon_col]:.2f}), depth {min_row[depth_col]:.2f}m.")

    insights.append(f"Coverage: lat {df[lat_col].min():.2f}–{df[lat_col].max():.2f}, lon {df[lon_col].min():.2f}–{df[lon_col].max():.2f}, depth {df[depth_col].min():.2f}–{df[depth_col].max():.2f}m.")
    random.shuffle(insights)
    return insights


# --- STEP 10: QUERY PARSER (SIMPLE) ---
def parse_query(user_input):
    var = "TEMP" if "temp" in user_input.lower() else "PSAL" if "sal" in user_input.lower() else "TEMP"
    year = None
    for tok in user_input.split():
        if tok.isdigit() and len(tok) == 4:
            year = int(tok)
    return var, year, None, None, None, None, None, None


# --- STEP 11: CONVERSATIONAL LOOP ---
def conversational_loop_nl():
    print("🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("👋 Exiting Argo Data Assistant.")
            break

        var, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max = parse_query(user_input)

        print(f"\n🔹 Processing query: {user_input}")
        df = get_data_for_query(user_input, year, lat_min, lat_max, lon_min, lon_max, depth_min, depth_max, var=var)

        if not df.empty:
            df_sample = df.sample(min(5000, len(df)), random_state=42)
            plot_variable(df_sample, var=var, depth_col="DEPTH_M", interactive=True)

            nl_insights = generate_natural_language_insights(df, variables=[var])
            print("\n📝 Natural Language Insights:")
            for line in nl_insights:
                print(line)
            print("\n✅ Query processed. Enter another query or 'exit'.\n")
        else:
            print("⚠️ No data available for this query.\n")


# --- RUN LOOP ---
conversational_loop_nl()


📑 Catalog loaded: (36, 10)
✅ Inserted 36 metadata entries into ChromaDB.
🌊 Argo Data Assistant (Accurate Viz + Natural Language Insights) ready! Type 'exit' to quit.


🔹 Processing query: salinity 2022
📂 Loading 2 matching files...



📝 Natural Language Insights:
Highest psal 92.34 at (-8.66, 102.19), depth 221.93m.
Coverage: lat -39.95–25.10, lon 30.39–109.97, depth -437.20–4598.30m.
Lowest psal 0.00 at (-9.07, 102.19), depth 51.70m.
The average psal is 34.26, ranging 0.00–92.34.
No strong trend of PSAL with depth.

✅ Query processed. Enter another query or 'exit'.

You: exit
