# 01 — Compare Datasets

Load the Vizag, NOAA, and PSMSL datasets and plot a comparison of their min–max normalized time series.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locate the project root: when the working directory is the 'notebooks'
# folder the root is its parent; otherwise the working directory is the root.
if Path.cwd().name == 'notebooks':
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

# Put the root at the front of the module search path.
sys.path.insert(0, str(PROJECT_ROOT))

In [None]:
# Vizag (Visakhapatnam) tidal series: read the CSV, keep (time, value),
# and add a min-max normalized column.
vizag_path = PROJECT_ROOT / "Visakhapatnam_UTide_full2024_hourly_IST.csv"
vizag = None
if not vizag_path.exists():
    print("Vizag file not found")
else:
    vizag = pd.read_csv(vizag_path)
    # Expose the 'Time(IST)' and 'prs(m)' columns under the common names
    # 'time' and 'value'.
    vizag = vizag.assign(
        time=pd.to_datetime(vizag['Time(IST)']),
        value=vizag['prs(m)'],
    )
    # Keep only complete (time, value) rows, indexed and ordered by time.
    vizag = vizag.loc[:, ['time', 'value']].dropna()
    vizag = vizag.set_index('time').sort_index()
    # Min-max scaling; the 1e-9 term keeps the denominator non-zero when
    # every value is identical.
    vizag_min = vizag['value'].min()
    vizag_span = vizag['value'].max() - vizag_min + 1e-9
    vizag['norm'] = (vizag['value'] - vizag_min) / vizag_span
    print(f"Vizag: {len(vizag)} points")

# Load NOAA (San Diego): concatenate every 2024_*.csv in the station folder
# into one time-indexed frame with 'value' and min-max 'norm' columns.
# `noaa` stays None when the folder is missing or contains no matching file.
noaa_dir = PROJECT_ROOT / "data" / "noaa" / "sandiego"
noaa = None
if noaa_dir.exists():
    dfs = []
    for f in sorted(noaa_dir.glob('2024_*.csv')):
        df = pd.read_csv(f)
        # Header names can carry padding after the comma (the value column
        # was previously addressed as ' Water Level'); strip it so both the
        # padded and unpadded spellings are accepted.
        df.columns = df.columns.str.strip()
        df['time'] = pd.to_datetime(df['Date Time'])
        # Remove thousands separators before converting; anything that still
        # is not a number becomes NaN and is dropped below.
        df['value'] = pd.to_numeric(
            df['Water Level'].astype(str).str.replace(',', '', regex=False),
            errors='coerce',
        )
        dfs.append(df[['time', 'value']].dropna())
    if dfs:
        # Files may overlap in time; keep the first occurrence of each timestamp.
        noaa = pd.concat(dfs).drop_duplicates('time').set_index('time').sort_index()
        # The 1e-9 term keeps the denominator non-zero for a constant series.
        noaa['norm'] = (noaa['value'] - noaa['value'].min()) / (noaa['value'].max() - noaa['value'].min() + 1e-9)
        print(f"NOAA: {len(noaa)} points")
    else:
        # Previously this case was silent, leaving no hint why NOAA was
        # missing from the plot.
        print(f"NOAA: no 2024_*.csv files found in {noaa_dir}")
else:
    print("NOAA dir not found")

# Load PSMSL (if available).
# `psmsl` is assigned only once a fully parsed and normalized frame exists, so
# "psmsl is not None" always implies a 'norm' column. On any failure (too few
# columns, parse error, no valid rows) it stays None and a message is printed.
psmsl_path = PROJECT_ROOT / "data" / "psmsl" / "rlr_monthly.zip"
psmsl = None
if psmsl_path.exists():
    try:
        # pandas infers zip compression from the file extension.
        # NOTE(review): this presumably requires the archive to hold a single
        # CSV member — confirm against the actual download.
        psmsl_raw = pd.read_csv(psmsl_path)
        if len(psmsl_raw.columns) < 2:
            print(f"PSMSL load failed: need a time and a value column, found {len(psmsl_raw.columns)} column(s)")
        else:
            # Time column: an explicit 'time' column if present, else the
            # first column. Value column: the first remaining column.
            tcol = 'time' if 'time' in psmsl_raw.columns else psmsl_raw.columns[0]
            vcol = next(c for c in psmsl_raw.columns if c != tcol)
            psmsl_clean = psmsl_raw.rename(columns={tcol: 'time', vcol: 'value'})
            psmsl_clean['time'] = pd.to_datetime(psmsl_clean['time'], errors='coerce')
            psmsl_clean['value'] = pd.to_numeric(psmsl_clean['value'], errors='coerce')
            # Drop rows only when time or value is unusable; NaNs in other
            # columns must not discard otherwise valid samples.
            psmsl_clean = (
                psmsl_clean.dropna(subset=['time', 'value'])
                .set_index('time')
                .sort_index()
            )
            # NOTE(review): PSMSL files conventionally flag missing values
            # with -99999 — confirm whether this file needs masking before
            # the min/max scaling below.
            if psmsl_clean.empty:
                print("PSMSL load failed: no valid (time, value) rows after parsing")
            else:
                # The 1e-9 term keeps the denominator non-zero for a constant series.
                psmsl_clean['norm'] = (psmsl_clean['value'] - psmsl_clean['value'].min()) / (
                    psmsl_clean['value'].max() - psmsl_clean['value'].min() + 1e-9
                )
                psmsl = psmsl_clean
                print(f"PSMSL: {len(psmsl)} points")
    except Exception as e:
        # Best-effort load: report the problem and continue without PSMSL.
        print(f"PSMSL load failed: {e}")
else:
    print("PSMSL file not found or empty")

In [None]:
# Plot every dataset that loaded successfully on one shared axis and save the
# figure under <PROJECT_ROOT>/results.
fig, ax = plt.subplots(figsize=(12, 5))

# Skip datasets that are None or lack the normalized column, so one failed
# load cannot break the whole figure.
plotted = 0
for frame, label in (
    (vizag, 'Vizag (tidal prs)'),
    (noaa, 'NOAA San Diego (water level)'),
    (psmsl, 'PSMSL'),
):
    if frame is not None and 'norm' in frame:
        ax.plot(frame.index, frame['norm'], label=label, alpha=0.8)
        plotted += 1

ax.set_xlabel('Time')
ax.set_ylabel('Normalized value (0–1)')
ax.set_title('Normalized Time Series Comparison: Vizag, NOAA, PSMSL')
# Only build a legend when at least one labelled line exists.
if plotted:
    ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()

# savefig does not create missing directories; make sure the target exists.
results_dir = PROJECT_ROOT / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(results_dir / '01_compare_datasets.png', dpi=150)
plt.show()