In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

---

## Prepare gauge metadata info for Caravanification

In [None]:
path_to_kgz_basins = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/basin_outline/kyrgyzstan/HRU_KRG_ML_MODEL_BASINS_1d.shp"
path_to_tjik_basins = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/basin_outline/tajikistan/HRU_TAJIK_ML_MODEL_BASINS_1d.shp"

# Read both basin outline layers
kgz_basins, tjik_basins = (
    gpd.read_file(shapefile)
    for shapefile in (path_to_kgz_basins, path_to_tjik_basins)
)

# Show the CRS each layer was stored in (Kyrgyzstan first, then Tajikistan)
print(kgz_basins.crs, tjik_basins.crs, sep="\n")

# Bring both layers to EPSG:4326 so they share one coordinate system
kgz_basins, tjik_basins = (
    layer.to_crs(epsg=4326) for layer in (kgz_basins, tjik_basins)
)

# Overlay the two layers on a single axis, semi-transparent so overlaps show
fig, ax = plt.subplots(figsize=(10, 5))
for layer, fill in ((kgz_basins, "purple"), (tjik_basins, "blue")):
    layer.plot(ax=ax, color=fill, edgecolor="black", alpha=0.5)
plt.show()

In [None]:
path_to_pgkg = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/CA-discharge.gpkg"

gpkg = gpd.read_file(path_to_pgkg)

country_mapping = {
    "KYG": "Kyrgyzstan",
    "TAJ": "Tajikistan",
}


def build_gauge_metadata(basins, gauges, country_names):
    """Build the per-gauge metadata table for one basin layer.

    Parameters:
        basins: basin layer with a "CODE" column identifying each gauge
        gauges: gauge table with "CODE", "LAT", "LON" and "COUNTRY" columns
        country_names (dict): maps the "COUNTRY" codes to full country names

    Returns:
        pd.DataFrame: columns gauge_id, gauge_lat, gauge_lon, country and
        gauge_name ("<gauge_id>_<country>"). Values are NaN where a basin
        code has no match in ``gauges`` or its country code is not a key of
        ``country_names``. ``basins`` itself is left untouched.
    """
    codes = basins["CODE"]

    # NOTE(review): the lookups only match if CODE has the same dtype in both
    # layers - confirm, otherwise every row comes back NaN and is dropped below.
    lookup = {
        column: dict(zip(gauges["CODE"], gauges[column]))
        for column in ("LAT", "LON", "COUNTRY")
    }

    country = codes.map(lookup["COUNTRY"]).map(country_names)
    return pd.DataFrame(
        {
            "gauge_id": codes,
            "gauge_lat": codes.map(lookup["LAT"]),
            "gauge_lon": codes.map(lookup["LON"]),
            "country": country,
            "gauge_name": codes.astype(str) + "_" + country,
        }
    )


# Build into new names: kgz_basins / tjik_basins keep their geometry and their
# "CODE" column, so this cell can be re-run without a KeyError.
kgz_gauges = build_gauge_metadata(kgz_basins, gpkg, country_mapping)
tjik_gauges = build_gauge_metadata(tjik_basins, gpkg, country_mapping)

# Merge the two tables
all_gauges = pd.concat([kgz_gauges, tjik_gauges])
merged_basins = all_gauges.dropna()

# Report rows lost to incomplete metadata instead of discarding them silently
n_dropped = len(all_gauges) - len(merged_basins)
if n_dropped:
    print(f"Dropped {n_dropped} basins with missing gauge metadata")

# Set index to gauge_id
merged_basins = merged_basins.set_index("gauge_id")

In [None]:
# Write the gauge metadata table (indexed by gauge_id) to CSV
path_to_gauge_metadata = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/CA_gauge_metadata_info.csv"
merged_basins.to_csv(path_to_gauge_metadata)

## Prepare streamflow data for Caravanification

In [2]:
path_to_kgz_streamflow = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/discharge/KYRGYZSTAN_streamflow.csv"
path_to_tjik_streamflow = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/discharge/TAJIKISTAN_streamflow.csv"

# One discharge CSV per country, read with pandas defaults
kgz_streamflow, tjik_streamflow = (
    pd.read_csv(csv_path)
    for csv_path in (path_to_kgz_streamflow, path_to_tjik_streamflow)
)

In [3]:
def convert_discharge_to_mm_per_day(discharge: float, area: float) -> float:
    """Express a daily discharge volume as a runoff depth over the basin.

    Dividing m3/d by the area in m2 (km2 * 1e6) gives m/d, and multiplying by
    1000 gives mm/d; the two factors collapse into one division by
    ``area * 1000``.

    Parameters:
        discharge (float): discharge in m3/d
        area (float): area in km2

    Returns:
        float: discharge in mm/d
    """
    depth_divisor = area * 1000
    return discharge / depth_divisor


path_to_kgz_basins = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/basin_outline/kyrgyzstan/HRU_KRG_ML_MODEL_BASINS_1d.shp"
path_to_tjik_basins = "/Users/cooper/Desktop/CAMELS-CH/data/CA_raw/basin_outline/tajikistan/HRU_TAJIK_ML_MODEL_BASINS_1d.shp"

# Read the basin outlines from disk as stored (no re-projection here)
kgz_basins, tjik_basins = (
    gpd.read_file(shapefile)
    for shapefile in (path_to_kgz_basins, path_to_tjik_basins)
)

In [4]:
# Inspect the Tajikistan basin layer as loaded from the shapefile
tjik_basins

Unnamed: 0,CODE,name,Z,geometry
0,16205,16205,0,"POLYGON ((70.58154 39.90595, 70.58505 39.90591..."
1,17050,17050,0,"POLYGON ((73.76716 38.00054, 73.77056 38.0004,..."
2,17077,17077,0,"POLYGON ((70.36145 38.65806, 70.3626 38.65805,..."
3,17082,17082,0,"POLYGON ((72.87466 39.86039, 72.87461 39.85949..."
4,17100,17100,0,"POLYGON ((72.87466 39.86039, 72.87461 39.85949..."
5,17110,17110,0,"POLYGON ((71.31539 39.12275, 71.31655 39.12272..."
6,17137,17137,0,"POLYGON ((69.48088 39.20454, 69.48204 39.20453..."
7,17147,17147,0,"POLYGON ((68.79094 39.09, 68.79326 39.09, 68.7..."
8,17150,17150,0,"POLYGON ((68.79094 39.09, 68.79326 39.09, 68.7..."
9,17202,17202,0,"POLYGON ((68.51999 38.97566, 68.52345 38.97567..."


In [5]:
# Inspect the Kyrgyzstan discharge records as loaded from the CSV
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like row indexes
# saved into the file by earlier exports - confirm they can be ignored.
kgz_streamflow

Unnamed: 0.1,Unnamed: 0,date,discharge,code,river,NAME_ENG
0,0,2000-01-02,1.9,15013,р.Джыргалан-с.Советское,Dzhyrgalan - Soviet
1,1,2000-01-03,1.9,15013,р.Джыргалан-с.Советское,Dzhyrgalan - Soviet
2,2,2000-01-04,1.9,15013,р.Джыргалан-с.Советское,Dzhyrgalan - Soviet
3,3,2000-01-05,1.9,15013,р.Джыргалан-с.Советское,Dzhyrgalan - Soviet
4,4,2000-01-06,1.9,15013,р.Джыргалан-с.Советское,Dzhyrgalan - Soviet
...,...,...,...,...,...,...
551764,551764,2023-10-26,221.0,16936,Toktogul Inflow,Inflow to Toktogul reservoir
551765,551765,2023-10-27,249.0,16936,Toktogul Inflow,Inflow to Toktogul reservoir
551766,551766,2023-10-28,257.0,16936,Toktogul Inflow,Inflow to Toktogul reservoir
551767,551767,2023-10-29,184.0,16936,Toktogul Inflow,Inflow to Toktogul reservoir


In [None]:
def process_discharge_data(streamflow_df, basins_gdf, area_epsg=32642):
    """Attach basin area to discharge records and derive area-normalised discharge.

    The discharge column is multiplied by 86400 before being passed to
    ``convert_discharge_to_mm_per_day`` (which expects m3/d), so it is
    expected to be a per-second rate, presumably m3/s.

    Parameters:
        streamflow_df (pd.DataFrame): records with "date", "discharge", "code"
            and "NAME_ENG" columns. It is not modified.
        basins_gdf (gpd.GeoDataFrame): basin polygons with a "CODE" column
            that can be cast to int
        area_epsg (int): EPSG code of the projected CRS used to measure basin
            area. Defaults to 32642 (UTM zone 42N), the value this function
            used before the parameter existed.

    Returns:
        pd.DataFrame: columns date, gauge_id (code as str), discharge,
        discharge_spec (mm/d) and NAME_ENG, for the records whose code has a
        basin polygon.
    """
    seconds_per_day = 86400

    # Measure area in a projected CRS so geometry.area is in m2, not deg2.
    # NOTE(review): UTM zone 42N spans 66-72 degrees E; basins east of that are
    # measured with some distortion - consider passing an equal-area CRS.
    basins_proj = basins_gdf.to_crs(epsg=area_epsg)

    basin_areas = pd.DataFrame(
        {
            "CODE": basins_proj["CODE"].astype(int),
            "area_km2": basins_proj.geometry.area / 1e6,
        }
    )

    print(f"Min area: {basin_areas['area_km2'].min()} km²")
    print(f"Max area: {basin_areas['area_km2'].max()} km²")

    # Cast on a copy: writing the column in place would alter the caller's frame
    flow = streamflow_df.assign(code=streamflow_df["code"].astype(int))

    # The inner join below drops gauges without a basin polygon; say so
    unmatched = (
        flow.loc[~flow["code"].isin(basin_areas["CODE"]), "code"].unique().tolist()
    )
    if unmatched:
        print(f"No basin outline for gauge codes {sorted(unmatched)}; records dropped")

    df = flow.merge(basin_areas, left_on="code", right_on="CODE")

    # Whole-column arithmetic gives the same values as a row-wise apply,
    # without a Python-level call per record
    df["discharge_spec"] = convert_discharge_to_mm_per_day(
        df["discharge"] * seconds_per_day, df["area_km2"]
    )

    df["gauge_id"] = df["code"].astype(str)

    return df[["date", "gauge_id", "discharge", "discharge_spec", "NAME_ENG"]]


# Convert each country's records with its own basin layer (Kyrgyzstan first)
kgz_processed, tjik_processed = (
    process_discharge_data(flow, outlines)
    for flow, outlines in (
        (kgz_streamflow, kgz_basins),
        (tjik_streamflow, tjik_basins),
    )
)

# Stack both countries into one long table (row labels are not renumbered)
merged_processed = pd.concat([kgz_processed, tjik_processed])

Min area: 73.72999999999602 km²
Max area: 52409.75000000001 km²
Min area: 336.75999999999414 km²
Max area: 29230.200000000044 km²


In [11]:
# Inspect the combined table; discharge_spec is discharge normalised by basin area
merged_processed

Unnamed: 0,date,gauge_id,discharge,discharge_spec,NAME_ENG
0,2000-01-02,15013,1.9,0.633162,Dzhyrgalan - Soviet
1,2000-01-03,15013,1.9,0.633162,Dzhyrgalan - Soviet
2,2000-01-04,15013,1.9,0.633162,Dzhyrgalan - Soviet
3,2000-01-05,15013,1.9,0.633162,Dzhyrgalan - Soviet
4,2000-01-06,15013,1.9,0.633162,Dzhyrgalan - Soviet
...,...,...,...,...,...
449030,2023-12-27,17453,20.1,0.875433,Vanch Bichkharv
449031,2023-12-28,17453,19.8,0.862367,Vanch Bichkharv
449032,2023-12-29,17453,19.8,0.862367,Vanch Bichkharv
449033,2023-12-30,17453,19.8,0.862367,Vanch Bichkharv
