# In [None]:
import re
import pandas as pd
from pathlib import Path

# Input folder: each sub-directory is processed as one site by the loop below.
ROOT_DIR    = Path(r"D:\dong\papers\Project 1\New for review\1SM\original insitu data\Berlin")
# Output folder: receives one sub-directory per site holding daily_summary.csv.
OUTPUT_ROOT = Path(r"D:\dong\papers\Project 1\New for review\1SM\insitu 1\Berlin")
# Created at import/run time so later writes cannot fail on a missing parent.
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Depth token embedded in the glob patterns that select sensor files
# (presumably 0.05 m, i.e. 5 cm -- confirm against the file-naming scheme).
DEPTH_STR_5CM = "0.050000"

def _first_data_row_index(fp):
    """Return the 0-based line index of the first data row in *fp*.

    A data row is a line that starts with a ``YYYY/MM/DD`` date; every line
    before it is treated as header.  Returns ``None`` when no line matches.
    """
    date_prefix = re.compile(r"\d{4}/\d{2}/\d{2}")
    # Undecodable bytes are dropped on purpose: only the position of the
    # first date-prefixed line matters here, not the header text itself.
    with open(fp, 'r', encoding='utf-8', errors='ignore') as f:
        for i, ln in enumerate(f):
            if date_prefix.match(ln):
                return i
    return None


def _load_stm_frame(fp):
    """Load the date/time/value columns of *fp* and derive ``dt`` and ``day``.

    Shared by the three public readers so the parsing rules live in one
    place.  Returns ``None`` when the file contains no data rows; otherwise a
    DataFrame with columns ``date``, ``time``, ``value``, ``dt`` (parsed
    timestamp) and ``day`` (``datetime.date`` of that timestamp).
    """
    skip = _first_data_row_index(fp)
    if skip is None:
        return None
    df = pd.read_csv(
        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True (FutureWarning since pandas 2.2).
        fp, sep=r"\s+", header=None, skiprows=skip,
        # Only the first three whitespace-separated fields are read; any
        # trailing columns in the file are ignored.
        usecols=[0, 1, 2], names=["date", "time", "value"],
        dtype={"date": str, "time": str, "value": float}, na_values=["", "NA"],
    )
    df["dt"] = pd.to_datetime(df["date"] + " " + df["time"], format="%Y/%m/%d %H:%M")
    df["day"] = df["dt"].dt.date
    return df


def read_daily_sum(fp):
    """Return the per-day sum of the value column of *fp*.

    The result is a Series named ``daily_precip_mm`` indexed by
    ``datetime.date`` (index name ``day``).  A file without data rows yields
    an empty float Series carrying the same name.

    Note: missing values are skipped by pandas, so a day whose values are all
    missing sums to 0.0 rather than NaN.
    """
    df = _load_stm_frame(fp)
    if df is None:
        return pd.Series(name="daily_precip_mm", dtype=float)
    s = df.groupby("day")["value"].sum()
    s.name = "daily_precip_mm"
    return s


def read_daily_mean(fp):
    """Return the per-day mean of the value column of *fp*.

    The result is indexed by ``datetime.date`` (index name ``day``).  A file
    without data rows yields an empty float Series.
    """
    df = _load_stm_frame(fp)
    if df is None:
        return pd.Series(dtype=float)
    return df.groupby("day")["value"].mean()


def read_midday_mean(fp, hours=(10, 11, 12)):
    """Return the per-day mean of the values recorded during *hours*.

    Only rows whose timestamp hour is in *hours* contribute.  The result is
    indexed by ``datetime.date`` (index name ``day``).  An empty float Series
    is returned when the file has no data rows or no row falls in *hours*.
    """
    df = _load_stm_frame(fp)
    if df is None:
        return pd.Series(dtype=float)
    # The day column already exists before filtering, so nothing is assigned
    # onto the filtered slice (the original did, triggering
    # SettingWithCopyWarning).
    selected = df[df["dt"].dt.hour.isin(hours)]
    if selected.empty:
        return pd.Series(dtype=float)
    return selected.groupby("day")["value"].mean()

def _first_match(folder, pattern):
    """Return the alphabetically first path in *folder* matching *pattern*, or None."""
    # min() over the glob makes the choice independent of directory listing
    # order when several files match.
    return min(folder.glob(pattern), default=None)


# Build one daily_summary.csv per site directory under ROOT_DIR.
for site in ROOT_DIR.iterdir():
    if not site.is_dir():
        continue

    # Daily precipitation totals; an empty named Series stands in when the
    # site has no "*_p_*.stm" file.
    fp_precip = _first_match(site, "*_p_*.stm")
    if fp_precip is not None:
        precip = read_daily_sum(fp_precip)
    else:
        precip = pd.Series(name="daily_precip_mm", dtype=float)

    # Other daily columns, keyed by output column name.
    daily = {}

    # Soil-moisture file selected via DEPTH_STR_5CM: mean of the 10, 11 and
    # 12 o'clock readings per day.
    fp_sm5 = _first_match(site, f"*_sm_*_{DEPTH_STR_5CM}_*.stm")
    if fp_sm5 is not None:
        s = read_midday_mean(fp_sm5, hours=(10, 11, 12))
        s.name = "sm_5cm_10_11_12_mean"
        daily[s.name] = s

    # "_ts_" file at the same depth: plain daily mean.  Uses the shared depth
    # constant instead of a second hard-coded copy of the same string.
    fp_ts5 = _first_match(site, f"*_ts_*_{DEPTH_STR_5CM}_*.stm")
    if fp_ts5 is not None:
        s = read_daily_mean(fp_ts5)
        s.name = "ts_5cm"
        daily[s.name] = s

    # Nothing at all for this site: do not create an output folder.
    if precip.empty and not daily:
        continue

    df = pd.DataFrame(daily)
    if not precip.empty:
        # Outer join keeps days present in only one of the sources.
        df = df.join(precip, how="outer")
    df.index.name = "date"
    df = df.reset_index().sort_values("date")

    out = OUTPUT_ROOT / site.name
    out.mkdir(parents=True, exist_ok=True)
    df.to_csv(out / "daily_summary.csv", index=False)

print("Finished")
