In [28]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from scipy.ndimage import label
from pathlib import Path
from pyhdf.SD import SD, SDC

In [2]:
cd "/Users/fadiya/Documents/cycone/Data/downloads"

/Users/fadiya/Documents/cycone/data/downloads


In [3]:
THRESH_BT = 233
MIN_PIXELS = 600
TIME_TOL = pd.Timedelta("6H")
DIST_TOL = 10.0

  TIME_TOL = pd.Timedelta("6H")


In [26]:
def load_clean_ibtracs(file_path, start_date, end_date):
    ib=pd.read_csv(file_path, usecols=["SID", "ISO_TIME", "LAT", "LON"], low_memory=False)

    ib['ISO_TIME']=pd.to_datetime(ib['ISO_TIME'], errors='coerce')
    ib['LAT']=pd.to_numeric(ib['LAT'], errors='coerce')
    ib['LON']=pd.to_numeric(ib['LON'], errors='coerce')

    mask_time = (ib['ISO_TIME'] >= start_date)&(ib['ISO_TIME'] <= end_date)
    mask_region = ib['LAT'].between(0,30) & ib['LON'].between(100, 180)

    ib_filtered = ib.loc[mask_time & mask_region]

    gen = ib_filtered.sort_values("ISO_TIME").groupby("SID", as_index=False).first()[["SID", "ISO_TIME", "LAT", "LON"]]
    gen = gen.rename(columns={"ISO_TIME":"gen_time","LAT":"gen_lat","LON":"gen_lon"})

    return gen


In [17]:
def read_modis_granules(granules, gen):
    records = []
    
    for f in granules:
        fn = Path(f).name
        parts = fn.split(".")
        year = int(parts[1][1:5])
        doy = int(parts[1][5:])
        hhmm = parts[2]
        hour = int(hhmm[:2])
        minute = int(hhmm[2:])
        
        # Build datetime object from granule's year, day of year, hour, and minute
        date0 = datetime(year, 1, 1) + timedelta(days=doy-1, hours=hour, minutes=minute)

        try:
            # Open the granule (MODIS file)
            sd = SD(f, SDC.READ)
            ctt_raw = sd.select('Cloud_Top_Temperature')[:].astype(float)
            lat = sd.select('Latitude')[:]
            lon = sd.select('Longitude')[:]
            attrs = sd.select('Cloud_Top_Temperature').attributes()
            ctt = (ctt_raw - attrs['add_offset']) * attrs['scale_factor']
            ctt = np.ma.masked_where(ctt_raw == attrs['_FillValue'], ctt)  # Mask invalid values

            # Skip invalid granules
            if np.all(np.isnan(ctt)):
                print(f"Skipping granule {f} due to invalid data.")
                continue

            # Wrap longitude to [-180, 180]
            lon_wrapped = (lon + 180.) % 360. - 180.
            cold = ctt <= THRESH_BT
            blobs, nlab = label(cold)

            # Process detected MCS blobs
            for lab in range(1, nlab + 1):
                mask = blobs == lab
                if mask.sum() < MIN_PIXELS:
                    continue  # Skip blobs smaller than the minimum size requirement

                # Calculate average latitude and longitude for the blob
                clat = float(lat[mask].mean())
                clon = float(np.degrees(np.arctan2(np.sin(np.radians(lon_wrapped[mask])).mean(),
                                                   np.cos(np.radians(lon_wrapped[mask])).mean())))

                # Append MCS data
                records.append({
                    "time": date0,
                    "granule": fn,
                    "lon": clon,
                    "lat": clat,
                    "MCS_minBT": float(ctt[mask].min()),
                    "MCS_avgBT": float(ctt[mask].mean()),
                    "MCS_size": int(mask.sum()),
                })
        except Exception as e:
            print(f"Error processing granule {f}: {e}")
    
    return records


In [18]:
def label_mcs_with_ibtracs(df, gen):
    def label_mcs(row):
        t0, lat0, lon0 = row["time"],row["lat"], row["lon"]
        ok = (gen['gen_time'] >= t0-TIME_TOL)&(gen['gen_time']<= t0 + TIME_TOL)
        cand = gen.loc[ok]

        if cand.empty:
            return 0

        dlat = (cand['gen_lat'] - lat0).abs()
        dlon = ((cand['gen_lon'] - lon0 + 180) % 360-180).abs()
        return int (((dlat <= DIST_TOL) & (dlon <= DIST_TOL)).any())

    df["label"] = df.apply(label_mcs, axis=1)
    return df

In [31]:
def merge_and_output_data(mcs_df, gen, output_csv):
    mcs_labeled = label_mcs_with_ibtracs(mcs_df, gen)

    mcs_labeled.to_csv(output_csv, index=False)
    print(f"Wrote {len(mcs_labeled)} rows to {output_csv}")
    return mcs_labeled

In [32]:
start_date = "2002-09-01"
end_date = "2002-09-04"

gen = load_clean_ibtracs("ibtracs.ALL.list.v04r01.csv", start_date, end_date)
granules = [
    '/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002244.0045.061.2018004074116.hdf',
    '/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002244.0220.061.2018004075640.hdf',
    '/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002244.0400.061.2018004075305.hdf',
    '/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002244.0540.061.2018004074751.hdf'
]

mcs_records = read_modis_granules(granules, gen)
mcs_df = pd.DataFrame(mcs_records)

output_csv= "mcs_labeled_6H10D_test.csv"
mcs_labeled = merge_and_output_data(mcs_df, gen, output_csv)

print(mcs_labeled.head())

  ib['ISO_TIME']=pd.to_datetime(ib['ISO_TIME'], errors='coerce')
  "MCS_minBT": float(ctt[mask].min()),
  "MCS_avgBT": float(ctt[mask].mean()),


Wrote 26 rows to mcs_labeled_6H10D_test.csv
                 time                                       granule  \
0 2002-09-01 00:45:00  MYD06_L2.A2002244.0045.061.2018004074116.hdf   
1 2002-09-01 00:45:00  MYD06_L2.A2002244.0045.061.2018004074116.hdf   
2 2002-09-01 00:45:00  MYD06_L2.A2002244.0045.061.2018004074116.hdf   
3 2002-09-01 00:45:00  MYD06_L2.A2002244.0045.061.2018004074116.hdf   
4 2002-09-01 00:45:00  MYD06_L2.A2002244.0045.061.2018004074116.hdf   

          lon        lat   MCS_minBT   MCS_avgBT  MCS_size  label  
0 -166.064102  10.966345         NaN         NaN       670      0  
1 -172.369415  10.166039         NaN         NaN      1263      0  
2 -178.838989  11.009789  232.129995  232.129995      1042      1  
3 -165.777390  16.864054  198.789996  222.299237      3341      0  
4 -178.480469  20.104568  194.589996  217.285998     13885      1  
