In [12]:
import os
from pyhdf.SD import SD, SDC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import label
from datetime import datetime, timedelta
from pathlib import Path

/Users/fadiya


In [4]:
cd "/Users/fadiya/Documents/cycone/Data/downloads"

/Users/fadiya/Documents/cycone/data/downloads


In [13]:
THRESH_BT = 233
MIN_PIXELS = 600

In [15]:
def read_modis_hdf(hdf_files):
    modis_data = []

    for file in hdf_files:
        try:
            hdf = SD(file, SDC.READ)

            latitude = hdf.select('Latitude')[:]
            longitude = hdf.select('Longitude')[:]
            brightness_temperature = hdf.select('Brightness_Temperature')[:]
            scan_start_time = hdf.select('Scan_Start_Time')[:]

            base_time = datetime(1993, 1, 1)
            time = [base_time + timedelta(seconds=t) for t in scan_start_time.flatten()]
            modis_data.append({
                'latitude': latitude,
                'longitude':longitude,
                'brightness_temperature': brightness_temperature,
                'time': time
            })
        except Exception as e:
            print(f"Error processing file {file}:{e}")

    return modis_data
    

In [17]:
def detect_mcs(df, threshold=THRESH_BT, min_pixels=MIN_PIXELS):
    mcs_mask = df['brightness_temperature'] <= threshold
    labeled, num_features = label(mcs_mask)

    mcs_area = np.array([np.sum(labeled == i) for i in range(1, num_features + 1)])
    valid_mcs_mask = mcs_area >= min_pixels

    valid_mcs = df[labeled > 0].loc[valid_mcs_mask]
    return valid_mcs
    

In [18]:
def filter_mcs_by_date(df, start_date, end_date):
    return df[(df['time'] >= start_date) & (df['time'] <= end_date)]
    

In [30]:
def process_granules(granules, start_date, end_date, output_csv):
    records = []

    for f in granules:
        fn = Path(f).name
        parts = fn.split(".")
        year = int(parts[1][1:5])
        doy = int(parts[1][5:])
        hhmm = parts[2]
        hour = int(hhmm[:2])
        minute = int(hhmm[2:])

        date0=datetime(year, 1, 1) + timedelta(days=doy-1, hours=hour, minutes=minute)

        try:
            sd= SD(f, SDC.READ)
            ctt_raw = sd.select('Latitude')[:].astype(float)
            lon = sd.select('Longitude')[:]
            attrs = sd.select('Longitude')[:]
            attrs = sd.select('Cloud_Top_Temperature').attributes()
            ctt = (ctt_raw - attrs['add_offset'])*attrs['scale_factor']
            ctt = np.ma.masked_where(ctt_raw == attrs['_FillValue'], ctt)

            lon_wrapped = (lon + 180.)%360-180.
            cold = ctt<=THRESH_BT
            blobs, nlab = label(cold)

            for lab in range(1, nlab + 1):
                mask=blobs==lab
                if mask.sum()<MIN_PIXELS:
                    continue 
                clay = float(lat[mask].mean())
                clon = float(np.degrees(np.arctan2(np.sin(np.radians(lon_wrapped[mask])).mean(),
                                                   np.cos(np.radians(lon_wrapped[mask])).mean())))
                records.append({
                    "time":date0,
                    "granule": fn,
                    "lon": clon,
                    "lat": clat,
                    "MCS_minBT": float(ctt[mask].min()),
                    "MCS_size": int(mask.sum()),
                })
        except Exception as e:
            print(f"Error processing granule {f}:{e}")

    df = pd.DataFrame(records)
    df.to_csv(output_csv, index=False)
    print(f"Wrote {len(df)} rows to {output_csv}")
    return df

        

In [58]:
def find_granules_in_directory(start_date, end_date, granules_dir):
    granules = []
    
    # Convert the start and end date to datetime objects
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    
    # Loop through the date range
    current_dt = start_dt
    while current_dt <= end_dt:
        year = current_dt.year
        doy = current_dt.timetuple().tm_yday  # Day of the year (1-366)
        
        # Check for granules using regex pattern matching
        for hour in [0, 3, 6, 9, 12, 15, 18, 21]:  # 3-hour intervals
            for minute in [0, 15, 30, 45]:  # 15-minute intervals
                # Generate the base filename pattern (excluding the unique part)
                base_filename = f"MYD06_L2.A{year}{str(doy).zfill(3)}.{str(hour).zfill(2)}{str(minute).zfill(2)}.061"
                
                # List all files in the granules directory
                for filename in os.listdir(granules_dir):
                    # Check if the file starts with the base pattern (ignoring the unique code at the end)
                    if filename.startswith(base_filename) and filename.endswith('.hdf'):
                        file_path = os.path.join(granules_dir, filename)
                        if os.path.exists(file_path):  # Verify the file exists
                            granules.append(file_path)
                        else:
                            print(f"File not found: {file_path}")  # Optional: log missing files
        current_dt += timedelta(days=1)  # Move to the next day
    
    return granules

In [59]:
granules_dir = "/Users/fadiya/Documents/cycone/data/downloads"
start_date = "2002-09-01"
end_date = "2002-09-04"

In [61]:
granules = find_granules_in_directory(start_date, end_date, granules_dir)
print(f"Found {len(granules)} granules.")
for granule in granules:
    print(granule)

Found 3 granules.
/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002244.0045.061.2018004074116.hdf
/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002245.0300.061.2018004081337.hdf
/Users/fadiya/Documents/cycone/data/downloads/MYD06_L2.A2002245.0630.061.2018004075900.hdf


In [64]:
import os
import re
from datetime import datetime, timedelta

def find_granules_in_directory(start_date, end_date, granules_dir):
    granules = []
    
    # Convert the start and end date to datetime objects
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    
    # Loop through the date range
    current_dt = start_dt
    while current_dt <= end_dt:
        year = current_dt.year
        doy = current_dt.timetuple().tm_yday  # Day of the year (1-366)
        
        # Check for granules using regex pattern matching
        for hour in [0, 3, 6, 9, 12, 15, 18, 21]:  # 3-hour intervals
            for minute in [0, 15, 30, 45]:  # 15-minute intervals
                # Generate the base filename pattern (excluding the unique part)
                base_filename = f"MYD06_L2.A{year}{str(doy).zfill(3)}.{str(hour).zfill(2)}{str(minute).zfill(2)}.061"
                
                # List all files in the granules directory
                print(f"Looking for files starting with: {base_filename}")  # Debugging line
                
                # Loop through all files in the granules directory
                for filename in os.listdir(granules_dir):
                    # Debugging: print all filenames
                    print(f"Checking file: {filename}")  # Debugging line
                    
                    # Check if the file starts with the base pattern (ignoring the unique code at the end)
                    if filename.startswith(base_filename) and filename.endswith('.hdf'):
                        file_path = os.path.join(granules_dir, filename)
                        if os.path.exists(file_path):  # Verify the file exists
                            granules.append(file_path)
                        else:
                            print(f"File not found: {file_path}")  # Optional: log missing files
        current_dt += timedelta(days=1)  # Move to the next day
    
    return granules

granules_dir = "/Users/fadiya/Documents/cycone/data/downloads"
start_date = "2002-09-01"
end_date = "2002-09-04"

# Call the function and print the result
granules = find_granules_in_directory(start_date, end_date, granules_dir)
print(f"Found {len(granules)} granules.")
for granule in granules:
    print(granule)


Looking for files starting with: MYD06_L2.A2002244.0000.061
Checking file: MYD06_L2.A2002245.0440.061.2018004075606.hdf
Checking file: mcs_labeled.csv
Checking file: MCS CATALOGUE WITHOUT LABEL.xlsx
Checking file: mcs_with_environment.csv
Checking file: mcs_full.csv
Checking file: .DS_Store
Checking file: MYD06_L2.A2002247.0250.061.2018004083644.hdf
Checking file: mcs_labeled5D.csv
Checking file: MYD06_L2.A2002244.0400.061.2018004075305.hdf
Checking file: MYD06_L2.A2002245.0300.061.2018004081337.hdf
Checking file: MYD06_L2.A2002245.0125.061.2018004080039.hdf
Checking file: mcs_labeled10D.csv
Checking file: mcs_labeled_6H5D.csv
Checking file: Book1.xlsx
Checking file: MYD06_L2.A2002245.0630.061.2018004075900.hdf
Checking file: MYD06_L2.A2002245.0620.061.2018004080044.hdf
Checking file: MYD06_L2.A2002247.0745.061.2018004083222.hdf
Checking file: MYD06_L2.A2002246.0035.061.2018004081156.hdf
Checking file: MYD06_L2.A2002244.0220.061.2018004075640.hdf
Checking file: MYD06_L2.A2002244.0045.0