### Deep Learning for Computer Vision  
### Multi-Task Regression with the Digital Typhoon Dataset

This notebook demonstrates a **supervised multi-task regression** workflow for remote sensing using **TorchGeo** and the Digital Typhoon dataset, which consists of infrared (IR) satellite imagery of tropical cyclones paired with meteorological measurements.

The objective is to predict multiple continuous typhoon intensity variables from satellite imagery using a deep learning model.  

#### Dataset Overview
The [Digital Typhoon](https://torchgeo.readthedocs.io/en/stable/api/datasets.html#digital-typhoon) dataset is derived from hourly infrared channel observations captured by multiple generations of the Himawari meteorological satellites, spanning the period from 1978 to the present. The satellite measurements have been converted to brightness temperatures and normalized across different sensors, resulting in a consistent spatio-temporal dataset covering more than four decades.  

**Dataset features:**
- Infrared (IR) satellite imagery of 512 × 512 pixels at ~5km resolution 
- Auxiliary metadata including wind speed, pressure and additional typhoon-related attributes  
- 1,099 typhoons and 189,364 images

**References**  
Digital Typhoon Dataset: *A Large-Scale Benchmark for Tropical Cyclone Analysis*      [arXiv:2411.16421](https://arxiv.org/pdf/2411.16421) ; [arXiv:2311.02665](https://arxiv.org/pdf/2311.02665)


In [23]:
## import libraries
import os
import shutil
import pandas as pd
import torch

from torch.utils.data import DataLoader
from torchgeo.datasets import DigitalTyphoon


In [24]:
# Open the full Digital Typhoon dataset from the local copy (nothing is downloaded)
root = "/home/ogallo/DL4CV/DigitalTyphoon"

# The same two variables are requested as features and as regression targets
regression_vars = ("wind", "pressure")

dataset = DigitalTyphoon(
    root=root,
    features=list(regression_vars),
    targets=list(regression_vars),
    sequence_length=1,
    download=False,
)


In [3]:
# Show the dataset size (number of sequences), then the first sequence
n_sequences = len(dataset)
first_sample = dataset[0]
print(n_sequences)
print(first_sample)

173418
{'image': tensor([[[0.7248, 0.7813, 0.7813,  ..., 0.9331, 0.9363, 0.9331],
         [0.7248, 0.7576, 0.7735,  ..., 0.9331, 0.9331, 0.9331],
         [0.7290, 0.7536, 0.7656,  ..., 0.9299, 0.9299, 0.9331],
         ...,
         [0.6904, 0.6495, 0.6007,  ..., 0.8483, 0.8798, 0.8659],
         [0.6542, 0.6400, 0.6725,  ..., 0.8483, 0.8659, 0.8447],
         [0.7373, 0.7536, 0.7967,  ..., 0.8518, 0.8518, 0.8483]]]), 'wind': tensor(-1.1229), 'pressure': tensor(0.5422), 'label': tensor([-1.1229,  0.5422])}


In [4]:
# Read the auxiliary table that accompanies the WP images and preview it
aux_csv_path = "/home/ogallo/DL4CV/DigitalTyphoon/WP/aux_data.csv"
aux_data = pd.read_csv(aux_csv_path)
print(aux_data.head())     # first rows only

       id                   image_path  year  month  day  hour  grade    lat  \
0  197830  1978120100-197830-GMS1-1.h5  1978     12    1     0      6  36.00   
1  197830  1978120103-197830-GMS1-1.h5  1978     12    1     3      6  37.46   
2  197830  1978120106-197830-GMS1-1.h5  1978     12    1     6      6  39.00   
3  197901  1978123112-197901-GMS1-1.h5  1978     12   31    12      2   2.00   
4  197901  1978123116-197901-GMS1-1.h5  1978     12   31    16      2   2.30   

      lng  pressure  wind  dir50  long50  short50  dir30  long30  short30  \
0  174.00     996.0   0.0      0       0        0      0       0        0   
1  176.44     994.0   0.0      0       0        0      0       0        0   
2  179.00     992.0   0.0      0       0        0      0       0        0   
3  172.00    1004.0   0.0      0       0        0      0       0        0   
4  171.81    1002.7   0.0      0       0        0      0       0        0   

   landfall  intp  
0         0     0  
1         0     

In [20]:
# Distinct years present in the auxiliary table.
# NOTE: the recorded output has no 1980 entry, so the years are not contiguous.
aux_data['year'].unique()

array([1978, 1979, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
       1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

Subset the dataset: select typhoons spread across the available years, then sample images across each typhoon's lifecycle (start, peak grade and end).

In [16]:
import numpy as np

# Source of the full WP data and destination of the sub-sampled copy
root = "/home/ogallo/DL4CV/DigitalTyphoon/WP"
output_dir = "/home/ogallo/DL4CV/WP_sampled_50"

# Sampling budget: number of typhoons and target number of images for each one
total_typhoons = 50
images_per_typhoon = 50
np.random.seed(42)  # seed NumPy's global random generator

# Read the auxiliary table (one row per image)
aux_path = os.path.join(root, "aux_data.csv")
df = pd.read_csv(aux_path)

# Size and temporal extent of the unsampled data
print(
    f"\nOriginal dataset:",
    f"  Total images: {len(df):,}",
    f"  Total typhoons: {df['id'].nunique()}",
    f"  Year range: {df['year'].min()} - {df['year'].max()}",
    f"  Years available: {df['year'].nunique()}",
    sep="\n",
)

# One row per typhoon id: earliest year on record and highest grade value
typhoon_info = df.groupby('id', as_index=False).agg(
    first_year=('year', 'min'),
    peak_grade=('grade', 'max'),
)


Original dataset:
  Total images: 189,364
  Total typhoons: 1099
  Year range: 1978 - 2022
  Years available: 44


In [17]:
# Stratified sampling of typhoons by year: take one typhoon from each selected
# year, with the selected years spread over the whole available range.
all_years = sorted(typhoon_info['first_year'].unique())
if not all_years:
    raise ValueError("typhoon_info contains no typhoons to sample from")
if total_typhoons < 1:
    raise ValueError(f"total_typhoons must be >= 1, got {total_typhoons}")
print(f"\nAvailable years: {len(all_years)} ({all_years[0]} - {all_years[-1]})")

# At most one typhoon per year, so the number of target years is capped by
# the number of years that actually occur in the data.
num_years = len(all_years)
n_targets = min(total_typhoons, num_years)
step = max(1, num_years // n_targets)  # approximate spacing, only reported below

# Spread the target years from the first to the last available year.
# Integer arithmetic keeps the positions distinct and always includes both
# ends; slicing `range(0, num_years, step)` would silently drop the most
# recent years whenever num_years is not a multiple of total_typhoons.
if n_targets == 1:
    target_year_indices = [0]
else:
    target_year_indices = [
        (i * (num_years - 1)) // (n_targets - 1) for i in range(n_targets)
    ]
target_years = [all_years[i] for i in target_year_indices]

print(f"Target years for sampling: {len(target_years)}")
print(f"Sampling every ~{step} year(s)")

# Deterministic pick: the first typhoon listed in typhoon_info for each year.
# Every target year comes from typhoon_info itself, so a match always exists.
sampled_typhoons = []
for year in target_years:
    candidates = typhoon_info[typhoon_info['first_year'] == year]
    sampled_typhoons.append(int(candidates.iloc[0]['id']))

print(f"\nSelected {len(sampled_typhoons)} typhoons")
if len(sampled_typhoons) < total_typhoons:
    # Make the shortfall explicit instead of silently returning fewer typhoons
    print(
        f"Note: {total_typhoons} typhoons were requested, but only "
        f"{num_years} distinct years are available (one typhoon per year)."
    )

# Verify year distribution
sampled_info = typhoon_info[typhoon_info['id'].isin(sampled_typhoons)]
print(f"Year range in sample: {sampled_info['first_year'].min()} - {sampled_info['first_year'].max()}")
print(f"Unique years covered: {sampled_info['first_year'].nunique()}")

# Grade distribution (peak_grade is the maximum `grade` value per typhoon)
# NOTE(review): confirm in the dataset documentation that a larger grade code
# means a stronger storm before reading this as an intensity distribution.
print("\nGrade distribution in sampled typhoons:")
grade_dist = sampled_info['peak_grade'].value_counts().sort_index()
for grade, count in grade_dist.items():
    print(f"  Grade {grade}: {count} typhoons")


Available years: 44 (1978 - 2022)
Target years for sampling: 44
Sampling every ~1 year(s)

Selected 44 typhoons
Year range in sample: 1978 - 2022
Unique years covered: 44

Grade distribution in sampled typhoons:
  Grade 3: 11 typhoons
  Grade 4: 7 typhoons
  Grade 5: 9 typhoons
  Grade 6: 16 typhoons
  Grade 7: 1 typhoons


In [18]:
# lifecycle-based image sampling per typhoon
def _pick_evenly(items, k):
    """Return up to ``k`` elements of ``items`` spread evenly from first to last.

    ``items`` must be an ordered sequence. Returns an empty list when ``k`` is
    not positive or ``items`` is empty, and a copy of ``items`` when ``k`` is
    at least ``len(items)``. For ``k == 1`` the middle element is returned.
    """
    if k <= 0 or not items:
        return []
    if k >= len(items):
        return list(items)
    if k == 1:
        return [items[len(items) // 2]]
    last = len(items) - 1
    # Integer arithmetic yields k distinct positions including both ends
    return [items[(i * last) // (k - 1)] for i in range(k)]


all_sampled_rows = []

for typhoon_id in sampled_typhoons:
    # All rows of this typhoon in chronological order; after reset_index the
    # index labels equal the row positions, so idxmax() can be used with iloc.
    typhoon_data = (
        df[df['id'] == typhoon_id]
        .sort_values(['year', 'month', 'day', 'hour'])
        .reset_index(drop=True)
    )
    n_images = len(typhoon_data)

    if n_images <= images_per_typhoon:
        # Short-lived typhoon: keep every image
        sampled_rows = typhoon_data
    else:
        # Always keep the first image, the last image and the first image
        # with the highest grade value.
        # NOTE(review): confirm in the dataset documentation that a larger
        # grade code means a stronger storm; if not, the minimum of
        # `pressure` may be a better marker of the peak.
        peak_idx = int(typhoon_data['grade'].idxmax())
        anchors = {0, peak_idx, n_images - 1}

        # Fill the rest of the budget with positions spread over the WHOLE
        # lifecycle. Slicing `available[::step][:remaining]` would truncate
        # the tail and mostly keep early frames.
        remaining = images_per_typhoon - len(anchors)
        available = [i for i in range(n_images) if i not in anchors]
        chosen = anchors.union(_pick_evenly(available, remaining))

        # sorted() keeps the sampled rows in chronological order
        sampled_rows = typhoon_data.iloc[sorted(chosen)]

    all_sampled_rows.append(sampled_rows)

# Concatenate all sampled rows
df_sampled = pd.concat(all_sampled_rows, ignore_index=True)

print(f"Total images: {len(df_sampled):,}")
print(f"Unique typhoons: {df_sampled['id'].nunique()}")
print(f"Unique years: {df_sampled['year'].nunique()}")
print(f"Average images per typhoon: {len(df_sampled) / df_sampled['id'].nunique():.1f}")

Total images: 2,080
Unique typhoons: 44
Unique years: 44
Average images per typhoon: 47.3


In [19]:
# copy sampled images to output directory
images_src_dir = os.path.join(root, "image")
images_dst_dir = os.path.join(output_dir, "image")
os.makedirs(images_dst_dir, exist_ok=True)

# Walk the source tree ONCE and record the first directory in which each
# wanted file is found. Re-walking the whole tree for every sampled row
# repeats the same directory scan thousands of times.
wanted_files = set(df_sampled['image_path'])
source_dir_of = {}
for root_dir, _dirs, files in os.walk(images_src_dir):
    for name in wanted_files.intersection(files):
        source_dir_of.setdefault(name, root_dir)  # keep the first match only
    if len(source_dir_of) == len(wanted_files):
        break  # everything located; no need to scan the rest of the tree

copied = 0
not_found = []

print(f"Copying {len(df_sampled)} images...")
for img_file in df_sampled['image_path']:
    root_dir = source_dir_of.get(img_file)
    if root_dir is None:
        not_found.append(img_file)
        continue

    # Recreate the file's sub-directory (relative to the image root) under
    # the destination so the folder layout is preserved.
    rel_path = os.path.relpath(root_dir, images_src_dir)
    dst_dir = os.path.join(images_dst_dir, rel_path)
    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(os.path.join(root_dir, img_file), os.path.join(dst_dir, img_file))
    copied += 1


print(f"\nCopied {copied}/{len(df_sampled)} files")
if not_found:
    print(f"Warning: {len(not_found)} files not found")
    print(f"First few missing files: {not_found[:5]}")

# save sampled auxiliary data
output_csv = os.path.join(output_dir, "aux_data.csv")
df_sampled.to_csv(output_csv, index=False)



Copying 2080 images...

Copied 2080/2080 files


In [20]:
# get sampled typhoon IDs
import json

# The sampled auxiliary table written by the previous step is the source of
# truth for which typhoons belong to the subset.
sampled_csv = os.path.join(output_dir, "aux_data.csv")
if not os.path.exists(sampled_csv):
    # Raise instead of calling exit(): in a notebook exit() acts on the
    # kernel/session instead of just failing this cell with a clear error.
    raise FileNotFoundError(
        f"{sampled_csv} not found! Please run the main sampling script first."
    )

df_sampled = pd.read_csv(sampled_csv)
sampled_typhoon_ids = sorted(df_sampled['id'].unique())

print(f"Found {len(sampled_typhoon_ids)} unique typhoons in sampled data")

# Copy METADATA folder
metadata_src = os.path.join(root, "metadata")
metadata_dst = os.path.join(output_dir, "metadata")

if os.path.exists(metadata_src):

    # copytree requires a non-existing destination, so drop any previous copy
    if os.path.exists(metadata_dst):
        print(f"Removing existing metadata folder: {metadata_dst}")
        shutil.rmtree(metadata_dst)

    # Copy entire metadata folder (not filtered by typhoon)
    shutil.copytree(metadata_src, metadata_dst)
else:
    print(f"WARNING: metadata folder not found at {metadata_src}\n")

# Filter METADATA.JSON
metadata_json_src = os.path.join(root, "metadata.json")
metadata_json_dst = os.path.join(output_dir, "metadata.json")

if os.path.exists(metadata_json_src):

    # Load original metadata
    with open(metadata_json_src, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    print(f"Original metadata entries: {len(metadata)}")

    # Keep only the entries of the sampled typhoons.
    # IDs are converted to strings because JSON object keys are strings.
    sampled_ids_str = {str(tid) for tid in sampled_typhoon_ids}

    filtered_metadata = {
        key: value
        for key, value in metadata.items()
        if key in sampled_ids_str
    }

    print(f"Filtered metadata entries: {len(filtered_metadata)}")

    # Save filtered metadata
    with open(metadata_json_dst, 'w', encoding='utf-8') as f:
        json.dump(filtered_metadata, f, indent=2)

    # Verify that every sampled typhoon has a metadata entry
    if len(filtered_metadata) != len(sampled_typhoon_ids):
        print("WARNING: Mismatch between sampled typhoons and metadata entries!")
        print(f"  Sampled typhoons: {len(sampled_typhoon_ids)}")
        print(f"  Metadata entries: {len(filtered_metadata)}")

        missing = sampled_ids_str - set(filtered_metadata.keys())
        if missing:
            print(f"  Missing metadata for: {missing}")
else:
    print(f"WARNING: metadata.json not found at {metadata_json_src}\n")


Found 44 unique typhoons in sampled data
Original metadata entries: 1099
Filtered metadata entries: 44


In [None]:
# Run the sampling pipeline through the helpers of the local `sample` module
from sample import (
    copy_images,
    copy_metadata,
    load_data,
    sample_images,
    sample_typhoons,
    save_sampled_data,
)

# Input/output locations
root = "/home/ogallo/DL4CV/DigitalTyphoon/WP"
output_dir = "/home/ogallo/DL4CV/WP_sampled_50"

# Sampling budget
total_typhoons = 50
images_per_typhoon = 50

# Step 1: read the records
df = load_data(root)
print(f"Loaded {len(df)} records.")

# Step 2: choose the typhoons
sampled_typhoons = sample_typhoons(df, total_typhoons)
print(f"Selected {len(sampled_typhoons)} typhoons.")

# Step 3: choose the images of those typhoons
df_sampled = sample_images(df, sampled_typhoons, images_per_typhoon)
print(f"Sampled {len(df_sampled)} images.")

# Step 4: copy the image files and report any that could not be located
copied, not_found = copy_images(df_sampled, root, output_dir)
print(f"Copied {copied}/{len(df_sampled)} images.")
if len(not_found) > 0:
    print(f"Warning: {len(not_found)} images not found.")

# Step 5: write the sampled table
save_sampled_data(df_sampled, output_dir)

# Step 6: copy the metadata for the typhoon ids present in the sample
sampled_typhoon_ids = sorted(df_sampled['id'].unique())
copy_metadata(root, output_dir, sampled_typhoon_ids)


Loaded 189364 records.
Selected 44 typhoons.
Sampled 2080 images.


In [22]:
from torchgeo.datasets import DigitalTyphoon

# Open the sub-sampled copy with the same feature/target settings as the full dataset
sampled_root = "/home/ogallo/DL4CV/WP_sampled_50"
dataset2 = DigitalTyphoon(
    root=sampled_root,
    features=["wind", "pressure"],
    targets=["wind", "pressure"],
    sequence_length=1,
    download=False,
)