### Deep Learning for Computer Vision  
### Multi-Task Regression with the Digital Typhoon Dataset

This notebook demonstrates a **supervised multi-task regression** workflow for remote sensing using **TorchGeo** using the Digital Typhoon dataset, which consists of infrared (IR) satellite imagery of tropical cyclones paired with meteorological measurements.

The objective is to predict multiple continuous typhoon intensity variables from satellite imagery using a deep learning model.  

#### Dataset Overview
The [Digital Typhoon](https://torchgeo.readthedocs.io/en/stable/api/datasets.html#digital-typhoon) is derived from hourly infrared channel observations captured by multiple generations of the Himawari meteorological satellites, spanning the period from 1978 to the present. The satellite measurements have been converted to brightness temperatures and normalized across different sensors, resulting in a consistent spatio-temporal dataset covering more than four decades.  

**Dataset features:**
- Infrared (IR) satellite imagery of 512 Ã— 512 pixels at ~5km resolution 
- Auxiliary metadata including wind speed, pressure and additional typhoon-related attributes  
- 1,099 typhoons and 189,364 images

**References**  
Digital Typhoon Dataset: *A Large-Scale Benchmark for Tropical Cyclone Analysis*      [arXiv:2411.16421](https://arxiv.org/pdf/2411.16421) ; [arXiv:2311.02665](https://arxiv.org/pdf/2311.02665)


In [1]:
## import libraries
import os
import shutil
import pandas as pd
import torch

from torch.utils.data import DataLoader
from torchgeo.datasets import DigitalTyphoon


In [None]:
# load dataset
root = "/home/ogallo/DL4CV/DigitalTyphoon"

dataset = DigitalTyphoon(
    root=root,
    features=["wind", "pressure"],
    targets=["wind", "pressure"],
    sequence_length=1,
    download=False
)


In [3]:
print(len(dataset))        # number of sequences
print(dataset[0])          # inspect the first sequence

173418
{'image': tensor([[[0.7248, 0.7813, 0.7813,  ..., 0.9331, 0.9363, 0.9331],
         [0.7248, 0.7576, 0.7735,  ..., 0.9331, 0.9331, 0.9331],
         [0.7290, 0.7536, 0.7656,  ..., 0.9299, 0.9299, 0.9331],
         ...,
         [0.6904, 0.6495, 0.6007,  ..., 0.8483, 0.8798, 0.8659],
         [0.6542, 0.6400, 0.6725,  ..., 0.8483, 0.8659, 0.8447],
         [0.7373, 0.7536, 0.7967,  ..., 0.8518, 0.8518, 0.8483]]]), 'wind': tensor(-1.1229), 'pressure': tensor(0.5422), 'label': tensor([-1.1229,  0.5422])}


In [4]:
aux_data = pd.read_csv("/home/ogallo/DL4CV/DigitalTyphoon/WP/aux_data.csv")
print(aux_data.head())     # inspect auxiliary data

       id                   image_path  year  month  day  hour  grade    lat  \
0  197830  1978120100-197830-GMS1-1.h5  1978     12    1     0      6  36.00   
1  197830  1978120103-197830-GMS1-1.h5  1978     12    1     3      6  37.46   
2  197830  1978120106-197830-GMS1-1.h5  1978     12    1     6      6  39.00   
3  197901  1978123112-197901-GMS1-1.h5  1978     12   31    12      2   2.00   
4  197901  1978123116-197901-GMS1-1.h5  1978     12   31    16      2   2.30   

      lng  pressure  wind  dir50  long50  short50  dir30  long30  short30  \
0  174.00     996.0   0.0      0       0        0      0       0        0   
1  176.44     994.0   0.0      0       0        0      0       0        0   
2  179.00     992.0   0.0      0       0        0      0       0        0   
3  172.00    1004.0   0.0      0       0        0      0       0        0   
4  171.81    1002.7   0.0      0       0        0      0       0        0   

   landfall  intp  
0         0     0  
1         0     

In [20]:
aux_data['year'].unique()

array([1978, 1979, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
       1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

Subset the dataset based on the typhoon grade, number of typhoons and lifecycle??

In [8]:
# sampling startegy
import numpy as np

root = "/home/ogallo/DL4CV/DigitalTyphoon/WP"
output_dir = "/home/ogallo/DL4CV/WP"
total_typhoons = 30
max_total_images = 1500
np.random.seed(42)

# load auxiliary data
aux_path = os.path.join(root, "aux_data.csv")
df = pd.read_csv(aux_path)


# Aggregate to get first year and peak grade per typhoon
typhoon_info = df.groupby('id').agg({
    'year': 'min',
    'grade': 'max'
}).reset_index().rename(columns={'year':'first_year', 'grade':'peak_grade'})

# Stratify by years
all_years = sorted(typhoon_info['first_year'].unique())

# Create target years - evenly distributed
num_years = len(all_years)
step = max(1, num_years // total_typhoons)
target_year_indices = list(range(0, num_years, step))[:total_typhoons]
target_years = [all_years[i] for i in target_year_indices]

sampled_typhoons = []

for year in target_years:
    candidates = typhoon_info[typhoon_info['first_year'] == year]
    if len(candidates) > 0:
        selected = candidates.iloc[0]
        sampled_typhoons.append(int(selected['id']))

print(f"Selected {len(sampled_typhoons)} typhoons")



Selected 30 typhoons


In [9]:
# Sample images per typhoon

all_sampled_rows = []

for typhoon_id in sampled_typhoons:
    # Get ALL rows for this typhoon from the original dataframe
    typhoon_data = df[df['id'] == typhoon_id].copy()
    typhoon_data = typhoon_data.sort_values(['year','month','day','hour']).reset_index(drop=True)
    n_images = len(typhoon_data)
    
    
    # How many images to sample per typhoon
    n_sample = min(n_images, max(5, int(max_total_images / total_typhoons)))
    
    if n_images <= n_sample:
        # Take all images if we have fewer than target
        sampled_rows = typhoon_data
    else:
        # Sample lifecycle stages
        peak_idx = typhoon_data['grade'].idxmax()
        early_idx = 0
        decay_idx = max(0, n_images - 1)
        
        sampled_indices = {early_idx, peak_idx, decay_idx}
        
        # Fill remaining randomly
        remaining = n_sample - len(sampled_indices)
        if remaining > 0:
            available = list(set(range(n_images)) - sampled_indices)
            if len(available) > 0:
                additional = np.random.choice(available, size=min(remaining, len(available)), replace=False)
                sampled_indices.update(additional)
        
        sampled_rows = typhoon_data.iloc[list(sampled_indices)]
    
    all_sampled_rows.append(sampled_rows)

# Concatenate all sampled rows
df_sampled = pd.concat(all_sampled_rows, ignore_index=True)

print(f"Total images: {len(df_sampled):,}")
print(f"Unique typhoons: {df_sampled['id'].nunique()}")
print(f"Unique years: {df_sampled['year'].nunique()}")

Total images: 1,380
Unique typhoons: 30
Unique years: 30


In [10]:
# Copy sample images to output directory
images_src_dir = os.path.join(root, "image")
images_dst_dir = os.path.join(output_dir, "image")
os.makedirs(images_dst_dir, exist_ok=True)

copied = 0
not_found = []

for idx, row in df_sampled.iterrows():
    img_file = row['image_path']
    found = False
    
    for root_dir, dirs, files in os.walk(images_src_dir):
        if img_file in files:
            src = os.path.join(root_dir, img_file)
            rel_path = os.path.relpath(root_dir, images_src_dir)
            dst = os.path.join(images_dst_dir, rel_path, img_file)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)
            copied += 1
            found = True
            break
    
    if not found:
        not_found.append(img_file)
    

print(f"\nCopied {copied}/{len(df_sampled)} files")
if not_found:
    print(f"Warning: {len(not_found)} files not found")

# Save CSV
output_csv = os.path.join(output_dir, "aux_data.csv")
df_sampled.to_csv(output_csv, index=False)
print(f"\nSaved aux_data.csv to {output_dir}")



Copied 1380/1380 files

Saved aux_data.csv to /home/ogallo/DL4CV/WP


In [12]:
from torchgeo.datasets import DigitalTyphoon
dataset2 = DigitalTyphoon(
    root="/home/ogallo/DL4CV",
    features=["wind", "pressure"],
    targets=["wind", "pressure"],
    sequence_length=1,
    download=False
)