# **Avalanche Risk Project**

Advanced Data Analytics, Fall 2025

The following project examines the feasibility of machine learning models to predict dry avalanche danger from spatial and meteorological features for the Engelberg Valley. In a second step, the model will be trained on the whole of Switzerland and tested as well. 

### **1. Data Collection**

I start by importing all necessary data from the different APIs including:

- SLF Bulletin Archive
- Meteo Swiss IMIS Data Archive
- SwissTopo Spatial Data

In [2]:
# Necessary Libraries 
import requests
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
import rasterio
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from bs4 import BeautifulSoup
import time


Apparently, the API only returns data from 2024 onwards. That means I have to scrape the data from the SLF archive using `Beautiful Soup`.

In [7]:
df = pd.read_csv('C:/users/fabio/Downloads/data_rf1_forecast.csv')
df

Unnamed: 0.1,Unnamed: 0,datum,station_code,sector_id,warnreg,elevation_station,forecast_initial_date,forecast_end_date,dangerLevel,elevation_th,...,ssi_pwl,sk38_pwl,sn38_pwl,ccl_pwl,ssi_pwl_100,sk38_pwl_100,sn38_pwl_100,ccl_pwl_100,Pen_depth,min_ccl_pen
0,0,1997-11-11,KES2,7113.0,15.0,2700.0,1997-11-11 17:00:00,1997-11-12 17:00:00,1.0,2000.0,...,2.02,1.02,1.92,0.30,2.02,1.02,1.92,0.30,44.028391,0.17
1,1,1997-11-11,SIM2,6113.0,15.0,2400.0,1997-11-11 17:00:00,1997-11-12 17:00:00,2.0,2000.0,...,6.00,6.00,6.00,4.00,6.00,6.00,6.00,4.00,37.271809,0.26
2,2,1997-11-11,DTR2,6113.0,15.0,2100.0,1997-11-11 17:00:00,1997-11-12 17:00:00,2.0,2000.0,...,1.44,0.44,1.37,0.12,1.44,0.44,1.37,0.12,38.369101,0.12
3,3,1997-11-11,MEI2,2221.0,15.0,2200.0,1997-11-11 17:00:00,1997-11-12 17:00:00,1.0,2000.0,...,6.00,6.00,6.00,0.20,6.00,6.00,6.00,0.20,20.400000,4.00
4,4,1997-11-11,SPN2,4232.0,15.0,2600.0,1997-11-11 17:00:00,1997-11-12 17:00:00,2.0,2000.0,...,6.00,6.00,6.00,4.00,6.00,6.00,6.00,4.00,42.332551,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292832,292832,2020-05-04,FIR2,1242.0,21.0,2100.0,2020-05-04 17:00:00,2020-05-05 17:00:00,2.0,1800.0,...,6.00,6.00,6.00,4.00,6.00,6.00,6.00,4.00,6.299643,3.00
292833,292833,2020-05-04,GRA2,1311.0,21.0,2000.0,2020-05-04 17:00:00,2020-05-05 17:00:00,2.0,1800.0,...,6.00,6.00,6.00,4.00,6.00,6.00,6.00,4.00,6.881834,3.00
292834,292834,2020-05-04,SHE2,1213.0,21.0,1900.0,2020-05-04 17:00:00,2020-05-05 17:00:00,2.0,1800.0,...,,,,,,,,,,
292835,292835,2020-05-04,ELS2,1231.0,21.0,2100.0,2020-05-04 17:00:00,2020-05-05 17:00:00,2.0,1800.0,...,,,,,,,,,,


In [13]:
print(df.columns)

Index(['Unnamed: 0', 'datum', 'station_code', 'sector_id', 'warnreg',
       'elevation_station', 'forecast_initial_date', 'forecast_end_date',
       'dangerLevel', 'elevation_th', 'set', 'Qs', 'Ql', 'TSG', 'Qg0', 'Qr',
       'OLWR', 'ILWR', 'LWR_net', 'OSWR', 'ISWR', 'Qw', 'pAlbedo', 'ISWR_h',
       'ISWR_diff', 'ISWR_dir', 'TA', 'TSS_mod', 'TSS_meas', 'T_bottom', 'RH',
       'VW', 'VW_drift', 'DW', 'MS_Snow', 'HS_mod', 'HS_meas', 'hoar_size',
       'wind_trans24', 'wind_trans24_7d', 'wind_trans24_3d', 'HN24', 'HN72_24',
       'HN24_7d', 'SWE', 'MS_water', 'MS_Wind', 'MS_Rain', 'MS_SN_Runoff',
       'MS_Sublimation', 'MS_Evap', 'TS0', 'TS1', 'TS2', 'Sclass2', 'zSd_mean',
       'Sd', 'zSn', 'Sn', 'zSs', 'Ss', 'zS4', 'S4', 'zS5', 'S5', 'pwl_100',
       'pwl_100_15', 'base_pwl', 'ssi_pwl', 'sk38_pwl', 'sn38_pwl', 'ccl_pwl',
       'ssi_pwl_100', 'sk38_pwl_100', 'sn38_pwl_100', 'ccl_pwl_100',
       'Pen_depth', 'min_ccl_pen'],
      dtype='object')


In [6]:
import requests
import pandas as pd
from io import StringIO
import time
from pathlib import Path

# Create data directory
Path("data").mkdir(exist_ok=True)

station_url = "https://measurement-data.slf.ch/imis/data/by_station/"

def fetch_available_stations():
    """
    Fetch list of available station CSV files from SLF
    """
      # <-- Full URL for listing
    
    try:
        response = requests.get(station_url, timeout=30)
        response.raise_for_status()
        
        # Parse HTML to find all CSV files
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all links ending with .csv (but not _pluvio.csv)
        csv_files = []
        for link in soup.find_all('a'):
            href = link.get('href', '')
            if href.endswith('.csv') and not href.endswith('_pluvio.csv'):
                filename = href.split('/')[-1]  # Takes 'ADE2.csv' from '/imis/data/by_station/ADE2.csv'
                csv_files.append(filename)  # href is just the filename like 'ADE2.csv'
        
        print(f"Found {len(csv_files)} station files")
        return csv_files
    
    except Exception as e:
        print(f"Error fetching station list: {e}")
        return []

def fetch_station_data(station_file):
    """
    Fetch data for a single station CSV file
    """
    # Build full URL: base + path + filename
    url = f"{station_url}{station_file}"  # <-- FIXED
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        # Read CSV
        df = pd.read_csv(StringIO(response.text), sep=';', low_memory=False)
        
        # Extract station code from filename (e.g., 'ADE2.csv' -> 'ADE2')
        station_code = station_file.replace('.csv', '')
        df['station_code'] = station_code
        
        return df
    
    except Exception as e:
        print(f"Error fetching {station_file}: {e}")
        return None

def fetch_all_imis_data():
    """
    Fetch and combine all IMIS station data
    """
    print("Fetching list of available stations...")
    station_files = fetch_available_stations()
    
    if not station_files:
        print("No station files found!")
        return None
    
    print(f"\nFetching data for {len(station_files)} stations...")
    print("This may take several minutes...\n")
    
    all_data = []
    successful = 0
    failed = 0
    
    for i, station_file in enumerate(station_files, 1):
        print(f"[{i}/{len(station_files)}] Fetching {station_file}...", end=' ')
        
        df = fetch_station_data(station_file)
        
        if df is not None and not df.empty:
            all_data.append(df)
            successful += 1
            print(f"({len(df)} rows)")
        else:
            failed += 1
            print("Failed")
        
        # Be respectful to the server
        time.sleep(0.5)
    
    print(f"\n{'='*60}")
    print(f"Data collection complete!")
    print(f"Successful: {successful}/{len(station_files)}")
    print(f"Failed: {failed}/{len(station_files)}")
    print(f"{'='*60}\n")
    
    if not all_data:
        print("No data collected!")
        return None
    
    # Combine all dataframes
    print("Combining all station data...")
    df_combined = pd.concat(all_data, ignore_index=True)
    
    print(f"Combined dataset shape: {df_combined.shape}")
    print(f"Columns: {list(df_combined.columns)}")
    print(f"Unique stations: {df_combined['station_code'].nunique()}")
    
    return df_combined

# Execute the data collection
df_all_stations = fetch_all_imis_data()

# Save to CSV
if df_all_stations is not None:
    output_file = 'data/imis_all_stations.csv'
    df_all_stations.to_csv(output_file, index=False)
    print(f"\n✓ Saved to: {output_file}")
    
    # Display basic info
    print(f"\nDataset Overview:")
    print(f"Total rows: {len(df_all_stations):,}")
    print(f"Total columns: {len(df_all_stations.columns)}")
    print(f"\nStations included:")
    print(df_all_stations['station_code'].value_counts().head(20))

Fetching list of available stations...
Found 249 station files

Fetching data for 249 stations...
This may take several minutes...

[1/249] Fetching ADE2.csv... (49714 rows)
[2/249] Fetching ADE3.csv... (86572 rows)
[3/249] Fetching ALB2.csv... (105770 rows)
[4/249] Fetching ALI1.csv... (366371 rows)
[5/249] Fetching ALI2.csv... (366255 rows)
[6/249] Fetching AMD1.csv... (488963 rows)
[7/249] Fetching AMD2.csv... (474407 rows)
[8/249] Fetching ANV1.csv... (490298 rows)
[9/249] Fetching ANV2.csv... (488028 rows)
[10/249] Fetching ANV3.csv... (487514 rows)
[11/249] Fetching ARO1.csv... (478877 rows)
[12/249] Fetching ARO2.csv... (470824 rows)
[13/249] Fetching ARO3.csv... (481778 rows)
[14/249] Fetching ATT2.csv... (397188 rows)
[15/249] Fetching AUE2.csv... (12259 rows)
[16/249] Fetching BAG1.csv... (50063 rows)
[17/249] Fetching BAR2.csv... (9955 rows)
[18/249] Fetching BED1.csv... (501100 rows)
[19/249] Fetching BED2.csv... (425053 rows)
[20/249] Fetching BED3.csv... (490576 rows)
[21

In [None]:
imis_df = pd.read_csv('data/imis_all_stations.csv')

In [11]:
df_Engelberg = df[df['station_code'] == 'TIT2']
df_Engelberg
print(df_Engelberg.shape)

(1013, 78)


In [12]:
df_Ebi = pd.read_csv('C:/users/fabio/Downloads/TIT2.csv')
df_Ebi

Unnamed: 0,station_code,measure_date,hyear,TA_30MIN_MEAN,VW_30MIN_MEAN,VW_30MIN_MAX,DW_30MIN_MEAN,RH_30MIN_MEAN,HS,TS0_30MIN_MEAN,TS25_30MIN_MEAN,TS50_30MIN_MEAN,TS100_30MIN_MEAN,RSWR_30MIN_MEAN,TSS_30MIN_MEAN
0,TIT2,1993-06-01 00:40:00+00:00,1993,,,,,,139.0,,,,,,
1,TIT2,1993-06-01 01:40:00+00:00,1993,,,,,,139.0,,,,,,
2,TIT2,1993-06-01 02:40:00+00:00,1993,,,,,,139.0,,,,,,
3,TIT2,1993-06-01 03:40:00+00:00,1993,,,,,,139.0,,,,,,
4,TIT2,1993-06-01 04:40:00+00:00,1993,,,,,,138.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400721,TIT2,2025-10-20 01:30:00+00:00,2026,5.9,1.8,3.9,101.0,39.1,-1.2,1.9,3.8,4.1,4.6,0.0,1.1
400722,TIT2,2025-10-20 02:00:00+00:00,2026,5.8,1.9,3.7,111.7,39.4,-1.2,1.5,3.5,3.9,4.5,0.0,0.6
400723,TIT2,2025-10-20 02:30:00+00:00,2026,6.4,2.5,3.9,104.9,37.1,-0.8,2.1,4.9,5.2,5.6,0.0,2.5
400724,TIT2,2025-10-20 03:00:00+00:00,2026,6.1,1.6,3.9,96.5,39.8,-1.1,2.2,4.4,4.7,5.0,0.0,2.3


In [16]:
import requests
import pandas as pd
import io
from datetime import datetime
import time
import json

def get_slf_data_aggregated(station_code, year, data_type="AGG"):
    """
    Get SLF data based on the actual structure shown in your image
    """
    # SLF typically uses API endpoints or specific data portals
    base_url = "https://measurement-data.slf.ch/api/v1"
    
    # Headers that might be needed for SLF API
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json',
    }
    
    # Try different endpoint approaches
    endpoints = [
        f"{base_url}/stations/{station_code}/data/{year}",
        f"{base_url}/data/station/{station_code}",
        f"https://measurement-data.slf.ch/imis/options.php?station={station_code}&year={year}"
    ]
    
    for url in endpoints:
        print(f"Trying: {url}")
        
        try:
            response = requests.get(url, headers=headers, timeout=30)
            
            if response.status_code == 200:
                # Try to parse as JSON first
                try:
                    data = response.json()
                    print(f"JSON response received for {station_code}")
                    return process_slf_json(data, station_code, year)
                except json.JSONDecodeError:
                    # Try CSV format
                    try:
                        df = pd.read_csv(io.StringIO(response.text))
                        print(f"CSV response received for {station_code}")
                        return process_slf_csv(df, station_code, year)
                    except:
                        print(f"Could not parse response as JSON or CSV")
                        return None
            
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {url}: {e}")
            continue
    
    print(f"No successful endpoints for {station_code}")
    return None

def process_slf_json(data, station_code, year):
    """Process JSON response from SLF API"""
    # This would need to be adapted based on actual API response
    print(f"Raw JSON keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}")
    return pd.DataFrame()  # Placeholder

def process_slf_csv(df, station_code, year):
    """Process CSV response from SLF"""
    print(f"CSV columns: {list(df.columns)}")
    print(f"Data shape: {df.shape}")
    
    # Handle European number format (comma as decimal separator)
    for col in df.columns:
        if df[col].dtype == 'object':
            # Try to convert comma decimals to dot decimals
            try:
                df[col] = df[col].str.replace(',', '.').astype(float)
            except:
                pass
    
    return df

def get_slf_data_direct_files():
    """
    Alternative approach: Direct file download based on your image structure
    """
    # From your image, it seems files are named AGG.csv
    base_files_url = "https://measurement-data.slf.ch/imis/data/files/"
    
    files_to_try = [
        "AGG.csv",
        "AGG_devicesv", 
        f"AGG_{datetime.now().year}.csv"
    ]
    
    for filename in files_to_try:
        url = f"{base_files_url}{filename}"
        print(f"Trying direct file: {url}")
        
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                df = pd.read_csv(io.StringIO(response.text))
                print(f"Successfully downloaded {filename}")
                return df
        except:
            continue
    
    return None

def test_slf_download():
    """Test SLF data download with realistic approaches"""
    
    # Stations that likely exist in SLF system
    test_stations = [
        "WFJ",  # Weissfluhjoch
        "DAV",  # Davos
        "ARB",  # Arosa
        "JFR",  # Jungfraujoch
        "SLF",  # SLF headquarters
    ]
    
    test_year = 2023
    
    print("Method 1: API Approach")
    print("=" * 60)
    
    for station in test_stations:
        print(f"\nTrying station: {station}")
        data = get_slf_data_aggregated(station, test_year)
        
        if data is not None and not data.empty:
            print(f"Success! Data shape: {data.shape}")
            print(f"Columns: {list(data.columns)}")
            print(data.head())
        else:
            print(f"No data retrieved for {station}")
        
        time.sleep(2)  # Be respectful to the server
    
    print("\n" + "=" * 60)
    print("Method 2: Direct File Download")
    print("=" * 60)
    
    direct_data = get_slf_data_direct_files()
    if direct_data is not None:
        print(f"Direct file data shape: {direct_data.shape}")
        print(direct_data.head())

# Additional function to handle the specific format from your image
def parse_slf_gainers_data(csv_text):
    """
    Parse data in the specific 'Gainers | Cells' format from your image
    """
    df = pd.read_csv(io.StringIO(csv_text), sep='|')
    
    # Clean column names and data
    df.columns = [col.strip() for col in df.columns]
    
    # Handle the specific format you showed
    if 'Gainers' in df.columns and 'Cells' in df.columns:
        # Convert European number format
        df['Cells'] = df['Cells'].str.replace(',', '.').astype(float)
        
        # Parse the Gainers column (appears to be dates/identifiers)
        df['Gainers_parsed'] = df['Gainers'].str.strip()
    
    return df

# Run the test
if __name__ == "__main__":
    test_slf_download()

Method 1: API Approach

Trying station: WFJ
Trying: https://measurement-data.slf.ch/api/v1/stations/WFJ/data/2023
Trying: https://measurement-data.slf.ch/api/v1/data/station/WFJ
Trying: https://measurement-data.slf.ch/imis/options.php?station=WFJ&year=2023
No successful endpoints for WFJ
No data retrieved for WFJ

Trying station: DAV
Trying: https://measurement-data.slf.ch/api/v1/stations/DAV/data/2023
Trying: https://measurement-data.slf.ch/api/v1/data/station/DAV
Trying: https://measurement-data.slf.ch/imis/options.php?station=DAV&year=2023
No successful endpoints for DAV
No data retrieved for DAV

Trying station: ARB
Trying: https://measurement-data.slf.ch/api/v1/stations/ARB/data/2023
Trying: https://measurement-data.slf.ch/api/v1/data/station/ARB
Trying: https://measurement-data.slf.ch/imis/options.php?station=ARB&year=2023
No successful endpoints for ARB
No data retrieved for ARB

Trying station: JFR
Trying: https://measurement-data.slf.ch/api/v1/stations/JFR/data/2023
Trying: ht

In [8]:
# Fetch all bulletins from 2014-2024 for Davos region
start_date = datetime(2014, 11, 1)  # Winter season starts in November
end_date = datetime(2024, 5, 31)    # Winter season ends in May
current_date = start_date

# Extract data into list of records
records = []
failed_dates = []
all_region_names = set()  # To track unique region names

print("Fetching bulletins from 2014 to 2024 for Davos region...")
print(f"Date range: {start_date.date()} to {end_date.date()}\n")

# Iterate through all dates
while current_date <= end_date:
    try:
        bulletin_json = fetch_bulletin_json(current_date)
        
        # Check if bulletins exist
        if not bulletin_json.get('bulletins'):
            failed_dates.append(current_date.date())
            current_date += timedelta(days=1)
            continue
        
        for bulletin in bulletin_json.get('bulletins', []):
            regions = bulletin.get('regions', [])
            
            # Collect region names and IDs
            region_names = []
            region_ids = []
            for r in regions:
                name = r.get('name', '')
                region_id = r.get('regionId', '')
                region_names.append(name)
                region_ids.append(region_id)
                all_region_names.add(name)
            
            # Filter for Davos region - check both name and ID
            # Davos region ID is typically "CH-7114" or similar
            is_davos = any(
                'davos' in name.lower() or 
                'davos' in rid.lower() or
                'CH-7114' in rid or
                'CH-7115' in rid
                for name, rid in zip(region_names, region_ids)
            )
            
            if is_davos:
                danger_ratings = bulletin.get('dangerRatings', [])
                for rating in danger_ratings:
                    records.append({
                        'date': current_date.date(),
                        'regions': ', '.join(region_names),
                        'region_ids': ', '.join(region_ids),
                        'danger_level': rating.get('mainValue', 'N/A'),
                        'elevation_lower': rating.get('validElevation', {}).get('lowerBound'),
                        'elevation_upper': rating.get('validElevation', {}).get('upperBound'),
                        'aspects': ', '.join(rating.get('aspects', []))
                    })
        
        # Print progress every 100 days
        if (current_date - start_date).days % 100 == 0:
            print(f"Progress: {current_date.date()} - Records collected: {len(records)}")
            
    except Exception as e:
        failed_dates.append(current_date.date())
    
    # Move to next day
    current_date += timedelta(days=1)

# Convert to DataFrame
df_bulletins = pd.DataFrame(records)

print(f"\n{'='*60}")
print(f"Data collection complete!")
print(f"Total records: {len(records)}")
print(f"Failed dates: {len(failed_dates)}")
print(f"DataFrame shape: {df_bulletins.shape}")
if len(df_bulletins) > 0:
    print(f"Date range in data: {df_bulletins['date'].min()} to {df_bulletins['date'].max()}")
print(f"Unique region names found: {len(all_region_names)}")
print(f"First few region names: {list(all_region_names)[:10]}")
print(f"{'='*60}\n")

df_bulletins.head(10)

Fetching bulletins from 2014 to 2024 for Davos region...
Date range: 2014-11-01 to 2024-05-31

Progress: 2023-11-14 - Records collected: 14
Progress: 2023-11-14 - Records collected: 14
Progress: 2024-02-22 - Records collected: 114
Progress: 2024-02-22 - Records collected: 114

Data collection complete!
Total records: 209
Failed dates: 3294
DataFrame shape: (209, 7)
Date range in data: 2023-11-01 to 2024-05-31
Unique region names found: 142
First few region names: ['Génépi', 'nördliches Tujetsch', 'Jungfrau - Schilthorn', 'Bernina', 'Stoos', 'Toggenburg', 'Blüemlisalp', 'Saint-Cergue', 'Maderanertal', 'Val dal Spöl']


Data collection complete!
Total records: 209
Failed dates: 3294
DataFrame shape: (209, 7)
Date range in data: 2023-11-01 to 2024-05-31
Unique region names found: 142
First few region names: ['Génépi', 'nördliches Tujetsch', 'Jungfrau - Schilthorn', 'Bernina', 'Stoos', 'Toggenburg', 'Blüemlisalp', 'Saint-Cergue', 'Maderanertal', 'Val dal Spöl']



Unnamed: 0,date,regions,region_ids,danger_level,elevation_lower,elevation_upper,aspects
0,2023-11-01,"Bex-Villars, Wildhorn, Iffigen, Engstligen, Bl...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
1,2023-11-02,"Gadmertal, Engelberg, Schächental, Uri Rot Sto...",", , , , , , , , , , , , , , , , , , , , , ,",moderate,,,
2,2023-11-03,"Gadmertal, Engelberg, Schächental, Uri Rot Sto...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
3,2023-11-04,"Engstligen, Blüemlisalp, Jungfrau - Schilthorn...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
4,2023-11-05,"Waadtländer Voralpen, Jaun, Hohgant, Niedersim...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
5,2023-11-06,"Waadtländer Voralpen, Jaun, Hohgant, Niedersim...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
6,2023-11-07,"Pays d'Enhaut, Aigle-Leysin, Gstaad, Lenk, Ade...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
7,2023-11-08,"Pays d'Enhaut, Aigle-Leysin, Gstaad, Lenk, Ade...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
8,2023-11-09,"Pays d'Enhaut, Aigle-Leysin, Gstaad, Lenk, Eng...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
9,2023-11-10,"Ybrig, Stoos, Bisistal, Glarus Nord, Glarus Sü...",", , , , , , , , , , , , , , , , , , , , , , , ...",moderate,,,
