In [187]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime, timedelta
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [188]:
#df = pd.read_csv('data_by_location/Kasurila_data.csv') # Error 8.88
#df = pd.read_csv('data_by_location/Levi_data.csv') # Error 17.05
#df = pd.read_csv('data_by_location/Luosto_data.csv') # Error 13.97
#df = pd.read_csv('data_by_location/Messila_data.csv') # Error 6.61
#df = pd.read_csv('data_by_location/Mustavaara_data.csv') # Error 9.38
#df = pd.read_csv('data_by_location/Ounasvaara_data.csv') # Error 12.27
#df = pd.read_csv('data_by_location/Purnu_data.csv') # Error 8.10
#df = pd.read_csv('data_by_location/Ruka_data.csv') # Error 11.46
#df = pd.read_csv('data_by_location/Ruunarinteet_data.csv') # Error 7.28
#df = pd.read_csv('data_by_location/Salla_data.csv') # Error 13.26

In [189]:
def preprocess_with_time_features(file_path):
    """
    Load one location's CSV and prepare it for modelling.

    The file is cleaned ('-' placeholders become NaN, the -1 snow-depth
    sentinel becomes 0), de-duplicated per date, ordered chronologically and
    extended with lagged snow-depth columns and calendar features.

    Parameters:
    - file_path: Path to a CSV containing at least the columns 'date',
      'snow_depth_cm', 'avg_temp_c' and 'uv_index'

    Returns:
    - DataFrame sorted by date with numeric measurement columns, the lag
      columns 'snow_depth_1d_ago' / 'snow_depth_7d_ago' /
      'snow_depth_365d_ago', and 'year', 'month', 'day', 'day_of_year'.
      Rows without a usable snow depth are removed. The original row labels
      are kept (the index is not reset).
    """
    # '-' marks a missing value in the raw files
    df = pd.read_csv(file_path).replace(to_replace='-', value=np.nan)

    # Parse dates before de-duplicating and ordering so both operate on real
    # timestamps instead of raw strings
    df['date'] = pd.to_datetime(df['date'])

    # Keep the last record per date and drop rows without a snow depth.
    # .copy() gives an independent frame so the column assignments below
    # never write into a view of the raw data.
    df = (
        df.drop_duplicates(subset=['date'], keep='last')
          .dropna(subset=['snow_depth_cm'])
          .copy()
    )

    # Convert to numeric; unparsable values become NaN
    for column in ('avg_temp_c', 'snow_depth_cm', 'uv_index'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Map the -1 sentinel to 0 AFTER the numeric conversion. Matching the
    # string '-1' alone misses files whose column was already parsed as
    # numbers (no '-' placeholders), leaving -1 in the target.
    df['snow_depth_cm'] = df['snow_depth_cm'].replace(-1, 0)

    # Lags are positional, so the rows must be in chronological order first.
    # A stable sort keeps the file order for equal timestamps.
    df = df.sort_values('date', kind='stable')

    # Create lag features. shift(N) looks N rows back, which equals N calendar
    # days only where the remaining rows are consecutive daily records.
    df['snow_depth_1d_ago'] = df['snow_depth_cm'].shift(1)
    df['snow_depth_7d_ago'] = df['snow_depth_cm'].shift(7)
    df['snow_depth_365d_ago'] = df['snow_depth_cm'].shift(365)

    # Extract time-based features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_year'] = df['date'].dt.dayofyear

    # Drop rows whose target could not be converted to a number
    df = df.dropna(subset=['snow_depth_cm'])

    return df

In [190]:
# Combine 4 location datasets for training the model
datasets = [
    'data_by_location/Messila_data.csv',
    'data_by_location/Purnu_data.csv',
    'data_by_location/Ruunarinteet_data.csv',
    'data_by_location/Kasurila_data.csv'
]

all_data = []

# Best-effort load: a file that cannot be processed is reported and skipped
# so the remaining locations can still be used.
for dataset in datasets:
    try:
        df = preprocess_with_time_features(dataset)
        all_data.append(df)
    except Exception as e:
        print(f"Error processing {dataset}: {e}")

# Fail with an explicit message when nothing could be loaded, instead of the
# generic error pd.concat raises for an empty list.
if not all_data:
    raise ValueError(
        "None of the location datasets could be processed; "
        "see the errors printed above."
    )

# Row labels are kept from the individual files, so index values repeat
# across locations in the combined frame.
combined_df = pd.concat(all_data)

In [191]:
# Inspect which columns the combined training frame contains
combined_df.columns

Index(['date', 'snow_depth_cm', 'avg_temp_c', 'uv_index', 'cloud_cover_rate',
       'cloud_cover', 'location', 'snow_depth_1d_ago', 'snow_depth_7d_ago',
       'snow_depth_365d_ago', 'year', 'month', 'day', 'day_of_year'],
      dtype='object')

In [192]:
# Spot-check 15 randomly chosen rows. No random_state is passed, so the rows
# shown are expected to differ between runs.
combined_df.sample(15)

Unnamed: 0,date,snow_depth_cm,avg_temp_c,uv_index,cloud_cover_rate,cloud_cover,location,snow_depth_1d_ago,snow_depth_7d_ago,snow_depth_365d_ago,year,month,day,day_of_year
2713,2010-08-13,0.0,20.5,,1.0,Clear,Purnu,0.0,0.0,0.0,2010,8,13,225
2150,2009-01-27,22.0,-4.4,0.0,8.0,Cloudy,Kasurila,20.0,10.0,17.0,2009,1,27,27
942,2005-10-07,0.0,8.3,,0.0,Clear,Messila,0.0,0.0,0.0,2005,10,7,280
4121,2014-06-21,0.0,8.4,1.0,2.0,Mostly clear,Ruunarinteet,0.0,0.0,0.0,2014,6,21,172
4313,2014-12-30,14.0,-4.4,0.0,7.0,Mostly cloudy,Purnu,12.0,12.0,0.0,2014,12,30,364
5228,2017-07-02,0.0,18.1,1.3,0.0,Clear,Kasurila,0.0,0.0,0.0,2017,7,2,183
2751,2010-09-20,0.0,9.8,,7.0,Mostly cloudy,Kasurila,0.0,0.0,0.0,2010,9,20,263
5143,2017-04-08,14.0,-0.1,0.6,8.0,Cloudy,Ruunarinteet,17.0,26.0,7.0,2017,4,8,98
1795,2008-02-07,33.0,-0.1,0.0,8.0,Cloudy,Purnu,34.0,24.0,43.0,2008,2,7,38
5416,2018-01-06,41.0,0.2,0.0,7.0,Mostly cloudy,Kasurila,43.0,48.0,21.0,2018,1,6,6


In [193]:
# Check the dtype of every column after preprocessing
combined_df.dtypes

date                   datetime64[ns]
snow_depth_cm                 float64
avg_temp_c                    float64
uv_index                      float64
cloud_cover_rate              float64
cloud_cover                    object
location                       object
snow_depth_1d_ago             float64
snow_depth_7d_ago             float64
snow_depth_365d_ago           float64
year                            int32
month                           int32
day                             int32
day_of_year                     int32
dtype: object

In [194]:
# Remove duplicates across the combined frame: when the same (date, location)
# pair occurs more than once, only the last occurrence is kept.
combined_df = combined_df.drop_duplicates(subset=['date', 'location'], keep='last')

In [195]:
# (rows, columns) of the combined frame after de-duplication
combined_df.shape

(30076, 14)

In [196]:
# Number of rows contributed by each location
combined_df.location.value_counts()

location
Ruunarinteet    8021
Messila         7843
Purnu           7163
Kasurila        7049
Name: count, dtype: int64

In [197]:
# Columns fed to the regressor: weather measurements, lagged snow depths and
# calendar features, in the order the model is trained on.
feature_columns = [
    'avg_temp_c',
    'cloud_cover_rate',
    'uv_index',
    'snow_depth_1d_ago',
    'snow_depth_7d_ago',
    'snow_depth_365d_ago',
    'year',
    'month',
    'day',
    'day_of_year',
]

X = combined_df[feature_columns]  # Features
y = combined_df['snow_depth_cm']  # Target variable (snow depth)

# Hold out 20% of the rows for testing; the remaining 80% are used to train.
# NOTE(review): the split is a seeded random split, not a chronological one,
# and the features include the previous day's snow depth. Test rows can
# therefore sit next to training rows in time, which may make the reported
# error optimistic for real forecasting - consider a date-based split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor with 100 trees, fitted on the training portion only
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model so later cells can reload it from disk
joblib.dump(model, 'snow_depth_time_model.joblib')

['snow_depth_time_model.joblib']

In [198]:
# Predict snow depth for the held-out test rows; the result is compared with
# y_test when the error metrics are computed.
predictions = model.predict(X_test)

In [199]:
# Score the held-out predictions: mean absolute error and mean squared error
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

# Report both metrics
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 0.7393424588231221
Mean Squared Error: 3.407291535851011


In [200]:
# Function to predict future years
def predict_future_snow_depth(model, start_date, days=365, location_data=None):
    """
    Predict snow depth for a run of consecutive dates

    Parameters:
    - model: Trained model exposing predict(); it receives the feature columns
      avg_temp_c, cloud_cover_rate, uv_index, snow_depth_1d_ago,
      snow_depth_7d_ago, snow_depth_365d_ago, year, month, day, day_of_year
    - start_date: Start date for predictions (string YYYY-MM-DD, datetime or
      pandas Timestamp)
    - days: Number of days to predict forward
    - location_data: Optional historical data for one location. When given it
      must contain the columns date (datetime), snow_depth_cm, day_of_year,
      avg_temp_c, uv_index and cloud_cover_rate. When omitted, weather values
      come from a simplified seasonal formula and the lag features are NaN.

    Returns:
    - DataFrame with the columns date, predicted_snow_depth (clipped at 0),
      avg_temp_c, uv_index and cloud_cover_rate
    """
    lag_columns = ['snow_depth_1d_ago', 'snow_depth_7d_ago', 'snow_depth_365d_ago']
    feature_columns = [
        'avg_temp_c',
        'cloud_cover_rate',
        'uv_index',
        *lag_columns,
        'year',
        'month',
        'day',
        'day_of_year',
    ]

    # Accept strings as well as datetime-like objects
    start_date = pd.to_datetime(start_date)

    # One row per predicted day
    future_dates = [start_date + pd.Timedelta(days=i) for i in range(days)]
    future_df = pd.DataFrame({'date': future_dates})

    if location_data is not None:
        # Rebuild the lag features from the observed snow depths (positional
        # shifts, i.e. N rows back) and attach them to matching dates
        past_snow_data = location_data[['date', 'snow_depth_cm']].copy()
        past_snow_data['snow_depth_1d_ago'] = past_snow_data['snow_depth_cm'].shift(1)
        past_snow_data['snow_depth_7d_ago'] = past_snow_data['snow_depth_cm'].shift(7)
        past_snow_data['snow_depth_365d_ago'] = past_snow_data['snow_depth_cm'].shift(365)

        future_df = future_df.merge(
            past_snow_data[['date'] + lag_columns], on='date', how='left'
        )
    else:
        # Without history there are no observed snow depths. The columns must
        # still exist, otherwise selecting the model features below raises
        # KeyError.
        for column in lag_columns:
            future_df[column] = np.nan

    # Extract time features
    future_df['year'] = future_df['date'].dt.year
    future_df['month'] = future_df['date'].dt.month
    future_df['day'] = future_df['date'].dt.day
    future_df['day_of_year'] = future_df['date'].dt.dayofyear

    if location_data is not None:
        # Weather features: historical mean per day of year
        daily_averages = location_data.groupby('day_of_year').agg({
            'avg_temp_c': 'mean',
            'uv_index': 'mean',
            'cloud_cover_rate': 'mean'
        }).reset_index()

        future_df = future_df.merge(daily_averages, on='day_of_year', how='left')

        # Fill gaps from the previous row, then from the next one.
        # ffill()/bfill() replace the deprecated fillna(method=...) form.
        # The fill covers every column, so lag features of dates with no
        # observation also take the neighbouring row's values.
        future_df = future_df.ffill().bfill()
    else:
        # If no location data is provided, use seasonal patterns
        # This is simplified - would be better with actual historical weather data
        month_rad = future_df['month'] * 2 * np.pi / 12

        # Temperature follows a cosine cycle over the month number
        future_df['avg_temp_c'] = -10 * np.cos(month_rad) + 5

        # UV uses the same cycle shifted by half a period
        future_df['uv_index'] = 3 * np.cos(month_rad + np.pi) + 3

        # Cloud cover (simplified)
        # NOTE(review): this yields values between 0.3 and 0.7; confirm that
        # this matches the scale of cloud_cover_rate in the training data.
        future_df['cloud_cover_rate'] = 0.5 + 0.2 * np.sin(month_rad)

    # Prepare features for prediction in the same format as training data
    X_future = future_df[feature_columns]

    # Make predictions and ensure non-negative snow depths
    future_df['predicted_snow_depth'] = model.predict(X_future)
    future_df['predicted_snow_depth'] = future_df['predicted_snow_depth'].clip(lower=0)

    return future_df[['date',
                      'predicted_snow_depth',
                      'avg_temp_c',
                      'uv_index',
                      'cloud_cover_rate']]

In [201]:
# Create prediction files for locations
# 1. Load the trained model from disk
trained_model = joblib.load('snow_depth_time_model.joblib')

# 2. Load the Salla dataset; it supplies the historical snow depths and the
#    per-day-of-year weather averages used as inputs
sample_location_salla = preprocess_with_time_features('data_by_location/Salla_data.csv')

# 3. Predict 365*40 consecutive days (roughly 40 years; leap days are not
#    counted) starting from the chosen date, then tag every row with the
#    location name
start_prediction_date = '2005-01-01'
future_predictions_salla = predict_future_snow_depth(
    model=trained_model,
    start_date=start_prediction_date,
    days=365*40,
    location_data=sample_location_salla,
).assign(location='Salla')

  future_df = future_df.fillna(method='ffill').fillna(method='bfill')


In [202]:
# Write the Salla predictions to a CSV file (row index omitted) and confirm.
# NOTE(review): the file name mentions 2025 while the predictions start at
# start_prediction_date - confirm the name is still the intended one.
future_predictions_salla.to_csv('snow_depth_predictions_Salla_2025.csv', index=False)
print("Future predictions saved csv-file")

Future predictions saved csv-file
