In [1]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
import joblib

# List of datasets to use for training
datasets = [
    'data_by_location/Messila_data.csv',
    'data_by_location/Purnu_data.csv',
    'data_by_location/Ruunarinteet_data.csv',
    'data_by_location/Kasurila_data.csv'
]

# Function to load and preprocess a dataset
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path)
    
    # Remove duplicates
    df = df.drop_duplicates(subset=['date'], keep='last')
    
    # Handle missing values
    df.replace(to_replace='-', value=np.nan, inplace=True)
    df.snow_depth_cm = df.snow_depth_cm.replace(to_replace='-1', value='0')
    
    # Convert to numeric
    df['avg_temp_c'] = pd.to_numeric(df['avg_temp_c'], errors='coerce')
    df['snow_depth_cm'] = pd.to_numeric(df['snow_depth_cm'], errors='coerce')
    df['uv_index'] = pd.to_numeric(df['uv_index'], errors='coerce')
    
    # Drop rows with missing target values
    df = df.dropna(subset=['snow_depth_cm'])
    
    return df

# Initialize empty lists for combined features and target
X_combined = []
y_combined = []

# Load and combine datasets
for dataset_path in datasets:
    try:
        print(f"Processing {dataset_path}...")
        df = load_and_preprocess(dataset_path)
        
        # Extract features and target
        X = df[['avg_temp_c', 'uv_index', 'cloud_cover_rate']]
        y = df['snow_depth_cm']
        
        # Append to combined lists
        X_combined.append(X)
        y_combined.append(y)
        
    except Exception as e:
        print(f"Error processing {dataset_path}: {e}")

# Combine all datasets
X_all = pd.concat(X_combined)
y_all = pd.concat(y_combined)

# Handle any remaining NaN values 
X_all = X_all.fillna(X_all.mean())

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Create and train the model
model = HistGradientBoostingRegressor(max_iter=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Save the trained model
model_filename = 'snow_depth_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# Example of how to load the model for future use
# loaded_model = joblib.load(model_filename)
# new_predictions = loaded_model.predict(new_data)

Processing data_by_location/Messila_data.csv...
Processing data_by_location/Purnu_data.csv...
Processing data_by_location/Ruunarinteet_data.csv...
Processing data_by_location/Kasurila_data.csv...
Mean Absolute Error: 7.355937742968046
Model saved to snow_depth_model.joblib


In [None]:
def preprocess_new_data(file_path):
    """Preprocess a new dataset using the same steps as training data"""
    df = pd.read_csv(file_path)
    
    # Remove duplicates
    df = df.drop_duplicates(subset=['date'], keep='last')
    
    # Handle missing values
    df.replace(to_replace='-', value=np.nan, inplace=True)
    df.snow_depth_cm = df.snow_depth_cm.replace(to_replace='-1', value='0')
    
    # Convert to numeric
    df['avg_temp_c'] = pd.to_numeric(df['avg_temp_c'], errors='coerce')
    df['snow_depth_cm'] = pd.to_numeric(df['snow_depth_cm'], errors='coerce')
    df['uv_index'] = pd.to_numeric(df['uv_index'], errors='coerce')
    
    # Create feature set (same features used in training)
    X = df[['avg_temp_c', 'uv_index', 'cloud_cover_rate']]
    
    # Handle any NaN values (using the same approach as in training)
    X = X.fillna(X.mean())
    
    return df, X

# Load the saved model
loaded_model = joblib.load('snow_depth_model.joblib')

# Example 1: Make predictions on a new dataset
new_data_path = 'data_by_location/Levi_data.csv'
original_df, features = preprocess_new_data(new_data_path)

# Make predictions
predictions = loaded_model.predict(features)

# Add predictions to the original dataframe
original_df['predicted_snow_depth'] = predictions

# Calculate error if actual snow depth is available
if 'snow_depth_cm' in original_df.columns:
    # Drop rows with NaN in snow_depth_cm for evaluation
    eval_df = original_df.dropna(subset=['snow_depth_cm'])
    mae = np.mean(np.abs(eval_df['snow_depth_cm'] - eval_df['predicted_snow_depth']))
    print(f"Mean Absolute Error for {new_data_path}: {mae}")

# Example 2: Make predictions for specific weather conditions
# For a single prediction or custom input
custom_conditions = pd.DataFrame({
    'avg_temp_c': [-5.0],  # Average temperature in Celsius
    'uv_index': [1.5],     # UV index
    'cloud_cover_rate': [0.7]  # Cloud cover rate (0-1)
})

custom_prediction = loaded_model.predict(custom_conditions)
print(f"Predicted snow depth for given conditions: {custom_prediction[0]:.2f} cm")

# Example 3: Batch process multiple datasets
test_datasets = [
    'data_by_location/Ruka_data.csv',
    'data_by_location/Salla_data.csv'
]

for dataset in test_datasets:
    try:
        df, X = preprocess_new_data(dataset)
        predictions = loaded_model.predict(X)
        
        # Optionally save results to CSV
        df['predicted_snow_depth'] = predictions
        output_file = dataset.replace('.csv', '_predictions.csv')
        df.to_csv(output_file, index=False)
        print(f"Predictions for {dataset} saved to {output_file}")
        
    except Exception as e:
        print(f"Error processing {dataset}: {e}")