In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Read the raw CSV, parse 'valid_time' into datetimes and use it as the index
df = (
    pd.read_csv('dataset/new_dataset.csv')
    .assign(valid_time=lambda frame: pd.to_datetime(frame['valid_time']))
    .set_index('valid_time')
)

# Discard the columns that are not wanted for the analysis
df_clean = df.drop(['latitude', 'longitude', 'number', 'expver'], axis=1)

# Names of the df_clean columns that take part in the Monte Carlo simulation
cols_to_simulate = [
    'u10', 'v10', 'sp', 'tp',
    'skt', 'ssrd', 'hcc', 'lcc',
    'mcc', 'tcc', 'e', 'cp',
    'lsp', 'ptype', 'sf', 'z',
]

# How many random draws to generate per column
num_simulations = 1_000

# One (initially empty) result slot per simulated column
simulation_results = {name: list() for name in cols_to_simulate}

# Monte Carlo simulation
# Use a dedicated, seeded generator so that "Restart Kernel -> Run All"
# reproduces the same draws; the global np.random state is left untouched.
SEED = 42
rng = np.random.default_rng(SEED)

for col in cols_to_simulate:
    # Fit a normal distribution to the column through its sample mean and
    # sample standard deviation (pandas skips NaN in both by default).
    mean = df_clean[col].mean()
    std_dev = df_clean[col].std()

    # Draw num_simulations independent samples from the fitted normal.
    # NOTE(review): a normal distribution has unbounded support, so draws can
    # fall outside the range observed in the column (e.g. below zero) --
    # confirm that is acceptable for every variable in cols_to_simulate.
    simulated_values = rng.normal(loc=mean, scale=std_dev, size=num_simulations)
    simulation_results[col] = simulated_values

# Collect the draws into a DataFrame for easier analysis:
# one column per simulated variable, one row per draw.
simulation_df = pd.DataFrame(simulation_results)

# Calculate MSE for each variable
# NOTE(review): the simulated draws are independent of the row order of
# df_clean, so each MSE is roughly the spread of the actual slice around the
# fitted mean plus the variance of the draws -- it is not a forecast error.
# Confirm this is the metric that is actually wanted.
mse_results = {}
for col in cols_to_simulate:
    # Compare against the first num_simulations actual rows.
    actual_values = df_clean[col].values[:num_simulations]
    simulated = simulation_results[col]

    # mean_squared_error requires equal-length inputs; when the dataset has
    # fewer rows than num_simulations, trim both sides to the common length
    # instead of failing on the mismatch.
    n_compare = min(len(actual_values), len(simulated))
    mse = mean_squared_error(actual_values[:n_compare], simulated[:n_compare])
    mse_results[col] = mse

# Print MSE results
print("Mean Squared Error for each variable:")
for col, mse in mse_results.items():
    print(f"{col}: {mse}")


Mean Squared Error for each variable:
u10: 3.1480144875057405
v10: 1.9297425605019203
sp: 231794184.693344
tp: 3.077662069285573e-07
skt: 360.86002358302176
ssrd: 1750242224561.9722
hcc: 0.23249456769384683
lcc: 0.2081191311592575
mcc: 0.14577548818747843
tcc: 0.30334466204357097
e: 3.125806192108874e-08
cp: 1.1665506910928896e-07
lsp: 1.17097886163851e-07
ptype: 3.1738353747718717
sf: 5.7208190755198485e-09
z: 246145756.5042462
