In [14]:
import os
import libsql
import json
import pandas as pd
from dotenv import load_dotenv

In [15]:
# Load the processed dataset, then report its dimensions and preview the first rows

csv_path = 'data_processed/full_data.csv'
df = pd.read_csv(csv_path)

print(df.shape)
df.head(3)

(58, 11)


Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent,I_445,I_515,I_630,I_clear
0,0.0,0.0,0.0,212,13.326,1,96,844,1689,2127,13266
1,0.9,0.5,0.6,212,13.911,1,100,19888,35221,30901,65535
2,1.0,0.8,0.2,212,13.911,1,100,7964,54316,34207,65535


In [16]:
# Split into train and test partitions (80/20, fixed seed).
# Note: only two partitions are produced here; no separate validation set is carved out.
from sklearn.model_selection import train_test_split

# Column groups: model inputs and the columns the models are fitted to predict.
feature_cols = ['R', 'G', 'B', 'day_of_year', 'time_of_day', 'weather_index', 'cloud_cover_percent']
label_cols = ['I_445', 'I_515', 'I_630', 'I_clear']

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Slice each partition into its input (X) and target (Y) frames.
df_X_train, df_Y_train = df_train[feature_cols], df_train[label_cols]
df_X_test, df_Y_test = df_test[feature_cols], df_test[label_cols]

print(df_X_train.shape)
df_X_train.head(3)

(46, 7)


Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent
36,0.7,0.9,0.7,213,13.746,1,34
31,0.3,0.0,0.4,213,13.744,1,34
8,0.1,0.7,1.0,213,13.496,1,37


In [17]:
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor

# Fit one independent random forest per target column of df_Y_train, keep the
# fitted estimators in `rf_models` (keyed by target name) and persist each one
# to "model/rf_<target>.joblib" so later cells can reload them from disk.
target_columns = df_Y_train.columns
rf_models = {}

# joblib.dump does not create missing parent directories; without this the
# cell raises FileNotFoundError on a fresh checkout where "model/" is absent.
os.makedirs("model", exist_ok=True)

# Train each target
for target in target_columns:
    # Fixed random_state so re-running the cell reproduces the same forest.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(df_X_train, df_Y_train[target])
    rf_models[target] = model

    # Save the model
    joblib.dump(model, f"model/rf_{target}.joblib")


In [18]:
# Make predictions
#
# For each target, reload the persisted forest and query every tree on the test
# inputs; the per-row mean and standard deviation across trees are collected
# and assembled next to the test inputs in `df_pred_results`.
X_test_values = df_X_test.to_numpy()  # plain array handed to each individual tree
pred_means, pred_stds = [], []

for target in target_columns:
    model = joblib.load(f"model/rf_{target}.joblib")

    # Stacked along axis 0: one row of predictions per tree in the forest.
    per_tree = np.stack([est.predict(X_test_values) for est in model.estimators_], axis=0)

    pred_means.append(per_tree.mean(axis=0))
    pred_stds.append(per_tree.std(axis=0))

mean_cols = [f"{col}_pred" for col in target_columns]
std_cols = [f"{col}_std" for col in target_columns]

# vstack gives (n_targets, n_rows); transpose so each target becomes a column.
df_mean = pd.DataFrame(np.vstack(pred_means).T, columns=mean_cols)
df_std = pd.DataFrame(np.vstack(pred_stds).T, columns=std_cols)

# Reset the test index so the three frames line up positionally when concatenated.
df_pred_results = pd.concat([df_X_test.reset_index(drop=True), df_mean, df_std], axis=1)

df_pred_results.head(3)

Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent,I_445_pred,I_515_pred,I_630_pred,I_clear_pred,I_445_std,I_515_std,I_630_std,I_clear_std
0,0.0,0.0,0.0,212,13.326,1,96,2509.42,3360.25,2738.57,23189.83,2707.752364,4025.120295,1065.167923,10854.306146
1,0.1,0.8,0.4,212,13.912,1,100,12235.72,54781.95,6323.85,64426.22,2373.254736,2425.117863,1741.42833,3606.24349
2,1.0,0.3,0.8,213,13.745,1,34,25145.64,21301.78,32546.95,61585.92,1202.124357,1066.919421,588.77521,5757.048047


In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate on train data
target_columns = ['I_445', 'I_515', 'I_630', 'I_clear']

# Reload each persisted model and predict the training inputs; one
# "<target>_pred" column per target, in target order.
df_Y_train_pred = pd.DataFrame({
    f'{target}_pred': joblib.load(f"model/rf_{target}.joblib").predict(df_X_train)
    for target in target_columns
})

# Evaluate: one row of MAE / MSE / R2 per target, comparing the true training
# values with the predictions above.
list_metric_rows = [
    {
        'target': target,
        'MAE': mean_absolute_error(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
        'MSE': mean_squared_error(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
        'R2': r2_score(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
    }
    for target in target_columns
]

df_metrics_train = pd.DataFrame(list_metric_rows)
df_metrics_train


Unnamed: 0,target,MAE,MSE,R2
0,I_445,208.001522,153630.7,0.99753
1,I_515,382.529565,349935.6,0.999079
2,I_630,224.168696,94015.98,0.999004
3,I_clear,1395.887391,5265911.0,0.9668


In [20]:
# Evaluate on test data
target_columns = ['I_445', 'I_515', 'I_630', 'I_clear']

# Reload each persisted model and predict the held-out inputs; one
# "<target>_pred" column per target, in target order.
df_Y_test_pred = pd.DataFrame({
    f'{target}_pred': joblib.load(f"model/rf_{target}.joblib").predict(df_X_test)
    for target in target_columns
})

# Evaluate: one row of MAE / MSE / R2 per target, comparing the true test
# values with the predictions above.
list_metric_rows = [
    {
        'target': target,
        'MAE': mean_absolute_error(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
        'MSE': mean_squared_error(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
        'R2': r2_score(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
    }
    for target in target_columns
]

df_metrics_test = pd.DataFrame(list_metric_rows)
df_metrics_test


Unnamed: 0,target,MAE,MSE,R2
0,I_445,865.036667,1527713.0,0.984787
1,I_515,852.145,1137861.0,0.997871
2,I_630,778.704167,1029296.0,0.992276
3,I_clear,5730.634167,62427030.0,0.830483
