In [1]:
import os
import libsql
import json
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Load the processed team dataset from disk, report its (rows, columns)
# shape, and preview the first three records.
df = pd.read_csv(filepath_or_buffer='data_processed/full_data_team.csv')
print(df.shape)
df.head(n=3)

(218, 11)


Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent,I_445,I_515,I_630,I_clear
0,0.0,0.0,0.0,212,13.326,1,96,844,1689,2127,13266
1,0.9,0.5,0.6,212,13.911,1,100,19888,35221,30901,65535
2,1.0,0.8,0.2,212,13.911,1,100,7964,54316,34207,65535


In [3]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE: only a train/test split is made here -- there is no validation split.
from sklearn.model_selection import train_test_split

feature_cols = ['R', 'G', 'B', 'day_of_year', 'time_of_day', 'weather_index', 'cloud_cover_percent']
label_cols = ['I_445', 'I_515', 'I_630', 'I_clear']

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Separate the model inputs from the four I_* target columns in each partition.
df_X_train, df_Y_train = df_train[feature_cols], df_train[label_cols]
df_X_test, df_Y_test = df_test[feature_cols], df_test[label_cols]

print(df_X_train.shape)
df_X_train.head(3)

(174, 7)


Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent
84,0.0,0.315789,0.0,210,16.45,2,75
95,0.0,0.894737,0.0,210,16.467,2,75
111,0.684211,0.0,0.0,210,12.217,1,20


In [11]:
# Persist both partitions as CSV (row index omitted); the test file is
# written first, then the train file.
os.makedirs("data_split", exist_ok=True)
for csv_path, partition in (
    ("data_split/test_data.csv", df_test),
    ("data_split/train_data.csv", df_train),
):
    partition.to_csv(csv_path, index=False)

In [4]:
import time

# Timestamped output folder for the models trained in this run.
# Fields are ordered year-month-day-hour-minute so that folder names sort
# chronologically and read as a normal timestamp (the earlier pattern
# "%Y%H%M%m%d" placed hour/minute before month/day).
current_time = time.strftime("%Y%m%d%H%M")
folder_models = f"model{current_time}"

# exist_ok=True: re-running the cell within the same minute reuses the folder.
os.makedirs(folder_models, exist_ok=True)

In [5]:
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor

# One independent 100-tree random forest is fitted per target column and
# written to the run folder as rf_<target>.joblib. The fitted estimators are
# also kept in memory in `rf_models`, keyed by target name.
target_columns = df_Y_train.columns
rf_models = {}

for target in target_columns:
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    # fit() returns the fitted estimator itself, so it can be stored directly.
    rf_models[target] = forest.fit(df_X_train, df_Y_train[target])
    joblib.dump(rf_models[target], f"{folder_models}/rf_{target}.joblib")

In [6]:
# Predict on the test set with every individual tree of each saved forest.
# The across-tree mean is stored as the prediction and the across-tree
# standard deviation as a per-sample spread.
X_test_values = df_X_test.to_numpy()  # trees are queried with a plain array
pred_means = []
pred_stds = []

for target in target_columns:
    forest = joblib.load(f"{folder_models}/rf_{target}.joblib")

    # One row per tree, one column per test sample.
    per_tree = np.array([tree.predict(X_test_values) for tree in forest.estimators_])

    pred_means.append(per_tree.mean(axis=0))
    pred_stds.append(per_tree.std(axis=0))

# One column per target; rows stay in df_X_test order (positional index).
mean_names = [f"{col}_pred" for col in target_columns]
std_names = [f"{col}_std" for col in target_columns]
df_mean = pd.DataFrame(np.column_stack(pred_means), columns=mean_names)
df_std = pd.DataFrame(np.column_stack(pred_stds), columns=std_names)

# The feature index is reset so all three frames line up row-by-row.
df_pred_results = pd.concat([df_X_test.reset_index(drop=True), df_mean, df_std], axis=1)

df_pred_results.head(3)

Unnamed: 0,R,G,B,day_of_year,time_of_day,weather_index,cloud_cover_percent,I_445_pred,I_515_pred,I_630_pred,I_clear_pred,I_445_std,I_515_std,I_630_std,I_clear_std
0,0.105263,0.0,0.0,210,12.2,1,20,767.12,1497.6,6526.814545,13157.93,28.347938,43.165727,1305.43833,1442.733297
1,0.453,0.368,0.462,212,13.533,1,20,15844.208381,25282.135034,15740.75,57007.38,1185.713774,822.470764,1532.908291,5979.160339
2,0.147,0.372,0.394,210,17.617,1,20,12871.358413,25237.466999,7704.485568,43998.924367,20.269204,37.065902,13.633247,135.575115


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Training-set evaluation: reload each saved forest, predict on the training
# features, then score every target with MAE, MSE and R2.
target_columns = ['I_445', 'I_515', 'I_630', 'I_clear']

train_predictions = {}
for target in target_columns:
    forest = joblib.load(f"{folder_models}/rf_{target}.joblib")
    train_predictions[f'{target}_pred'] = forest.predict(df_X_train)

# Columns follow target_columns order; rows are positional (0..n-1), in the
# same order as df_X_train.
df_Y_train_pred = pd.DataFrame(train_predictions)

# One metrics record per target.
list_metric_rows = [
    {
        'target': target,
        'MAE': mean_absolute_error(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
        'MSE': mean_squared_error(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
        'R2': r2_score(df_Y_train[target], df_Y_train_pred[f'{target}_pred']),
    }
    for target in target_columns
]

df_metrics_train = pd.DataFrame(list_metric_rows)
df_metrics_train

Unnamed: 0,target,MAE,MSE,R2
0,I_445,117.801752,35191.19,0.999593
1,I_515,217.405454,136534.6,0.999633
2,I_630,220.763313,118004.2,0.998831
3,I_clear,845.450968,2295230.0,0.992686


In [8]:
# Test-set evaluation: reload each saved forest, predict on the held-out
# features, then score every target with MAE, MSE and R2.
target_columns = ['I_445', 'I_515', 'I_630', 'I_clear']

test_predictions = {}
for target in target_columns:
    forest = joblib.load(f"{folder_models}/rf_{target}.joblib")
    test_predictions[f'{target}_pred'] = forest.predict(df_X_test)

# Columns follow target_columns order; rows are positional (0..n-1), in the
# same order as df_X_test.
df_Y_test_pred = pd.DataFrame(test_predictions)

# One metrics record per target.
list_metric_rows = [
    {
        'target': target,
        'MAE': mean_absolute_error(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
        'MSE': mean_squared_error(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
        'R2': r2_score(df_Y_test[target], df_Y_test_pred[f'{target}_pred']),
    }
    for target in target_columns
]

df_metrics_test = pd.DataFrame(list_metric_rows)
df_metrics_test

Unnamed: 0,target,MAE,MSE,R2
0,I_445,324.843542,322672.3,0.994774
1,I_515,699.653407,1296509.0,0.996274
2,I_630,648.969098,1156548.0,0.983791
3,I_clear,1259.809997,4329946.0,0.982781


In [9]:
# Export the test-set predictions (features, per-target mean and std columns)
# to an Excel workbook, without the row index.
os.makedirs(name="results", exist_ok=True)
df_pred_results.to_excel(excel_writer="results/pred_test.xlsx", index=False)