import


In [15]:
import sys
import os
import platform

# Environment check: report the running Python version and print PyTorch
# installation hints that match it.
print("🔍 Controllo ambiente Python / Colab / PyTorch...")
print("--------------------------------------------------")

# Python version.
# py_ver is kept as the human-readable string for the log line; major/minor
# come from sys.version_info so parsing never depends on how many
# dot-separated parts the version string happens to contain.
py_ver = sys.version.split(" ")[0]
major, minor = (str(part) for part in sys.version_info[:2])

print(f"➡️ Versione Python rilevata: {py_ver}")

# Check the version against the range this notebook treats as
# PyTorch-compatible.
# NOTE(review): the 3.8–3.11 range is this notebook's own assumption —
# confirm it against the release notes of the PyTorch version being pinned.
supported_minor = ["8", "9", "10", "11"]  # Python 3.8–3.11
is_supported = major == "3" and minor in supported_minor

if not is_supported:
    # Report the version that was actually detected instead of a hard-coded
    # "3.12", so the message stays correct on any unsupported interpreter.
    print(f"\n❌ PyTorch NON è compatibile con Python {major}.{minor}.")
    print(f"❌ Colab ha aggiornato il runtime a Python {major}.{minor}, che PyTorch NON supporta.")
    print("\n🔥 Soluzione: usa Colab Legacy (Python 3.10):")
    print("👉 https://colab.research.google.com/?runtime=python3.10")
    print("\n📌 Dopo aver aperto Colab Legacy, installa PyTorch con:")
    print("!pip install torch==2.2.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
else:
    print("\n✅ Puoi installare PyTorch normalmente.")
    print("Esempio (GPU):")
    print("!pip install torch==2.2.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
    print("\nOppure (CPU only):")
    print("!pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cpu")

    #------------

from pathlib import Path
import numpy as np
import pandas as pd
from dateutil import parser

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Select the compute device: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


# Directory of this file when run as a script; inside a notebook `__file__`
# does not exist, so fall back to the current working directory.
try:
    __file__
except NameError:
    NOTEBOOK_DIR = Path.cwd().resolve()
else:
    NOTEBOOK_DIR = Path(__file__).resolve().parent

def find_project_root(start: Path) -> Path:
    """Return the nearest directory that contains both data folders.

    The search begins at ``start`` and walks upwards through its parents; the
    first directory holding both a ``raw_data`` and a ``processed`` entry wins.

    Raises:
        FileNotFoundError: if no directory on the way up contains both entries.
    """
    candidates = (start, *start.parents)
    found = next(
        (
            folder
            for folder in candidates
            if (folder / "raw_data").exists() and (folder / "processed").exists()
        ),
        None,
    )
    if found is None:
        raise FileNotFoundError(f"raw_data/processed non trovate partendo da {start}")
    return found

# Hard-coded project layout: raw inputs are read from DATA_DIR and processed
# outputs are written under PROCESSED_DIR, both derived from PROJECT_ROOT.
PROJECT_ROOT = Path("/content/pv_project")
DATA_DIR = PROJECT_ROOT / "raw_data"
PROCESSED_DIR = PROJECT_ROOT / "processed"

# Create the output folder (and any missing parents) up front.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): PROJECT_ROOT is rebound here AFTER DATA_DIR and PROCESSED_DIR
# were derived from the previous value, so the PROJECT_ROOT printed below is
# not the parent of the two directories printed after it. Confirm which root
# is intended and derive all three paths from it.
PROJECT_ROOT = Path("/content/PV_power-forecasting")


# Echo the resolved configuration so it is visible in the cell output.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)






🔍 Controllo ambiente Python / Colab / PyTorch...
--------------------------------------------------
➡️ Versione Python rilevata: 3.12.12

❌ PyTorch NON è compatibile con Python 3.12.
❌ Colab ha aggiornato il runtime a Python 3.12, che PyTorch NON supporta.

🔥 Soluzione: usa Colab Legacy (Python 3.10):
👉 https://colab.research.google.com/?runtime=python3.10

📌 Dopo aver aperto Colab Legacy, installa PyTorch con:
!pip install torch==2.2.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


RuntimeError: Trying to override a python impl for DispatchKey.Meta on operator aten::broadcast_tensors

preprocessing

In [14]:


# ============================================================
# 1. LOAD PV
# ============================================================

def load_pv():
    """Load the PV production workbook into a two-column, time-sorted frame.

    Reads the first two sheets of ``DATA_DIR / "pv_dataset.xlsx"`` and stacks
    them. The column named "Max kWp" is used as the timestamp column and the
    first other column as the production value.

    Returns:
        DataFrame with columns ``pv_date`` (datetime; unparseable values become
        NaT) and ``Production_KWh``, sorted by ``pv_date`` with a fresh index.
    """
    workbook = DATA_DIR / "pv_dataset.xlsx"

    sheets = [pd.read_excel(workbook, sheet_name=sheet) for sheet in (0, 1)]
    pv = pd.concat(sheets, ignore_index=True)

    timestamp_col = "Max kWp"
    other_cols = [name for name in pv.columns if name != timestamp_col]
    value_col = other_cols[0]

    pv = pv.rename(columns={timestamp_col: "pv_date", value_col: "Production_KWh"})

    # "mixed" lets pandas infer the format per element; failures become NaT.
    pv["pv_date"] = pd.to_datetime(pv["pv_date"], format="mixed", errors="coerce")
    ordered = pv.sort_values("pv_date").reset_index(drop=True)

    return ordered[["pv_date", "Production_KWh"]]


# ============================================================
# 2. LOAD WX
# ============================================================

def load_wx():
    """Load the weather workbook and return it sorted by a clean timestamp.

    Reads the first two sheets of ``DATA_DIR / "wx_dataset.xlsx"``, stacks
    them, renames ``dt_iso`` -> ``wx_date`` and ``temp`` -> ``temperature``,
    parses ``wx_date`` into naive datetimes and drops rows whose timestamp
    cannot be parsed.

    Returns:
        DataFrame sorted by ``wx_date`` with a fresh 0..n-1 index.
    """
    wx_excel = DATA_DIR / "wx_dataset.xlsx"

    wx1 = pd.read_excel(wx_excel, sheet_name=0)
    wx2 = pd.read_excel(wx_excel, sheet_name=1)
    wx = pd.concat([wx1, wx2], ignore_index=True)

    wx = wx.rename(columns={
        "dt_iso": "wx_date",
        "temp": "temperature"
    })

    def safe_parse(x):
        """Parse one cell into a naive datetime, or NaT when it is unparseable."""
        try:
            dt = parser.parse(str(x))
        except (ValueError, OverflowError, TypeError):
            # Only dateutil's documented parse failures are swallowed; a bare
            # `except:` would also hide KeyboardInterrupt / SystemExit.
            return pd.NaT
        if dt.tzinfo is not None:
            # Drop the timezone but keep the wall-clock time unchanged.
            dt = dt.replace(tzinfo=None)
        return dt

    wx["wx_date"] = wx["wx_date"].apply(safe_parse)
    wx["wx_date"] = pd.to_datetime(wx["wx_date"], errors="coerce")
    wx = wx.dropna(subset=["wx_date"])
    wx = wx.sort_values("wx_date").reset_index(drop=True)

    return wx


# ============================================================
# 3. FIX WX TIMELINE
# ============================================================

def fix_daily_wx_timeline(wx: pd.DataFrame, start_date) -> pd.DataFrame:
    """Replace ``wx_date`` with a gap-free hourly timeline.

    Rows are first ordered by their current ``wx_date``; the column is then
    overwritten with consecutive hourly timestamps that begin at
    ``start_date`` with its minute, second and microsecond set to zero.

    Args:
        wx: frame with a ``wx_date`` column.
        start_date: anything ``pd.to_datetime`` accepts; anchors the timeline.

    Returns:
        A new frame (the input is not modified) with a fresh 0..n-1 index.
    """
    ordered = wx.sort_values("wx_date").reset_index(drop=True)

    anchor = pd.to_datetime(start_date, errors="coerce").replace(
        minute=0, second=0, microsecond=0
    )

    result = ordered.copy()
    result["wx_date"] = pd.date_range(start=anchor, periods=len(ordered), freq="h")
    return result


# ============================================================
# 4. WEATHER CATEGORY
# ============================================================

def categorize_weather(wx):
    """Add a coarse ``weather_category`` column from ``weather_description``.

    Descriptions are matched case-insensitively by substring; the first
    matching rule wins, in this order: cloudy, partly_cloudy, rain, clear.
    Anything that matches no rule is labelled "other".

    The column is added to ``wx`` in place and the same frame is returned.
    """
    # Ordered (label, substrings) rules; earlier entries take precedence.
    rules = (
        ("cloudy", ("broken", "overcast")),
        ("partly_cloudy", ("scattered", "few")),
        ("rain", ("light rain", "moderate rain")),
        ("clear", ("clear sky",)),
    )

    def map_weather(desc):
        text = str(desc).lower()
        for label, keywords in rules:
            if any(keyword in text for keyword in keywords):
                return label
        # A bare "clear" (ignoring surrounding whitespace) also counts as clear.
        return "clear" if text.strip() == "clear" else "other"

    wx["weather_category"] = wx["weather_description"].apply(map_weather)
    return wx


# ============================================================
# 5. ONE-HOT ENCODING
# ============================================================

def one_hot_encode_weather(wx):
    """Append one indicator column per weather category.

    The five ``weather_*`` columns are always present in the result and always
    in the same order; a category that never occurs in ``weather_category``
    gets an all-zero column.

    Returns:
        A new frame: ``wx`` followed by the five indicator columns.
    """
    expected = [
        "weather_cloudy",
        "weather_partly_cloudy",
        "weather_rain",
        "weather_clear",
        "weather_other",
    ]

    encoded = pd.get_dummies(wx["weather_category"], prefix="weather")

    # Guarantee a fixed schema even when some categories are absent.
    absent = [name for name in expected if name not in encoded.columns]
    for name in absent:
        encoded[name] = 0

    indicators = encoded[expected]
    return pd.concat([wx, indicators], axis=1)


# ============================================================
# 6. DROP UNUSED WX COLUMNS
# ============================================================

def drop_unused_columns(wx):
    """Return ``wx`` without the weather columns the pipeline does not use.

    Columns from the drop list that are not present are silently ignored.
    """
    unused = ["lat", "lon", "dew_point", "pressure", "weather_description"]
    return wx.drop(columns=unused, errors="ignore")


# ============================================================
# 7. TIME FEATURES + LAG + ROLLING
# ============================================================

def add_time_features(df, date_col):
    """Add calendar and cyclical time features derived from ``date_col``.

    Adds ``month``, ``day``, ``hour``, ``day_of_week``, ``is_weekend``
    (1 for day_of_week 5 or 6), ``day_of_year`` and sine/cosine encodings of
    the hour (period 24) and of the day of year (period 366).

    The columns are written into ``df`` in place and the same frame is
    returned.
    """
    stamps = pd.to_datetime(df[date_col], errors="coerce").dt

    calendar_parts = (
        ("month", stamps.month),
        ("day", stamps.day),
        ("hour", stamps.hour),
        ("day_of_week", stamps.dayofweek),
    )
    for name, values in calendar_parts:
        df[name] = values

    df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
    df["day_of_year"] = stamps.dayofyear

    # Cyclical encodings: (source column, period, sine name, cosine name).
    cyclical = (
        ("hour", 24, "hour_sin", "hour_cos"),
        ("day_of_year", 366, "doy_sin", "doy_cos"),
    )
    for source, period, sin_name, cos_name in cyclical:
        angle = 2 * np.pi * df[source] / period
        df[sin_name] = np.sin(angle)
        df[cos_name] = np.cos(angle)

    return df


def add_lag_features(df, columns, lags):
    """Return a copy of ``df`` with lagged versions of the given columns.

    For every column and every lag ``k`` a column ``{col}_lag_{k}h`` is added
    holding the values shifted down by ``k`` rows (the first ``k`` rows are
    NaN). The ``h`` suffix presumes one row per hour — not enforced here.
    """
    out = df.copy()
    for name in columns:
        series = out[name]
        for step in lags:
            out[f"{name}_lag_{step}h"] = series.shift(step)
    return out


def add_rolling_features(df, columns, windows):
    """Return a copy of ``df`` with rolling statistics of the given columns.

    For every column and window size ``w`` four columns are added, in this
    order: ``{col}_roll_mean_{w}h``, ``{col}_roll_max_{w}h``,
    ``{col}_roll_min_{w}h`` and ``{col}_roll_std_{w}h``. Windows are trailing
    and span ``w`` rows, so the first ``w - 1`` rows are NaN.
    """
    out = df.copy()
    for name in columns:
        series = out[name]
        for size in windows:
            window = series.rolling(size)
            out[f"{name}_roll_mean_{size}h"] = window.mean()
            out[f"{name}_roll_max_{size}h"] = window.max()
            out[f"{name}_roll_min_{size}h"] = window.min()
            out[f"{name}_roll_std_{size}h"] = window.std()
    return out


def add_weather_combinations(df):
    """Return a copy of ``df`` with interaction features between weather columns.

    Each feature is added only when all of its source columns exist:

    * ``effective_radiation`` = GHI * (1 - weather_cloudy)
    * ``ghi_temp_ratio``      = GHI / temperature (zero temperature -> NaN)
    * ``rh_temp_product``     = humidity * temperature
    * ``wind_chill_effect``   = wind_speed * temperature

    The GHI column is looked up under the names "GHI", "Ghi" or "ghi".
    """
    out = df.copy()
    present = out.columns

    ghi_col = next((name for name in ("GHI", "Ghi", "ghi") if name in present), None)
    has_cloudy = "weather_cloudy" in present
    has_temp = "temperature" in present
    has_humidity = "humidity" in present
    has_wind = "wind_speed" in present

    if ghi_col and has_cloudy:
        clear_fraction = 1 - out["weather_cloudy"]
        out["effective_radiation"] = out[ghi_col] * clear_fraction
    if ghi_col and has_temp:
        # Zero temperatures are replaced by NaN to avoid dividing by zero.
        safe_temp = out["temperature"].replace(0, np.nan)
        out["ghi_temp_ratio"] = out[ghi_col] / safe_temp
    if has_humidity and has_temp:
        out["rh_temp_product"] = out["humidity"] * out["temperature"]
    if has_wind and has_temp:
        out["wind_chill_effect"] = out["wind_speed"] * out["temperature"]

    return out


# ============================================================
# 8. MERGE
# ============================================================

def nuovo_dataset(pv, wx):
    """Assemble the modelling frame from the PV and weather frames.

    Columns are copied one by one and aligned on the row index of ``pv`` (no
    join on timestamps is performed), in this order: the PV timestamp and
    production, the optional weather measurements that exist in ``wx``, the
    five weather indicator columns, and the four cyclical time features of
    ``pv``.

    Raises:
        KeyError: if a weather indicator column is missing from ``wx`` or a
            time feature is missing from ``pv``.
    """
    optional_wx = ["temperature", "humidity", "Ghi", "Dhi", "Dni", "wind_speed", "clouds_all"]
    indicator_wx = ["weather_cloudy", "weather_partly_cloudy", "weather_rain", "weather_clear", "weather_other"]
    time_pv = ["hour_sin", "hour_cos", "doy_sin", "doy_cos"]

    # (source frame, column) pairs in the order they appear in the output.
    plan = [(pv, "pv_date"), (pv, "Production_KWh")]
    plan += [(wx, name) for name in optional_wx if name in wx.columns]
    plan += [(wx, name) for name in indicator_wx]
    plan += [(pv, name) for name in time_pv]

    merged = pd.DataFrame()
    for source, name in plan:
        merged[name] = source[name]

    return merged


# ============================================================
# 9. BUILD MERGED DATASET (main function)
# ============================================================

def build_merged_dataset():
    """Run the full preprocessing pipeline and save the merged dataset.

    Steps: load PV and weather data, categorise and one-hot encode the
    weather, drop unused weather columns, re-time the weather rows onto an
    hourly timeline starting at the first PV timestamp, add time features to
    the PV frame, merge both, add lag / rolling / interaction features, and
    drop the leading rows that lack a full lag/rolling history.

    The result is written to ``PROCESSED_DIR / "merged_dataset_final.xlsx"``.

    Returns:
        The merged DataFrame that was written to disk.
    """
    pv = load_pv()
    wx = load_wx()

    wx = categorize_weather(wx)
    wx = one_hot_encode_weather(wx)
    wx = drop_unused_columns(wx)

    wx = fix_daily_wx_timeline(wx, start_date=pv["pv_date"].iloc[0])
    pv = add_time_features(pv, date_col="pv_date")

    merged = nuovo_dataset(pv, wx)

    lags = [1, 12, 24]
    windows = [3, 6, 24]
    merged = add_lag_features(merged, ["Production_KWh"], lags)
    merged = add_rolling_features(merged, ["Production_KWh"], windows)
    merged = add_weather_combinations(merged)

    # Discard the warm-up rows: the longest lag / rolling window (24) needs
    # that many previous rows before its features are defined.
    history = max(lags + windows)
    merged = merged.iloc[history:].reset_index(drop=True)

    # The original referenced an undefined OUTPUT_DIR (NameError at save
    # time); PROCESSED_DIR is the directory this file defines for outputs and
    # where the later cells look for merged_dataset_final.xlsx.
    out = PROCESSED_DIR / "merged_dataset_final.xlsx"
    merged.to_excel(out, index=False)
    print("✓ merged_dataset_final.xlsx salvato.")
    return merged

cosa

In [None]:
# ==========================================
# OUTLIER HANDLING
# ==========================================

from pathlib import Path
import numpy as np
import pandas as pd

# Paths for the outlier-handling step, relative to the current working
# directory.
# NOTE(review): this rebinds PROJECT_ROOT / PROCESSED_DIR to "./processed",
# which differs from the absolute /content/... paths configured in the first
# cell — confirm that both cells are meant to point at the same folder.
PROJECT_ROOT = Path(".")
PROCESSED_DIR = PROJECT_ROOT / "processed"
# Create the folder if needed (parents are not created here).
PROCESSED_DIR.mkdir(exist_ok=True)

# Input: merged dataset file; output: the interpolated version of it.
MERGED_CLEAN_UNIFIED = PROCESSED_DIR / "merged_dataset_final.xlsx"
OUTPUT_INTERPOLATED = PROCESSED_DIR / "merged_dataset_final_interpolated.xlsx"


def find_outliers(df, column="Production_KWh", ghi_column="Ghi",
                  q_factor=1.20, ghi_ratio=0.25):
    """Flag rows whose production looks implausibly high.

    A row is an outlier when its value in ``column`` exceeds
    ``q_factor`` times the 75th percentile of that column, or — only when
    ``ghi_column`` exists in ``df`` — when it exceeds ``ghi_ratio`` times the
    row's value in ``ghi_column``.

    Returns:
        ``(df_outliers, mask)``: the flagged rows and the boolean mask over
        all rows of ``df``.
    """
    production = df[column]

    upper_bound = production.quantile(0.75) * q_factor
    is_outlier = production > upper_bound

    if ghi_column in df.columns:
        irradiance_bound = df[ghi_column] * ghi_ratio
        is_outlier = is_outlier | (production > irradiance_bound)

    return df[is_outlier], is_outlier


def interpolate_point_by_point(df, mask, column="Production_KWh"):
    """Replace flagged values with the mean of their immediate neighbours.

    Flagged rows are processed from top to bottom. Each flagged value becomes
    the average of the previous and next row; at the first / last row only the
    single existing neighbour is used. Because replacements are written as the
    loop advances, the "previous" neighbour of a flagged row may itself be an
    already-corrected value, while the "next" neighbour is still raw.

    Args:
        df: input frame; it is not modified.
        mask: boolean sequence over the rows of ``df``, evaluated by position;
            True marks a value to replace.
        column: name of the column to repair.

    Returns:
        A copy of ``df`` with the flagged values in ``column`` replaced. The
        index of ``df`` is preserved.
    """
    df_fixed = df.copy()
    idxs = np.where(mask)[0]

    # Read AND write by position. The original wrote with label-based
    # `.at[i, column]` while reading with positional `.iloc`, which touched the
    # wrong row (or appended new rows) whenever the index was not 0..n-1.
    col_pos = df_fixed.columns.get_loc(column)
    last = len(df_fixed) - 1

    for i in idxs:
        prev = df_fixed.iloc[i - 1, col_pos] if i > 0 else None
        nex = df_fixed.iloc[i + 1, col_pos] if i < last else None

        if prev is not None and nex is not None:
            df_fixed.iloc[i, col_pos] = (prev + nex) / 2
        elif prev is not None:
            df_fixed.iloc[i, col_pos] = prev
        elif nex is not None:
            df_fixed.iloc[i, col_pos] = nex

    return df_fixed

split

In [None]:
# ==========================================
# SPLIT AND SCALE
# ==========================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def split_and_scale(df, target_col="Production_KWh",
                    time_cols=["hour_sin","hour_cos","doy_sin","doy_cos"],
                    train_ratio=0.6, val_ratio=0.2):
    """Split ``df`` chronologically and standardise features and target.

    Rows are split by position, without shuffling: the first ``train_ratio``
    share is the training set, the next ``val_ratio`` share the validation set
    and the remainder the test set. Both scalers are fitted on the training
    rows only and then applied to all three parts.

    Scaled features are the numeric columns other than ``target_col``,
    ``"pv_date"`` and ``time_cols``; every other column is passed through
    unchanged.

    Returns:
        ``(train_df, val_df, test_df, feature_scaler, target_scaler)`` where
        the three frames are scaled copies of the corresponding slices.
    """
    total = len(df)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]

    numeric = set(df.select_dtypes(include=[np.number]).columns)
    excluded = {target_col, "pv_date"}
    scale_cols = [
        name
        for name in df.columns
        if name not in excluded and name in numeric and name not in time_cols
    ]

    def fit_on_train(scaler, cols):
        """Fit ``scaler`` on the training slice and transform all three slices."""
        fitted = scaler.fit_transform(train_df[cols])
        return [fitted, scaler.transform(val_df[cols]), scaler.transform(test_df[cols])]

    feature_scaler = StandardScaler()
    target_scaler = StandardScaler()

    feature_blocks = fit_on_train(feature_scaler, scale_cols)
    target_blocks = fit_on_train(target_scaler, [target_col])

    scaled_parts = []
    for part, block, y_scaled in zip((train_df, val_df, test_df), feature_blocks, target_blocks):
        rebuilt = part.copy()
        rebuilt[scale_cols] = block
        rebuilt[target_col] = y_scaled
        scaled_parts.append(rebuilt)

    return (
        scaled_parts[0],
        scaled_parts[1],
        scaled_parts[2],
        feature_scaler,
        target_scaler
    )

dataloader

In [None]:
# ==========================================
# DATALOADER
# ==========================================

import torch
from torch.utils.data import Dataset, DataLoader

class GRUDirectDataset(Dataset):
    """Sliding-window dataset for direct multi-step forecasting.

    Sample ``idx`` consists of:

    * ``x_enc``    – rows ``idx .. idx+input_length-1`` of ``encoder_cols``,
      shape ``(input_length, len(encoder_cols))``;
    * ``x_future`` – the following ``forecast_horizon`` rows of
      ``future_time_cols``, shape ``(forecast_horizon, len(future_time_cols))``;
    * ``y``        – the same ``forecast_horizon`` rows of ``target_col``,
      shape ``(forecast_horizon,)``.

    All three are returned as float32 tensors.
    """

    def __init__(self, df,
                 input_length=168,
                 forecast_horizon=24,
                 target_col="Production_KWh",
                 encoder_cols=None,
                 future_time_cols=None):
        """Store the frame and window configuration; no data is copied.

        ``encoder_cols`` and ``future_time_cols`` must be column lists before
        samples are requested; with the ``None`` defaults ``__getitem__`` fails
        on the column lookup.
        """
        self.df = df
        self.input_length = input_length
        self.forecast_horizon = forecast_horizon
        self.target_col = target_col
        self.encoder_cols = encoder_cols
        self.future_time_cols = future_time_cols

    def __len__(self):
        """Number of complete (history, horizon) windows in the frame."""
        # The last valid start is len(df) - input_length - forecast_horizon,
        # hence the +1 (the original dropped that final window). Clamp at 0
        # because len() must not return a negative number for short frames.
        n_windows = len(self.df) - self.input_length - self.forecast_horizon + 1
        return max(0, n_windows)

    def __getitem__(self, idx):
        """Return ``(x_enc, x_future, y)`` for the window starting at ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Without this check pandas slicing silently returns truncated or
            # empty windows, and plain iteration over the dataset never ends.
            raise IndexError("GRUDirectDataset index out of range")

        split = idx + self.input_length
        end = split + self.forecast_horizon

        history = self.df.iloc[idx:split]
        future = self.df.iloc[split:end]

        x_enc = history[self.encoder_cols].values.astype("float32")
        x_future = future[self.future_time_cols].values.astype("float32")
        y = future[self.target_col].values.astype("float32")

        return torch.tensor(x_enc), torch.tensor(x_future), torch.tensor(y)


def create_dataloader(df, encoder_cols, future_time_cols,
                      input_length=168, forecast_horizon=24,
                      batch_size=32, shuffle=True,
                      target_col="Production_KWh"):
    """Wrap ``df`` in a ``GRUDirectDataset`` and return a ``DataLoader`` over it.

    Args:
        df: source frame holding encoder, future-time and target columns.
        encoder_cols: columns fed to the encoder for the history window.
        future_time_cols: columns provided for the forecast window.
        input_length: number of history rows per sample.
        forecast_horizon: number of rows to predict per sample.
        batch_size: samples per batch.
        shuffle: whether the loader shuffles the sample order.
        target_col: column to predict.
    """
    window_config = dict(
        df=df,
        input_length=input_length,
        forecast_horizon=forecast_horizon,
        encoder_cols=encoder_cols,
        future_time_cols=future_time_cols,
        target_col=target_col,
    )
    windows = GRUDirectDataset(**window_config)
    return DataLoader(windows, batch_size=batch_size, shuffle=shuffle)

gru

In [None]:
# ==========================================
# GRU MODEL + TRAINING
# ==========================================

import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from split import split_and_scale
from dataloader import create_dataloader

# Compute device: CUDA when available, CPU otherwise. train_one_epoch and
# eval_one_epoch move every batch to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class GRUDirectModel(nn.Module):
    """GRU encoder with a per-step MLP head for direct multi-step forecasts.

    The encoder summarises the history window into the final hidden state of
    its last layer. That summary is repeated for every forecast step,
    concatenated with the step's future features, and mapped to one value per
    step by a small MLP.
    """

    def __init__(self,
                 enc_in_dim,
                 future_time_dim=4,
                 hidden_dim=128,
                 num_layers=2,
                 forecast_horizon=24):
        """Build the encoder and the prediction head.

        Args:
            enc_in_dim: number of encoder input features per time step.
            future_time_dim: number of future features per forecast step.
            hidden_dim: GRU hidden size.
            num_layers: number of stacked GRU layers.
            forecast_horizon: number of steps predicted per sample.
        """
        super().__init__()

        self.forecast_horizon = forecast_horizon

        gru_config = {
            "input_size": enc_in_dim,
            "hidden_size": hidden_dim,
            "num_layers": num_layers,
            "batch_first": True,
        }
        self.encoder = nn.GRU(**gru_config)

        head_layers = [
            nn.Linear(hidden_dim + future_time_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, x_enc, x_future):
        """Predict ``forecast_horizon`` values for each sample in the batch.

        Args:
            x_enc: history tensor, batch first — ``(batch, steps, enc_in_dim)``.
            x_future: future features — ``(batch, forecast_horizon,
                future_time_dim)``.

        Returns:
            Tensor of shape ``(batch, forecast_horizon)``.
        """
        _, hidden = self.encoder(x_enc)

        # Final hidden state of the top GRU layer, one copy per forecast step.
        summary = hidden[-1].unsqueeze(1)
        tiled = summary.repeat(1, self.forecast_horizon, 1)

        per_step = torch.cat((tiled, x_future), dim=-1)
        scores = self.mlp(per_step)
        return scores.squeeze(-1)


# Loss shared by train_one_epoch and eval_one_epoch: mean squared error
# (nn.MSELoss with its default settings).
criterion = nn.MSELoss()

def train_one_epoch(model, loader, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Every batch is moved to the module-level ``device``, the module-level
    ``criterion`` is minimised with ``optimizer``, and the per-batch losses
    are averaged.

    Returns:
        Mean of the batch losses (sum of batch losses / number of batches).
    """
    model.train()
    running_loss = 0
    for batch in loader:
        x_enc, x_fut, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = criterion(model(x_enc, x_fut), y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    n_batches = len(loader)
    return running_loss / n_batches


def eval_one_epoch(model, loader):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Every batch is moved to the module-level ``device`` and scored with the
    module-level ``criterion``.

    Returns:
        Mean of the batch losses (sum of batch losses / number of batches).
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in loader:
            x_enc, x_fut, y = (tensor.to(device) for tensor in batch)
            prediction = model(x_enc, x_fut)
            batch_losses.append(criterion(prediction, y).item())
    return sum(batch_losses) / len(loader)