In [None]:
# Standardne biblioteke
import warnings     
import calendar      
import itertools   
from datetime import date   
import pickle
import os
import random
random.seed(42)

# Obrada podataka
import pandas as pd
import numpy as np

# Vizuelizacija
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import seaborn as sns

# Statističke i matematičke funkcije
from scipy import stats
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from scipy.signal import periodogram

# Modelovanje (Prophet)
from prophet import Prophet

# Time series modeliranje
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Machine learning modeli, metrika i evaluacija
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor

# Progres bar za petlje
from tqdm import tqdm

# Lokalni notebook za grafove
%run 00_graphs.ipynb

# Isključenje upozorenja
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
class SarimaPipeline:
    """
    SarimaPipeline klasa za predikciju vremenskih serija polena koristeći:
    - log ili Box-Cox transformaciju,
    - Furijeovu komponente za sezonske obrasce,
    - SARIMA model sa opcionim egzogenim varijablama (npr. meteorološki podaci).
    """
    def __init__(self, transform=None, exog_cols=None, fourier_order=3, value_col="value", date_col="date"):

        assert transform in ['log', 'boxcox', None], "Transformacija mora biti 'log', 'boxcox' ili None."
        self.transform = transform
        self.exog_cols = exog_cols if exog_cols is not None else []  # meteo promenljive
        self.fourier_order = fourier_order # max Furijeov red
        self.value_col = value_col # naziv kolone za vrednost
        self.date_col = date_col # naziv vremenske kolone
        self.rolling_df = None # Evaluacija modela
        self.auto_res = None # Rezultati automatskog podešavanja
        self.fitted_model = None # Fitovani SARIMAX model
        self.summary_text = None # SARIMAX rezime po transformaciji
        self.rolling_dfs = None # Lista rezultata po transformaciji
        self.feature_importances = None # lista SARIMAX rezimea po transformaciji
        self.metrics_df = None # Objedinjene metrike performansi
        self.classification_results = None #Klasifikacija polena po nivoima

        self.order = (0,0,0)
        self.seasonal_order = (0,0,0,0)

    def _apply_transform(self, series):
        """
        Transformacija niz.
        """
        if self.transform == 'log':
            transformed = np.log1p(series/30)
            self.lmbda = None
        elif self.transform == 'boxcox':
            safe_val = series + 1e-1
            transformed, self.lmbda = boxcox(safe_val)
        else:  # None
            transformed = series
            self.lmbda = None
        return transformed

    def _inverse_transform(self, series):
        """
        Inverzna transformacija serije.
        """
        if self.transform == 'log':
            return 30 * np.expm1(series)
        elif self.transform == 'boxcox':
            return inv_boxcox(series, self.lmbda) - 1e-1
        return series

    def _get_period(self):
        """
        Detekcija dominantnog perioda u seriji pomoću periodograma.
        """
        # Dominantni period iz periodograma
        frequencies, power = periodogram(self.data['transform'])
        dominant_idx = np.argmax(power[1:]) + 1
        dominant_period = int(np.round(1 / frequencies[dominant_idx]))

        # Očekivani broj dana trajanja sezone
        start = pd.to_datetime(f'2024-{self.start_md[0]:02d}-{self.start_md[1]:02d}', format='%Y-%m-%d')
        end = pd.to_datetime(f'2024-{self.end_md[0]:02d}-{self.end_md[1]:02d}', format='%Y-%m-%d')

        expected_season_days = (end - start).days + 1

        tolerance = 10
        if abs(dominant_period - expected_season_days) > tolerance:
            dominant_period = expected_season_days

        return dominant_period

    def _get_md(self):
        """
        Detekcija početka i kraja sezone.
        """
        min_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).min()
        max_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).max()
        return min_md, max_md

    def _generate_t(self, dates):
        """
        Generinje linearne komponente
        """
        dates = pd.to_datetime(dates)
        df = pd.DataFrame({'date': dates})
        max_year = df['date'].dt.year.max()
        all_dates = []

        # Kreiranje kompletanog niza svih datuma u sezonama od min_year do max_year
        for year in range(self.min_year, max_year + 1):
            start_date =  pd.Timestamp(year=year, month=self.start_md[0], day=self.start_md[1])
            end_date = pd.Timestamp(year=year, month=self.end_md[0], day=self.end_md[1])
            season_dates = pd.date_range(start=start_date, end=end_date, freq='D')
            all_dates.extend(season_dates.tolist())
        
        # Mapiranje datum -> t
        t_map = {date: idx for idx, date in enumerate(all_dates)}
        t_list = [t_map.get(date, np.nan) for date in df['date']]
        
        return np.array(t_list)

    def _add_fourier_terms(self, K):
        """
        Dodavanje K Furijeovih komponenti kao egzogene promenljive.
        """
        self.data['t'] = self._generate_t(self.data[self.date_col])
        self.features_names = []

        for k in range(1, K + 1):
            cname = f'cos_{k}'
            sname = f'sin_{k}'
            valid_mask = self.data['t'].notna()

            # Izračunavanje cos i sin komponenti
            self.data.loc[valid_mask, cname] = np.cos(2 * np.pi * k * self.data.loc[valid_mask, 't'] / self.period)
            self.data.loc[valid_mask, sname] = np.sin(2 * np.pi * k * self.data.loc[valid_mask, 't'] / self.period)

            for name in [cname, sname]:
                if name not in self.features_names:
                    self.features_names.append(name)

    def _check_exog_cols(self, df):
        """
        Provera da li df sadrži sve exog kolone.
        """
        missing_cols = [col for col in self.exog_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Nedostaju egzogene kolone u df: {missing_cols}")
        
    def _compute_fourier_for_dates(self, dates):
        """
        Generinje Furijeovih redova za listu datuma.
        """
        t = np.array(self._generate_t(dates))
        dct = {}
        dct = {self.date_col: dates}
        for k in range(1, self.fourier_order + 1):
            cos_k = np.full_like(t, np.nan, dtype=np.float64)
            sin_k = np.full_like(t, np.nan, dtype=np.float64)
            valid_mask = ~np.isnan(t)

            # Izračunavanje cos i sin komponenti
            cos_k[valid_mask] = np.cos(2 * np.pi * k * t[valid_mask] / self.period)
            sin_k[valid_mask] = np.sin(2 * np.pi * k * t[valid_mask] / self.period)

            dct[f'cos_{k}'] = cos_k
            dct[f'sin_{k}'] = sin_k

        return pd.DataFrame(dct)
        
    def _set_order(self, order=None, seasonal_order=None):
        """
        Postavljanje (p,d,q) i  (P,D,Q,s) parametre modela.
        """
        if order: self.order = order
        if seasonal_order: self.seasonal_order = seasonal_order

    def _get_next_valid_dates(self, n_days):
        """
        Vraca sledećih n_days validnih datuma od poslednjeg definisanog datuma.
        """
        last_date = self.data[self.date_col].max()
        found = []
        candidate = last_date + pd.Timedelta(days=1)
        while len(found) < n_days:
            md = (candidate.month, candidate.day)
            if self.start_md <= md < self.end_md:
                found.append(candidate)
            candidate += pd.Timedelta(days=1)
        return found

    def _get_valid_dates(self, start_date, end_date):
        """
        Pronalazi sve datume izmedju start_date i end_date koji su u sezoni.
        """
        all_dates = pd.date_range(start=start_date, end=end_date)
        valid_dates = [d for d in all_dates if self.start_md <= (d.month, d.day) < self.end_md]
        return valid_dates

    def _preprocess(self, df):
        """
        Prepare the input DataFrame for training and cache everything on the instance.

        Sets, in this order: `self.data` (a copy of `df` with a 'transform'
        column), `self.min_year`, `self.start_md` / `self.end_md`, `self.period`,
        and finally the 't' and Fourier columns plus `self.features_names`.
        Later steps read the attributes set by earlier ones, so the order matters.

        Raises:
            ValueError: if `df` lacks any of the configured exogenous columns.
        """
        # Make sure every exogenous column is present
        self._check_exog_cols(df)

        # Work on a copy of the input data
        self.data = df.copy()

        # Transform the target column
        self.data['transform'] = self._apply_transform(self.data[self.value_col])

        # Smallest year in the data (used when generating t);
        # the .dt accessor requires the date column to be datetime-like
        self.min_year = self.data[self.date_col].dt.year.min()

        # First and last (month, day) of the season
        self.start_md, self.end_md = self._get_md()

        # Dominant period of the series, estimated from the periodogram
        self.period = self._get_period()

        # Fourier components describing the seasonal pattern
        self._add_fourier_terms(self.fourier_order)

    def fit(self, df, order=None, seasonal_order=None, preprocessed = False, previous_model = None):
        """
        Fitovanje SARIMA model sa prosleđenim parametrima.
        """
        if not preprocessed:
            self._preprocess(df)
            self._set_order(order, seasonal_order)
            endog = self.data['transform']
            exog = self.data[self.features_names+self.exog_cols] if self.features_names+self.exog_cols else None
        else:
            endog = df['transform']
            exog = df[self.features_names+self.exog_cols] if self.features_names+self.exog_cols else None

        # Ako se koristi prethodni fitovani model, uzimamo njegove koeficijente
        if previous_model:
            init_params = self.fitted_model.params
        else:
            init_params = None

        # Inicijalizacija modela
        model = SARIMAX(endog, exog=exog, order=self.order, seasonal_order=self.seasonal_order, enforce_invertibility=False, enforce_stationarity=False, start_params=init_params)
        self.fitted_model = model.fit(disp=False)

        self.summary_text = self.fitted_model.summary().as_text()
        return self.fitted_model
        
    def forecast(self, days=1, exog_df=None):
        """
        Predviđa vrednosti za narednih `days` dana.
        """
        # Generisanje sledećih validnih datuma za predikciju
        forecast_dates = self._get_next_valid_dates(n_days=days)
        exog_future = pd.DataFrame({self.date_col: forecast_dates})

        # Generisanje Furijeovih redova za forecast datume
        if self.fourier_order != 0:
            exog_future = self._compute_fourier_for_dates(forecast_dates)

        # Priprema egzogenih promenljivih ako su definisane
        if self.exog_cols:
            if exog_df is not None:
                self._check_exog_cols(exog_df)
                exog_future = pd.merge(exog_future, exog_df, on=self.date_col, how="left")
            else:
                raise ValueError("exog_df mora biti prosleđen jer su exog_cols definisane.")
        exog_future = exog_future[self.features_names + self.exog_cols] if self.features_names + self.exog_cols else None
        # Predikcija korišćenjem treniranog SARIMA modela
        preds = self.fitted_model.forecast(steps=days, exog=exog_future)
        preds_orig = self._inverse_transform(preds)
        return pd.Series(preds_orig, index=forecast_dates, name='prediction')

    def auto_arima(self, data, end_date=pd.to_datetime('2023-12-31'),
                p_range=range(0, 5), d=0, q_range=range(0, 5),
                P_range=range(0, 2), D=0, Q_range=range(0, 2),
                transforms=None, fourier_orders=None,
                n_combinations=48):
        """
        Nasumična pretraga SARIMA (+Fourier + transformacija) parametara: bira najbolji model po AICC.
        """
        if fourier_orders is None:
            fourier_orders = [self.fourier_order]

        if transforms is None:
            transforms = [self.transform]

        data_origin = data.copy()
        data = data[data[self.date_col] <= end_date]

        results_log = []
        best_score = np.inf
        best_model = None
        best_params = None

        # Kombinacije SARIMA + Fourier parametara
        base_combinations = list(itertools.product(p_range, q_range, P_range, Q_range))
        valid_combinations = [(p, q, P, Q) for (p, q, P, Q) in base_combinations
                            if not ((P >= 1 or Q >= 1) and (p > 2 or q > 2))]
        all_combinations = list(itertools.product(valid_combinations, fourier_orders))
        sampled_combinations = random.sample(all_combinations, min(n_combinations, len(all_combinations)))

        for (p, q, P, Q), fourier_order in tqdm(sampled_combinations, desc="Grid Search"):
            for transform in transforms:
                try:
                    self.transform = transform
                    self.fourier_order = fourier_order

                    self._preprocess(data)
                    s = self.period
                    self._set_order(order=(p, d, q), seasonal_order=(P, D, Q, s))

                    results = self.fit(data)

                    k = len(results.params)
                    n = len(data)
                    if n <= k + 1:
                        continue  # skip nevalidne kombinacije

                    aic = results.aic
                    aicc = aic + (2 * k * (k + 1)) / (n - k - 1)
                    bic = results.bic

                    results_log.append({
                        'order': (p, d, q),
                        'seasonal_order': (P, D, Q, s),
                        'fourier_order': fourier_order,
                        'transform': transform,
                        'aic': aic,
                        'aicc': aicc,
                        'bic': bic
                    })

                    if aicc < best_score:
                        best_score = aicc
                        best_model = results
                        best_params = {
                            'order': (p, d, q),
                            'seasonal_order': (P, D, Q, s),
                            'fourier_order': fourier_order,
                            'transform': transform
                        }
                except Exception as e:
                    print(f"Greška za kombinaciju {(p,d,q),(P,D,Q)} | Fourier: {fourier_order} | Transform: {transform} → {e}")
                    continue

        results_df = pd.DataFrame(results_log)

        if best_model is not None:
            self.fitted_model = best_model
            self._set_order(order=best_params['order'], seasonal_order=best_params['seasonal_order'])
            self.fourier_order = best_params['fourier_order']
            self.transform = best_params['transform']

        self._preprocess(data_origin)
        self.auto_res = results_df
        return results_df

    def evaluate_model(self, start_date, end_date, horizon=30, refit_every=15, low=30, high=100):
        """
        Evaluacija najboljih modela po transformaciji iz auto_arima.
        Radi rolling forecast, računa regresione i klasifikacione metrike.
        """

        if self.auto_res is None or self.auto_res.empty:
            raise ValueError("Nema rezultata iz auto_arima. Pokreni auto_arima() pre evaluate_model().")

        data = self.data.copy()

        self.rolling_dfs = {}      
        self.summary_texts = {}     
        self.metrics_dfs = {}      
        classification_results = {}

        for transform in np.unique(self.auto_res['transform']):
            subset = self.auto_res[self.auto_res['transform'] == transform]
            best_row = subset.loc[subset['aicc'].idxmin()]

            self.transform = best_row['transform']
            self.fourier_order = best_row['fourier_order']
            self._set_order(order=best_row['order'], seasonal_order=best_row['seasonal_order'])

            self._preprocess(data)
            rolling_df = self.rolling_forecast(start_date, end_date, horizon=horizon, refit_every=refit_every)

            self.rolling_dfs[transform] = rolling_df
            self.summary_texts[transform] = self.fitted_model.summary().as_text()

            all_metrics = []
            transform_class_results = {}

            for i in range(horizon):
                col_pred = f"pred{i}d"
                col_actual = f"actual{i}d"
                rolling_df[col_actual] = rolling_df["actual"].shift(-i)

                actual = rolling_df[col_actual]
                pred = rolling_df[col_pred]
                mask = actual.notna() & pred.notna()

                if mask.sum() == 0:
                    continue

                # Regresione metrike
                all_metrics.append({
                    "transform": transform,
                    "forecast_day": i,
                    "MAE": np.mean(np.abs(pred[mask] - actual[mask])),
                    "RMSE": np.sqrt(np.mean((pred[mask] - actual[mask]) ** 2)),
                    "RMSLE": np.sqrt(np.mean((np.log1p(pred[mask] / 30) - np.log1p(actual[mask] / 30)) ** 2))
                })

                # Klasifikacija
                def classify(x):
                    if x < low:
                        return 'low'
                    elif x < high:
                        return 'moderate'
                    else:
                        return 'high'

                actual_cls = actual[mask].apply(classify)
                pred_cls = pred[mask].apply(classify)

                labels = ['low', 'moderate', 'high']
                cm = confusion_matrix(actual_cls, pred_cls, labels=labels)
                report = classification_report(actual_cls, pred_cls, labels=labels, output_dict=True)

                transform_class_results[i] = {
                    "confusion_matrix": cm,
                    "classification_report": report
                }

            self.metrics_dfs[transform] = pd.DataFrame(all_metrics)
            classification_results[transform] = transform_class_results

        self.classification_results = classification_results

        return self.metrics_dfs, self.rolling_dfs, self.summary_texts

    def rolling_forecast(self, start_date, end_date, horizon=1, refit_every=15):
        """
        Rolling forecast evaluacija modela za dat opseg datuma.
        """
        results = []
        data = self.data.copy()
        valid_dates = self._get_valid_dates(start_date, end_date)
        model_fit = None
        count_since_refit = refit_every

        for current_date in tqdm(valid_dates, desc="Rolling forecast"):
            # Kreiranje trening skupa do trenutnog datuma
            train_df = data[data[self.date_col] < current_date].copy()
            # Generisanje future DataFrame-a za predikciju
            future = pd.date_range(start=current_date, periods=horizon)
            future_df = pd.DataFrame({self.date_col: future})
            future_df = pd.merge(future_df, data, on=self.date_col)

            # Inicijalizacija nizova za predikcije i intervale poverenja
            preds = np.zeros(horizon)
            lower_bounds = np.zeros(horizon)
            upper_bounds = np.zeros(horizon)

            if not future_df.empty:
                exog_future_vals = future_df[self.features_names + self.exog_cols] if self.features_names + self.exog_cols else []
                # Refit modela ako je potrebno
                if count_since_refit >= refit_every or model_fit is None:
                    model_fit = self.fit(train_df, preprocessed=True, previous_model=model_fit)
                    count_since_refit = 0

                # Generisanje predikcije i intervala poverenja
                forecast_res = model_fit.get_forecast(steps=len(future_df), exog=exog_future_vals)
                forecast = forecast_res.predicted_mean
                conf_int = forecast_res.conf_int(alpha=0.05)

                for i, (_, row) in enumerate(future_df.iterrows()):
                    idx = (row[self.date_col] - current_date).days
                    if 0 <= idx < horizon:
                        yhat = forecast.iloc[i]
                        lower = conf_int.iloc[i, 0]
                        upper = conf_int.iloc[i, 1]
                        preds[idx] = max(0, round(self._inverse_transform(yhat)))
                        lower_bounds[idx] = max(0, round(self._inverse_transform(lower)))
                        upper_bounds[idx] = max(0, round(self._inverse_transform(upper)))

            actual = data.loc[data[self.date_col] == current_date, self.value_col]
            actual = actual.iloc[0] if not actual.empty else np.nan

            result_row = {
                "date": current_date,
                "actual": actual,
                "prediction": preds[0] if len(preds) > 0 else np.nan
            }
            for i in range(horizon):
                result_row[f"pred{i}d"] = preds[i]
                result_row[f"pred{i}d_lower"] = lower_bounds[i]
                result_row[f"pred{i}d_upper"] = upper_bounds[i]
            results.append(result_row)

            # Ažuriranje modela sa novim podacima bez refitovanja
            y_obs = data.loc[data[self.date_col] == current_date, "transform"]
            x_obs_cols = self.features_names + self.exog_cols
            x_obs = data.loc[data[self.date_col] == current_date, x_obs_cols] if x_obs_cols else None

            if not y_obs.empty:
                model_fit = model_fit.append(
                    endog=y_obs.values,
                    exog=x_obs.values if x_obs_cols else None,
                    refit=False
                )
            count_since_refit += 1
        self.rolling_df = pd.DataFrame(results)
        return self.rolling_df

In [12]:
class ProphetPipeline:
    """
    ProphetPipeline klasa za predikciju vremenskih serija polena koristeći:
    - log ili Box-Cox transformaciju target promenljive,
    - Furijeove komponente za modelovanje sezonskih obrazaca,
    - Prophet model sa opcionim egzogenim varijablama (npr. meteorološki podaci).
    """
    def __init__(self, transform=None, exog_cols=None, value_col="value", date_col="date"):

        assert transform in ['log', 'boxcox', None], "Transformacija mora biti 'log', 'boxcox' ili None."
        self.transform = transform
        self.exog_cols = exog_cols if exog_cols is not None else []  # meteo promenljive
        self.value_col = value_col # naziv kolone za vrednost
        self.date_col = date_col # naziv vremenske kolone
        self.rolling_df = None # Evaluacija modela
        self.auto_res = None # Rezultati automatskog podešavanja
        self.fitted_model = None # Fitovani SARIMAX model
        self.rolling_dfs = None # Lista rezultata po transformaciji
        self.metrics_df = None # Objedinjene metrike performansi
        self.classification_results = None #Klasifikacija polena po nivoima
        
        
        # Inicijalni hyperparametri
        self.changepoint_prior_scale = 0.5 
        self.seasonality_prior_scale = 10
        self.seasonality_mode = 'additive'
        self.changepoint_range = 0.95
        self.mod = 1

    def _apply_transform(self, series):
        """
        Transformacija niz.
        """
        if self.transform == 'log':
            transformed = np.log1p(series/30)
            self.lmbda = None
        elif self.transform == 'boxcox':
            safe_val = series + 1e-1
            transformed, self.lmbda = boxcox(safe_val)
        else:  # None
            transformed = series
            self.lmbda = None
        return transformed

    def _inverse_transform(self, series):
        """
        Inverzna transformacija serije.
        """
        if self.transform == 'log':
            return 30 * np.expm1(series)
        elif self.transform == 'boxcox':
            return inv_boxcox(series, self.lmbda) - 1e-1
        return series

    def _get_md(self):
        """
        Detekcija početka i kraja sezone.
        """
        min_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).min()
        max_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).max()
        return min_md, max_md
    
    def _check_exog_cols(self, df):
        """
        Provera da li df sadrži sve exog kolone.
        """
        missing_cols = [col for col in self.exog_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Nedostaju egzogene kolone u df: {missing_cols}")

    def _get_valid_dates(self, start_date, end_date):
        """
        Pronalazi sve datume izmedju start_date i end_date koji su u sezoni.
        """
        all_dates = pd.date_range(start=start_date, end=end_date)
        valid_dates = [d for d in all_dates if self.start_md <= (d.month, d.day) < self.end_md]
        return valid_dates
    
    def _get_next_valid_dates(self, n_days):
        """
        Vraca sledećih n_days validnih datuma od poslednjeg definisanog datuma.
        """
        last_date = self.data[self.date_col].max()
        found = []
        candidate = last_date + pd.Timedelta(days=1)
        while len(found) < n_days:
            md = (candidate.month, candidate.day)
            if self.start_md <= md < self.end_md:
                found.append(candidate)
            candidate += pd.Timedelta(days=1)
        return found

    def _set_params(self, changepoint_prior_scale=None, seasonality_prior_scale=None, seasonality_mode=None, changepoint_range=None, mod = None):
        """
        Postavljanje hiperparametara Prophet modela.
        """
        if changepoint_prior_scale: self.changepoint_prior_scale = changepoint_prior_scale
        if seasonality_prior_scale: self.seasonality_prior_scale = seasonality_prior_scale
        if seasonality_mode: self.seasonality_mode = seasonality_mode
        if changepoint_range: self.changepoint_range = changepoint_range
        if mod: self.mod=mod

    def _get_mixed_recent_and_seasonal_training_data(self, current_date, window=35, recent_days=50):
        """
        Vraća trening podatke iz sezonskog prozora ±window dana i poslednjih recent_days dana.
        """
        
        ref_doy = current_date.timetuple().tm_yday
        min_doy = ref_doy - window
        max_doy = ref_doy + window
        data = self.data.copy()
        data["doy"] = data["ds"].dt.dayofyear

        seasonal = data[(data["ds"] < current_date) & (data["doy"].between(min_doy, max_doy))]
        recent = data[(data["ds"] < current_date) & (data["ds"] >= current_date - pd.Timedelta(days=recent_days))]

        combined = pd.concat([seasonal, recent]).drop_duplicates().sort_values("ds")
        return combined.drop(columns="doy")
    
    def _preprocess(self, df):
        """
        Prepare the input DataFrame for training and cache the result on the instance.

        Sets `self.data` (a copy of `df` extended with the 'ds' and 'y'
        columns) and the season bounds `self.start_md` / `self.end_md`.

        Raises:
            ValueError: if `df` lacks any of the configured exogenous columns.
        """
        # Make sure every exogenous column is present
        self._check_exog_cols(df)

        # Work on a copy of the input data
        self.data = df.copy()

        # Add the 'ds' column (copy of the date column)
        self.data['ds'] = self.data[self.date_col]

        # Transform the target column and store it in 'y'
        self.data['y'] = self._apply_transform(self.data[self.value_col])

        # First and last (month, day) of the season
        self.start_md, self.end_md = self._get_md()

    def fit(self, data, changepoint_prior_scale=None, seasonality_prior_scale=None, seasonality_mode=None, changepoint_range=None, preprocessed = False):
        """
        Fitovanje Prophet modela sa prosleđenim parametrima.
        """

        if not preprocessed:
            self._preprocess(data)
            self._set_params(changepoint_prior_scale=changepoint_prior_scale, seasonality_prior_scale=seasonality_prior_scale, seasonality_mode=seasonality_mode, changepoint_range=changepoint_range)

        # Inicijalizacija modela
        model = Prophet(
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=self.changepoint_prior_scale,
            changepoint_range=self.changepoint_range,
            seasonality_prior_scale=self.seasonality_prior_scale,
            seasonality_mode=self.seasonality_mode
        )
        model.add_seasonality(name='yearly_custom', period=365.25, fourier_order=10)
        if self.exog_cols:
            for reg in self.exog_cols:
                model.add_regressor(reg)

        if preprocessed:
            self.fitted_model = model.fit(data)
        else:
            self.fitted_model = model.fit(self.data)
        return self.fitted_model

    def forecast(self, days=1, exog_df=None):
        """
        Predviđa vrednosti za narednih `days` dana.
        """
        # Generisanje sledećih validnih datuma za predikciju
        forecast_dates = self._get_next_valid_dates(n_days=days)
        future_df = pd.DataFrame({'ds': forecast_dates})

        # Ako postoje egzogene promenljive, priprema future_df sa njima
        if self.exog_cols:
            if exog_df is None:
                raise ValueError("exog_df mora biti prosleđen jer su exog_cols definisane.")
            self._check_exog_cols(exog_df)

            exog_df = exog_df.copy()
            exog_df['ds'] = pd.to_datetime(exog_df[self.date_col])
            exog_df = exog_df.set_index('ds')
            exog_df = exog_df[self.exog_cols].reset_index()

            future_df = future_df.merge(exog_df, on='ds', how='left')

        # Predikcija korišćenjem treniranog Prophet modela
        forecast = self.fitted_model.predict(future_df)
        preds_orig = self._inverse_transform(forecast['yhat'])
        return pd.Series(preds_orig, index=forecast_dates, name='prediction')

    def grid_search(self, data, param_grid, start_date, end_date, transforms=0):
        """
        Grid search over Prophet hyperparameters, scored with a one-step rolling forecast.

        For every transform and every parameter combination in ``param_grid`` the
        pipeline is configured via ``_set_params``, evaluated with
        ``rolling_forecast(..., horizon=1)`` and scored with RMSE, MAE and RMSLE
        (RMSLE is computed on ``log1p(value / 30)``).

        Parameters
        ----------
        data : DataFrame passed to ``_preprocess`` for every transform.
        param_grid : dict accepted by ``sklearn.model_selection.ParameterGrid``.
        start_date, end_date : evaluation range forwarded to ``rolling_forecast``.
        transforms : iterable of transforms to try; ``0`` (legacy default) or
            ``None`` means "only the currently configured ``self.transform``".

        Returns
        -------
        dict mapping each transform to a DataFrame of results (also stored in
        ``self.auto_res``). Combinations without any usable (actual, prediction)
        pair are skipped.

        Side effects
        ------------
        On success the pipeline is left configured with the overall best
        (lowest RMSLE) transform and parameters, and the data is preprocessed
        again with that transform so ``self.data`` matches ``self.transform``.
        """
        if transforms is None or transforms == 0:
            transforms = [self.transform]

        grid = list(ParameterGrid(param_grid))
        self.auto_res = {}

        best_all = []  # best row of every transform

        for transform in tqdm(transforms, desc="Transformations"):

            # Preprocess the input with the transform under test
            self.transform = transform
            self._preprocess(data)

            results = []

            # Iterate over all hyperparameter combinations
            for params in tqdm(grid, desc=f"Grid for {transform}", leave=False):

                self._set_params(
                    changepoint_prior_scale=params.get('changepoint_prior_scale', None),
                    seasonality_prior_scale=params.get('seasonality_prior_scale', None),
                    seasonality_mode=params.get('seasonality_mode', None),
                    changepoint_range=params.get('changepoint_range', None),
                    mod=params.get('mod', None),
                )

                # One-step rolling forecast for the current parameter set
                rolling_df = self.rolling_forecast(start_date=start_date, end_date=end_date, horizon=1)
                if rolling_df.empty:
                    # No in-season date in the requested range: nothing to score.
                    continue

                true = rolling_df['actual'].values
                pred = rolling_df['prediction'].values

                mask = ~np.isnan(true) & ~np.isnan(pred)
                true = true[mask]
                pred = pred[mask]

                # Skip instead of recording NaN metrics computed on an empty sample.
                if len(true) == 0:
                    continue

                results.append({
                    'transform': transform,
                    'params': params,
                    'rmse': np.sqrt(np.mean((true - pred) ** 2)),
                    'mae': np.mean(np.abs(true - pred)),
                    'rmsle': np.sqrt(np.mean((np.log1p(true / 30) - np.log1p(pred / 30)) ** 2))
                })

            # Keep all results and remember the best row of this transform
            results_df = pd.DataFrame(results)
            self.auto_res[transform] = results_df

            if not results_df.empty:
                best_all.append(results_df.sort_values('rmsle').iloc[0])

        # Best configuration across all transforms
        if best_all:
            best_overall = pd.DataFrame(best_all).sort_values("rmsle").iloc[0]

            self.transform = best_overall['transform']
            self._set_params(**best_overall['params'])

            # self.data was last preprocessed with the final transform of the loop;
            # redo it so the stored data agrees with the selected transform.
            self._preprocess(data)

        return self.auto_res
    
    def evaluate_model(self, start_date, end_date, horizon=30, low=30, high=100):
        """
        Evaluate the best Prophet configuration of every transform found by grid_search().

        For each transform the best row (lowest RMSLE) of ``self.auto_res`` is
        applied, a rolling forecast with the given ``horizon`` is run, and for
        every forecast day ``i`` regression metrics (MAE, RMSE, RMSLE) and a
        three-class (low / moderate / high) confusion matrix and classification
        report are computed.

        The actual value for forecast day ``i`` is looked up by date
        (forecast origin + i days). A positional ``shift(-i)`` would pair
        predictions with the wrong day whenever the evaluated dates are not
        consecutive (e.g. across a gap between seasons).

        Parameters
        ----------
        start_date, end_date : evaluation range forwarded to ``rolling_forecast``.
        horizon : number of forecast days evaluated per origin.
        low, high : class thresholds; ``x < low`` is 'low', ``x < high`` is
            'moderate', otherwise 'high'.

        Returns
        -------
        (self.metrics_dfs, self.rolling_dfs), both dicts keyed by transform.
        Classification outputs are stored in ``self.classification_results``.

        Raises
        ------
        ValueError if ``self.auto_res`` is not a non-empty dict.
        """

        if not isinstance(self.auto_res, dict) or not self.auto_res:
            raise ValueError("auto_res mora biti dict sa rezultatima po transformaciji. Pokreni grid_search() pre evaluate_model().")

        def classify(x):
            if x < low:
                return 'low'
            elif x < high:
                return 'moderate'
            else:
                return 'high'

        labels = ['low', 'moderate', 'high']

        data = self.data.copy()

        self.rolling_dfs = {}
        self.metrics_dfs = {}
        classification_results = {}

        for transform, results_df in self.auto_res.items():
            if results_df.empty:
                continue

            best_row = results_df.sort_values('rmsle').iloc[0]

            self.transform = transform
            self._set_params(**dict(best_row['params']))
            self._preprocess(data)

            rolling_df = self.rolling_forecast(start_date, end_date, horizon=horizon)
            self.rolling_dfs[transform] = rolling_df

            all_metrics = []
            transform_class_results = {}

            if not rolling_df.empty:
                # Observed value per forecast origin date, used for date-based alignment.
                actual_by_date = dict(zip(rolling_df["date"], rolling_df["actual"]))

                for i in range(horizon):
                    col_pred = f"pred{i}d"
                    col_actual = f"actual{i}d"
                    target_dates = rolling_df["date"] + pd.Timedelta(days=i)
                    rolling_df[col_actual] = target_dates.map(actual_by_date)

                    actual = rolling_df[col_actual]
                    pred = rolling_df[col_pred]
                    mask = actual.notna() & pred.notna()

                    if mask.sum() == 0:
                        continue

                    # Regression metrics
                    all_metrics.append({
                        "transform": transform,
                        "forecast_day": i,
                        "MAE": np.mean(np.abs(pred[mask] - actual[mask])),
                        "RMSE": np.sqrt(np.mean((pred[mask] - actual[mask]) ** 2)),
                        "RMSLE": np.sqrt(np.mean((np.log1p(pred[mask] / 30) - np.log1p(actual[mask] / 30)) ** 2))
                    })

                    # Classification into concentration levels
                    actual_cls = actual[mask].apply(classify)
                    pred_cls = pred[mask].apply(classify)

                    cm = confusion_matrix(actual_cls, pred_cls, labels=labels)
                    report = classification_report(actual_cls, pred_cls, labels=labels, output_dict=True)

                    transform_class_results[i] = {
                        "confusion_matrix": cm,
                        "classification_report": report
                    }

            self.metrics_dfs[transform] = pd.DataFrame(all_metrics)
            classification_results[transform] = transform_class_results

        self.classification_results = classification_results

        return self.metrics_dfs, self.rolling_dfs

    def rolling_forecast(self, start_date, end_date, horizon=30):
        """
        Rolling-origin evaluation: refit on every valid date and forecast `horizon` days.

        For each date returned by ``_get_valid_dates`` the model is refitted on
        the training set selected by ``self.mod`` (all rows before the date when
        ``mod == 0``, otherwise ``_get_mixed_recent_and_seasonal_training_data``)
        and used to predict the next ``horizon`` calendar days.

        Forecast days for which no prediction could be produced (for example
        because a regressor value is missing and the row was dropped) are
        reported as NaN, not 0, so downstream NaN masks exclude them from the
        metrics instead of scoring them as a zero prediction.

        Returns
        -------
        DataFrame (also stored in ``self.rolling_df``) with one row per origin
        date: ``date``, ``actual``, ``prediction`` (day-0 forecast) and, for each
        ``i`` in ``range(horizon)``, ``pred{i}d``, ``pred{i}d_lower`` and
        ``pred{i}d_upper``. Point predictions are clipped at 0 and rounded;
        bounds are clipped at 0.
        """
        results = []
        data = self.data.copy()
        valid_dates = self._get_valid_dates(start_date, end_date)

        # Date-indexed view for regressor lookup, built once for the whole run.
        exog_by_date = data.set_index("ds") if self.exog_cols else None

        for current_date in tqdm(valid_dates):
            # Build the training set according to the selected mode
            if self.mod==0:
                train_df = data[data['ds'] < current_date].copy()
            else:
                train_df = self._get_mixed_recent_and_seasonal_training_data(current_date)
            # Refit the model
            self.fit(train_df, preprocessed=True)

            # Future frame for prediction, with regressor values where available
            future = pd.date_range(start=current_date, periods=horizon)
            future_df = pd.DataFrame({"ds": future})
            if exog_by_date is not None:
                exog_future = exog_by_date.reindex(future)
                for reg in self.exog_cols:
                    future_df[reg] = exog_future[reg].values
            future_df = future_df.dropna(subset=self.exog_cols)

            # NaN marks "no forecast available" for that horizon step.
            preds = np.full(horizon, np.nan)
            lower_bounds = np.full(horizon, np.nan)
            upper_bounds = np.full(horizon, np.nan)

            if not future_df.empty:
                forecast = self.fitted_model.predict(future_df)
                forecast['yhat'] = self._inverse_transform(forecast['yhat'])
                forecast['yhat_lower'] = self._inverse_transform(forecast['yhat_lower'])
                forecast['yhat_upper'] = self._inverse_transform(forecast['yhat_upper'])

                for _, rowf in forecast.iterrows():
                    # Slot is the number of days after the forecast origin.
                    idx = (rowf['ds'] - current_date).days
                    if 0 <= idx < horizon:
                        preds[idx] = np.round(np.clip(rowf['yhat'], 0, None))
                        lower_bounds[idx] = np.clip(rowf['yhat_lower'], 0, None)
                        upper_bounds[idx] = np.clip(rowf['yhat_upper'], 0, None)

            actual = data.loc[data[self.date_col] == current_date, self.value_col]
            actual = actual.iloc[0] if not actual.empty else np.nan

            result_row = {
                "date": current_date,
                "actual": actual,
                "prediction": preds[0] if len(preds) > 0 else np.nan
            }

            for i in range(horizon):
                result_row[f"pred{i}d"] = preds[i]
                result_row[f"pred{i}d_lower"] = lower_bounds[i]
                result_row[f"pred{i}d_upper"] = upper_bounds[i]

            results.append(result_row)

        self.rolling_df = pd.DataFrame(results)
        return self.rolling_df

In [None]:
class RandomForestPipeline:
    def __init__(self, transform=None, exog_cols=None, fourier_order=2,  value_col="value", date_col="date"):
        assert transform in ['log', 'boxcox', None], "Transformacija mora biti 'log', 'boxcox' ili None."
        self.transform = transform
        self.exog_cols = exog_cols if exog_cols is not None else []  # meteo promenljive
        self.fourier_order = fourier_order # max Furijeov red
        self.value_col = value_col # naziv kolone za vrednost
        self.date_col = date_col # naziv vremenske kolone
        self.max_lags = 0 # broj maksimalnih kašnjenja u vremenu
        self.rolling_df = None # Evaluacija modela
        self.auto_res = None # Rezultati automatskog podešavanja
        self.fitted_model = None # Fitovani RF model
        self.rolling_dfs = None # Lista rezultata po transformaciji
        self.summary_texts = None # lista Feature Importance po transformaciji
        self.metrics_df = None # Objedinjene metrike performansi
        self.classification_results = None #Klasifikacija polena po nivoima

        self.max_lags=5
        self.n_estimators=100,
        self.max_depth=None
        self.max_features='sqrt'
        self.min_samples_split=2

    def _apply_transform(self, series):
        """
        Transformacija niz.
        """
        if self.transform == 'log':
            transformed = np.log1p(series/30)
            self.lmbda = None
        elif self.transform == 'boxcox':
            safe_val = series + 1e-1
            transformed, self.lmbda = boxcox(safe_val)
        else:  # None
            transformed = series
            self.lmbda = None
        return transformed

    def _inverse_transform(self, series):
        """
        Inverzna transformacija serije.
        """
        if self.transform == 'log':
            return 30 * np.expm1(series)
        elif self.transform == 'boxcox':
            return inv_boxcox(series, self.lmbda) - 1e-1
        return series

    def _get_period(self):
        """
        Detekcija dominantnog perioda u seriji pomoću periodograma.
        """
        # Dominantni period iz periodograma
        frequencies, power = periodogram(self.data['transform'])
        dominant_idx = np.argmax(power[1:]) + 1
        dominant_period = int(np.round(1 / frequencies[dominant_idx]))

        # Očekivani broj dana trajanja sezone
        start = pd.to_datetime(f'2024-{self.start_md[0]:02d}-{self.start_md[1]:02d}', format='%Y-%m-%d')
        end = pd.to_datetime(f'2024-{self.end_md[0]:02d}-{self.end_md[1]:02d}', format='%Y-%m-%d')

        expected_season_days = (end - start).days + 1

        tolerance = 10
        if abs(dominant_period - expected_season_days) > tolerance:
            dominant_period = expected_season_days

        return dominant_period

    def _get_md(self):
        """
        Detekcija početka i kraja sezone.
        """
        min_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).min()
        max_md = self.data[self.date_col].apply(lambda x: (x.month, x.day)).max()
        return min_md, max_md
    
    def _generate_t(self, dates):
        """
        Generinje linearne komponente
        """
        dates = pd.to_datetime(dates)
        df = pd.DataFrame({'date': dates})
        max_year = df['date'].dt.year.max()
        all_dates = []

        # Kreiranje kompletanog niza svih datuma u sezonama od min_year do max_year
        for year in range(self.min_year, max_year + 1):
            start_date =  pd.Timestamp(year=year, month=self.start_md[0], day=self.start_md[1])
            end_date = pd.Timestamp(year=year, month=self.end_md[0], day=self.end_md[1])
            season_dates = pd.date_range(start=start_date, end=end_date, freq='D')
            all_dates.extend(season_dates.tolist())
        
        # Mapiranje datum -> t
        t_map = {date: idx for idx, date in enumerate(all_dates)}
        t_list = [t_map.get(date, np.nan) for date in df['date']]
        
        return np.array(t_list)

    def _add_fourier_terms(self, K):
        """
        Dodavanje K Furijeovih komponenti kao egzogene promenljive.
        """
        self.data['t'] = self._generate_t(self.data[self.date_col])

        for k in range(1, K + 1):
            cname = f'cos_{k}'
            sname = f'sin_{k}'
            valid_mask = self.data['t'].notna()

            # Izračunavanje cos i sin komponenti
            self.data.loc[valid_mask, cname] = np.cos(2 * np.pi * k * self.data.loc[valid_mask, 't'] / self.period)
            self.data.loc[valid_mask, sname] = np.sin(2 * np.pi * k * self.data.loc[valid_mask, 't'] / self.period)

            for name in [cname, sname]:
                if name not in self.features_names:
                    self.features_names.append(name)

    def _check_exog_cols(self, df):
        """
        Provera da li df sadrži sve exog kolone.
        """
        missing_cols = [col for col in self.exog_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Nedostaju egzogene kolone u df: {missing_cols}")
        
    def _compute_fourier_for_dates(self, dates):
        """
        Generinje Furijeovih redova za listu datuma.
        """
        t = np.array(self._generate_t(dates))
        dct = {}
        dct = {self.date_col: dates}
        for k in range(1, self.fourier_order + 1):
            cos_k = np.full_like(t, np.nan, dtype=np.float64)
            sin_k = np.full_like(t, np.nan, dtype=np.float64)
            valid_mask = ~np.isnan(t)

            # Izračunavanje cos i sin komponenti
            cos_k[valid_mask] = np.cos(2 * np.pi * k * t[valid_mask] / self.period)
            sin_k[valid_mask] = np.sin(2 * np.pi * k * t[valid_mask] / self.period)

            dct[f'cos_{k}'] = cos_k
            dct[f'sin_{k}'] = sin_k

        return pd.DataFrame(dct)
        
    def _get_next_valid_dates(self, n_days):
        """
        Vraca sledećih n_days validnih datuma od poslednjeg definisanog datuma.
        """
        last_date = self.data[self.date_col].max()
        found = []
        candidate = last_date + pd.Timedelta(days=1)
        while len(found) < n_days:
            md = (candidate.month, candidate.day)
            if self.start_md <= md < self.end_md:
                found.append(candidate)
            candidate += pd.Timedelta(days=1)
        return found

    def _get_valid_dates(self, start_date, end_date):
        """
        Pronalazi sve datume izmedju start_date i end_date koji su u sezoni.
        """
        all_dates = pd.date_range(start=start_date, end=end_date)
        valid_dates = [d for d in all_dates if self.start_md <= (d.month, d.day) < self.end_md]
        return valid_dates

    def _get_lag_value(self, current_date, lag):
        """
        Vraća vrednost sa određenim vremenskim kašnjenjem u odnosu na zadati datum.
        """
        lag_date = current_date - pd.Timedelta(days=lag)
        value = self.data.loc[self.data[self.date_col] == lag_date, 'transform']
        return value.values[0] if not value.empty else 0

    def _generate_lags(self):
        """
        Generisanje lagova do max_lags
        """
    
        # Brisanje postojećih lagova
        lag_cols = [col for col in self.data.columns if col.startswith('lag_')]
        self.data.drop(columns=lag_cols, inplace=True, errors='ignore')
        self.features_names = [col for col in self.features_names if not col.startswith('lag_')]

        # Generisanje novih lag kolona
        for lag in range(1, self.max_lags + 1):
            col_name = f'lag_{lag}'
            self.data[col_name] = self.data.apply(
                lambda row: self._get_lag_value(row[self.date_col], lag), axis=1
            ).fillna(0)

            self.features_names.append(col_name)

    def _get_rolling_value(self, current_date, window=7):
        """
        Prosečna vrednost za prethodnih 'window' dana.
        """
        # Definisanje početaka i kraja intervala
        start_date = current_date - pd.Timedelta(days=window)
        end_date = current_date - pd.Timedelta(days=1)
        
        # Generisanje liste svih datuma u intervalu
        date_range = pd.date_range(start=start_date, end=end_date)
        
        # Mapiranje datum -> vredonst
        window_df = self.data[self.data[self.date_col].isin(date_range)]
        value_map = dict(zip(window_df[self.date_col], window_df['transform']))
        values = [value_map.get(date, 0) for date in date_range]
        
        return sum(values) / window

    def _get_last_year_value(self, current_date):
        """
        Vrednost za isti datum prethodne godine.
        """
        # Ako je trenutna godina jednaka minimalnoj godini, vraća None
        if current_date.year == self.min_year:
            return None
        
        prev_year_date = current_date - pd.DateOffset(years=1)
        value = self.data.loc[self.data[self.date_col] == prev_year_date, 'transform']
        return value.values[0] if not value.empty else None
    
    def _get_last_year_window(self, current_date, semi_window=3):
        """
        Prosečna vrednost za prozor ±`semi_window` dana oko istog datuma prethodne godine.
        """
        # Ako je trenutna godina jednaka minimalnoj godini, vraća None
        if current_date.year == self.min_year:
            return None

        prev_year_date = current_date - pd.DateOffset(years=1)
        
        # Generisanje liste datuma u prozoru ±semi_window dana
        date_range = pd.date_range(start=prev_year_date - pd.Timedelta(days=semi_window),
                                end=prev_year_date + pd.Timedelta(days=semi_window))
        
        # Mapiranje datum -> vredonst
        window_df = self.data[self.data[self.date_col].isin(date_range)]
        value_map = dict(zip(window_df[self.date_col], window_df['transform']))
        values = [value_map.get(date, 0) for date in date_range]
        
        return sum(values) / len(date_range)

    def _preprocess(self, data):
        """
        Predprocesiranje ulaznog DataFrame-a pre treniranja modela.
        """

        # Provera prisustva svih egzogenih kolona
        self._check_exog_cols(data)

        # Kreiranje kopije ulaznih podataka
        self.data = data.copy()
        
        self.features_names = []

        # Transformacija target kolone
        self.data['transform'] = self._apply_transform(self.data[self.value_col])

        # Detekcija minimalne godine u podacima (za generisanje t)
        self.min_year = self.data[self.date_col].dt.year.min()

        # Detekcija početnog i krajnjeg meseca/dana u sezoni
        self.start_md, self.end_md = self._get_md()

        # Detekcija dominantnog perioda serije pomoću periodograma
        self.period = self._get_period()

        # Generisanje fourier komponenti za sezonske obrasce
        self._add_fourier_terms(self.fourier_order)

        # Broj dana od početka sezone te godine
        self.data['days_from_start'] = self.data.apply(lambda row: (row[self.date_col] - pd.Timestamp(year=row[self.date_col].year, month=self.start_md[0], day=self.start_md[1])).days, axis=1)
        self.features_names.append('days_from_start')

        # Godina     
        self.data['year'] = self.data[self.date_col].dt.year
        self.features_names += ['year']

        # Moving average (7 dana)
        self.data['rolling_7'] = self.data.apply(lambda row: self._get_rolling_value(row[self.date_col], window=7), axis=1).fillna(0)
        self.features_names.append('rolling_7')

        # Isti dan prošle godine
        self.data['last_year'] = self.data.apply(lambda row: self._get_last_year_value(row[self.date_col]), axis=1)
        self.features_names.append('last_year')

        # Prozor +/- 3 dana prošle godine
        self.data['last_year_window'] = self.data.apply(lambda row: self._get_last_year_window(row[self.date_col]), axis=1)
        self.features_names.append('last_year_window')

        # Lagovi 
        self._generate_lags()

        self.data.dropna()

    def _get_lag_value_combined(self, current_date, lag, future_df):
        """
        Vraća vrednost sa određenim vremenskim kašnjenjem u odnosu na zadati datum.
        """
        lag_date = current_date - pd.Timedelta(days=lag)
        value_future = future_df.loc[future_df[self.date_col] == lag_date, 'transform'] if 'transform' in future_df.columns else pd.Series()
        if not value_future.empty:
            return value_future.values[0]
        value_past = self.data.loc[self.data[self.date_col] == lag_date, 'transform']
        return value_past.values[0] if not value_past.empty else 0

    def _get_rolling_value_combined(self, current_date, window=7, future_df=None):
        """
        Prosečna vrednost za prethodnih 'window' dana.
        """
        start_date = current_date - pd.Timedelta(days=window-1)
        past_window = self.data[(self.data[self.date_col] >= start_date) & (self.data[self.date_col] <= current_date)][[self.date_col, 'transform']]
        if future_df is not None and 'transform' in future_df.columns:
            future_window = future_df[(future_df[self.date_col] >= start_date) & (future_df[self.date_col] <= current_date)][[self.date_col, 'transform']]
            combined_window = pd.concat([past_window, future_window]).drop_duplicates(subset=self.date_col).sort_values(self.date_col)
        else:
            combined_window = past_window
        return combined_window['transform'].sum()/window if not combined_window.empty else 0

    def _generate_forecast_row(self, forecast_date, last_known):
        """"
        Generiše jedan red feature-a za predikciju datog forecast_date koristeći last_known DataFrame.
        """
        row = {}
        row[self.date_col] = forecast_date

        # Broj dana od početka sezone te godine
        row['days_from_start'] = (forecast_date - pd.Timestamp(year=forecast_date.year, month=self.start_md[0], day=self.start_md[1])).days

        # Godina
        row['year'] = forecast_date.year

        # Lagovi
        for lag in range(1, self.max_lags + 1):
            col_name = f'lag_{lag}'
            lag_date = forecast_date - pd.Timedelta(days=lag)
            value_future = last_known.loc[last_known[self.date_col] == lag_date, 'transform']
            row[col_name] = value_future.values[0] if not value_future.empty else 0

        # Moving average (7 dana)
        start_date_r7 = forecast_date - pd.Timedelta(days=7)
        end_date_r7 = forecast_date - pd.Timedelta(days=1)
        date_range_r7 = pd.date_range(start=start_date_r7, end=end_date_r7)
        window_df = last_known[last_known[self.date_col].isin(date_range_r7)]
        value_map = dict(zip(window_df[self.date_col], window_df['transform']))
        values = [value_map.get(date, 0) for date in date_range_r7]
        row['rolling_7'] = sum(values) / len(values)

        # Isti dan prošle godine
        row['last_year'] = self._get_last_year_value(forecast_date)

        # Prozor +/- 3 dana prošle godine
        row['last_year_window'] = self._get_last_year_window(forecast_date)

        # Furijeovi redovi
        fourier_df = self._compute_fourier_for_dates([forecast_date])
        row.update(fourier_df.iloc[0].to_dict())

        # Egzogene promenljive
        if self.exog_cols:
            for ex in self.exog_cols:
                ex_value = self.data.loc[self.data[self.date_col] == forecast_date, ex]
                row[ex] = ex_value.values[0] if not ex_value.empty else None

        return row

    def _set_params(self, max_lags=None, n_estimators=None, 
                 max_depth=-1, max_features=None, min_samples_split=None, fourier_order = None):
        """
        Postavljanje hiperparametara Random Forest modela.
        """
        if max_lags: 
            self.max_lags = max_lags
            self._generate_lags()
        if n_estimators: self.n_estimators = n_estimators
        if max_depth!=-1: self.max_depth = max_depth
        if max_features: self.max_features = max_features
        if min_samples_split: self.min_samples_split = min_samples_split
        if fourier_order is not None: self.fourier_order = fourier_order

    def fit(self, df, max_lags=None, n_estimators=None, 
            max_depth=-1, max_features=None, min_samples_split=None, preprocessed = False):
        """
        Train the random forest on ``df``.

        Parameters
        ----------
        df : raw input frame, or an already preprocessed frame when
            ``preprocessed=True`` (it must then contain 'transform', the
            exogenous columns and every column in ``self.features_names``).
        max_lags, n_estimators, max_depth, max_features, min_samples_split :
            optional hyperparameter overrides, applied only when
            ``preprocessed`` is False.
        preprocessed : skip parameter updates and preprocessing.

        Returns
        -------
        The fitted estimator (also stored in ``self.fitted_model``). The method
        used to return None, so callers that keep the return value to decide
        whether a model already exists always saw None.

        Rows with any missing value are dropped before training.
        """
        if not preprocessed:
            # Set max_lags directly: _preprocess() below builds the lag columns,
            # and no preprocessed data may exist yet to rebuild them on.
            if max_lags:
                self.max_lags = max_lags
            self._set_params(None, n_estimators, max_depth, max_features, min_samples_split)
            self._preprocess(df)
            train = self.data.dropna()
        else:
            train = df.dropna()

        X = train[self.exog_cols + self.features_names]
        y = train['transform']

        # Model initialisation
        model = RandomForestRegressor(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                random_state=42, n_jobs=-1
            )
        self.fitted_model = model.fit(X, y)
        return self.fitted_model

    def forecast(self, days=1, exog_df=None):
        """
        Predviđa vrednosti za narednih `days` dana.
        """
        forecast_dates = self._get_next_valid_dates(n_days=days)
        preds = []
        last_known = self.data.copy()

        for forecast_date in forecast_dates:
             # Generisanje feature-e za forecast_date
            row = self._generate_forecast_row(forecast_date, last_known)

            # Dopuni egzogene promenljive iz exog_df
            if self.exog_cols:
                if exog_df is None:
                    raise ValueError("exog_df mora biti prosleđen jer su exog_cols definisane.")
                self._check_exog_cols(exog_df)

                exog_df = exog_df.copy()
                exog_row = exog_df.loc[exog_df[self.date_col] == forecast_date]

                for ex in self.exog_cols:
                    if not exog_row.empty:
                        row[ex] = exog_row.iloc[0][ex]
                    else:
                        row[ex] = None

            # Predikcija
            preds_df = pd.DataFrame([row])
            feature_cols = self.exog_cols + self.features_names
            pred_value = self.fitted_model.predict(preds_df[feature_cols])[0]

            pred_value_orig = self._inverse_transform(pred_value)
            pred_value_orig = max(0, round(pred_value_orig))

            preds.append(pred_value_orig)

            temp_df = pd.DataFrame([{self.date_col: forecast_date, self.value_col: pred_value_orig, 'transform': pred_value}])
            last_known = pd.concat([last_known, temp_df], ignore_index=True)

        # Vraća kao pd.Series sa forecast_dates kao index
        return pd.Series(preds, index=forecast_dates, name='prediction')
    
    def grid_search(self, data, param_grid, start_date, end_date, transforms=0):
        """
        Grid search over random forest hyperparameters, scored with a one-step
        rolling forecast, across one or more target transforms.

        Parameters
        ----------
        data : raw input frame; preprocessed again for every combination because
            ``max_lags`` / ``fourier_order`` change the feature set.
        param_grid : dict accepted by ``sklearn.model_selection.ParameterGrid``;
            recognised keys are n_estimators, max_depth, max_features,
            min_samples_split, fourier_order and max_lags.
        start_date, end_date : evaluation range forwarded to ``rolling_forecast``.
        transforms : iterable of transforms to try; ``0`` (legacy default) or
            ``None`` means "only the currently configured ``self.transform``".

        Returns
        -------
        dict mapping each transform to a DataFrame of results sorted by RMSE
        (also stored in ``self.auto_res``). A transform for which no combination
        could be scored maps to an empty DataFrame.

        Side effects
        ------------
        The pipeline is left configured with the overall best (lowest RMSE)
        transform and parameters, and ``data`` is preprocessed with them.
        """

        if transforms is None or transforms == 0:
            transforms = [self.transform]

        grid = list(ParameterGrid(param_grid))
        self.auto_res = {}                 # {transform: DataFrame with metrics}

        data_origin = data.copy()         # untouched copy of the input
        best_all = []                     # best row of every transform

        for transform in tqdm(transforms, desc="Transformacije"):
            self.transform = transform
            results = []

            for params in tqdm(grid, desc=f"Grid za {transform}", leave=False):

                self._set_params(
                    n_estimators=params.get('n_estimators', None),
                    max_depth=params.get('max_depth', -1),
                    max_features=params.get('max_features', None),
                    min_samples_split=params.get('min_samples_split', None),
                    fourier_order=params.get('fourier_order', None)
                )
                # max_lags is set directly; _preprocess() below builds the lag columns.
                if params.get('max_lags', None):
                    self.max_lags = params.get('max_lags', None)

                # Preprocess the input for this configuration
                self._preprocess(data)

                # One-step rolling forecast
                rolling_df = self.rolling_forecast(start_date=start_date, end_date=end_date, horizon=1)
                if rolling_df.empty:
                    continue

                true = rolling_df['actual'].values
                pred = rolling_df['prediction'].values

                mask = ~np.isnan(true) & ~np.isnan(pred)
                true = true[mask]
                pred = pred[mask]

                if len(true) == 0:
                    continue

                results.append({
                    'transform': transform,
                    'params': params,
                    'fourier_order': self.fourier_order,
                    'rmse': np.sqrt(np.mean((true - pred) ** 2)),
                    'mae': np.mean(np.abs(true - pred)),
                    'rmsle': np.sqrt(np.mean((np.log1p(true / 30) - np.log1p(pred / 30)) ** 2))
                })

            # Sorting an empty frame by 'rmse' would raise KeyError (no such
            # column), so only sort when at least one combination was scored.
            results_df = pd.DataFrame(results)
            if not results_df.empty:
                results_df = results_df.sort_values('rmse')
                best_all.append(results_df.iloc[0])

            self.auto_res[transform] = results_df

        # Best model across all transforms
        if best_all:
            best_overall = pd.DataFrame(best_all).sort_values('rmse').iloc[0]

            self.transform = best_overall['transform']
            best_params = dict(best_overall['params'])
            # Lags are rebuilt by the preprocessing below; set the count directly.
            best_lags = best_params.pop('max_lags', None)
            if best_lags:
                self.max_lags = best_lags
            self._set_params(**best_params)

        # Rebuild the modelling frame from the original data with the final settings
        self._preprocess(data_origin)

        return self.auto_res

    def evaluate_model(self, start_date, end_date, horizon=30, refit_every=15, low=30, high=100):
        """
        Evaluate the best random forest configuration of every transform found
        by grid_search().

        For each transform the row with the lowest RMSLE in ``self.auto_res`` is
        applied, a rolling forecast with the given ``horizon`` is run, and for
        every forecast day ``i`` MAE / RMSE / RMSLE plus a three-class
        (low / moderate / high) confusion matrix and classification report are
        computed. Feature importances of the last fitted model are stored per
        transform.

        NOTE(review): the actual value for forecast day ``i`` is taken with
        ``shift(-i)``, which assumes consecutive rows of the rolling frame are
        consecutive days - confirm for ranges spanning several seasons.

        Returns (self.metrics_dfs, self.rolling_dfs, self.feature_importances),
        all dicts keyed by transform. Classification outputs are stored in
        ``self.classification_results``.

        Raises ValueError when ``self.auto_res`` is not a dict.
        """

        if not isinstance(self.auto_res, dict):
            raise ValueError("Nema rezultata iz grid_search(). Pokreni grid_search() pre evaluate_model().")

        def classify(x):
            # low / moderate / high by the two thresholds
            if x < low:
                return 'low'
            return 'moderate' if x < high else 'high'

        labels = ['low', 'moderate', 'high']
        base_data = self.data.copy()

        self.rolling_dfs = {}
        self.feature_importances = {}
        self.metrics_dfs = {}
        per_transform_classes = {}

        for transform, grid_df in self.auto_res.items():
            if grid_df.empty:
                continue

            # Apply the best configuration of this transform
            winner = grid_df.sort_values("rmsle").iloc[0]
            self.transform = winner['transform']
            self.fourier_order = winner.get('fourier_order', None)
            self._set_params(**dict(winner['params']))
            self._preprocess(base_data)

            rolling_df = self.rolling_forecast(
                start_date=start_date,
                end_date=end_date,
                horizon=horizon,
                refit_every=refit_every
            )
            self.rolling_dfs[transform] = rolling_df

            self.feature_importances[transform] = pd.Series(
                self.fitted_model.feature_importances_,
                index=self.exog_cols + self.features_names,
            ).sort_values(ascending=False)

            metric_rows = []
            classes_by_day = {}

            for day in range(horizon):
                actual_col = f"actual{day}d"
                rolling_df[actual_col] = rolling_df["actual"].shift(-day)

                actual = rolling_df[actual_col]
                pred = rolling_df[f"pred{day}d"]
                usable = actual.notna() & pred.notna()

                if not usable.any():
                    continue

                y_true = actual[usable]
                y_pred = pred[usable]
                error = y_pred - y_true

                # Regression metrics
                metric_rows.append({
                    "transform": transform,
                    "forecast_day": day,
                    "MAE": np.mean(np.abs(error)),
                    "RMSE": np.sqrt(np.mean(error ** 2)),
                    "RMSLE": np.sqrt(np.mean((np.log1p(y_pred / 30) - np.log1p(y_true / 30)) ** 2))
                })

                # Classification metrics
                true_classes = y_true.apply(classify)
                pred_classes = y_pred.apply(classify)

                classes_by_day[day] = {
                    "confusion_matrix": confusion_matrix(true_classes, pred_classes, labels=labels),
                    "classification_report": classification_report(true_classes, pred_classes, labels=labels, output_dict=True)
                }

            self.metrics_dfs[transform] = pd.DataFrame(metric_rows)
            per_transform_classes[transform] = classes_by_day

        self.classification_results = per_transform_classes

        return self.metrics_dfs, self.rolling_dfs, self.feature_importances

    
    def rolling_forecast(self, start_date, end_date, horizon=1, refit_every=15):
        """
        Rolling-origin forecast evaluation of the model over a date range.

        Every date returned by ``self._get_valid_dates(start_date, end_date)``
        is used as a forecast origin. The model is fitted on the rows of
        ``self.data`` dated strictly before the origin, and a recursive
        multi-step forecast is produced: each prediction is appended to the
        known history before the feature row of the next step is generated.
        Only as many steps are forecast as there are rows of ``self.data``
        inside the ``horizon``-day window that starts at the origin.

        Parameters
        ----------
        start_date, end_date
            Bounds of the evaluation window, forwarded unchanged to
            ``self._get_valid_dates``.
        horizon : int, default 1
            Number of consecutive days covered by each forecast.
        refit_every : int, default 15
            Number of origins after which the model is fitted again. The
            first origin that has data to forecast always triggers a fit.

        Returns
        -------
        pandas.DataFrame
            One row per origin with the columns ``date``, ``actual`` (value
            observed on the origin date, NaN when absent), ``prediction``
            (first forecast step) and ``pred{i}d`` for every ``i`` in
            ``range(horizon)``. Steps that were not forecast are NaN.
            The same frame is stored in ``self.rolling_df``.
        """
        results = []
        data = self.data.copy()
        valid_dates = self._get_valid_dates(start_date, end_date)
        model_fit = None
        # Start at the threshold so the first usable origin fits the model.
        count_since_refit = refit_every

        for current_date in tqdm(valid_dates, desc="Rolling forecast"):
            # Training set: everything observed strictly before the origin.
            train_df = data[data[self.date_col] < current_date].copy()

            # Rows of the data that fall inside the forecast window
            # (inner merge, so dates missing from the data are dropped).
            future = pd.date_range(start=current_date, periods=horizon)
            future_df = pd.DataFrame({self.date_col: future})
            future_df = pd.merge(future_df, data, on=self.date_col)

            # NaN marks steps that are never forecast. A zero placeholder
            # would be indistinguishable from a genuine prediction of 0 and
            # would pass the `notna()` masks used when computing metrics.
            preds = np.full(horizon, np.nan)
            last_known = train_df.copy()

            if not future_df.empty:
                # Refit when the refit interval has elapsed (or no fit exists yet).
                if count_since_refit >= refit_every or model_fit is None:
                    model_fit = self.fit(train_df, preprocessed=True)
                    count_since_refit = 0

                for h in range(len(future_df)):
                    forecast_date = current_date + pd.Timedelta(days=h)

                    # Build the feature row for this step from the history
                    # known so far (observations plus earlier predictions).
                    row = self._generate_forecast_row(forecast_date, last_known)
                    preds_df = pd.DataFrame([row])

                    feature_cols = self.exog_cols + self.features_names
                    pred_value = self.fitted_model.predict(preds_df[feature_cols])[0]

                    # Back to the original scale, as a non-negative integer.
                    pred_value_orig = self._inverse_transform(pred_value)
                    pred_value_orig = max(0, round(pred_value_orig))

                    # Feed the prediction back into the history so that it is
                    # available when the next step's row is generated.
                    temp_df = pd.DataFrame([{self.value_col: pred_value_orig, self.date_col: forecast_date, 'transform': pred_value}])
                    last_known = pd.concat([last_known, temp_df], ignore_index=True)

                    preds[h] = pred_value_orig

            # Observed value on the origin date (NaN if there is none).
            actual = data.loc[data[self.date_col] == current_date, self.value_col]
            actual = actual.iloc[0] if not actual.empty else np.nan

            result_row = {
                "date": current_date,
                "actual": actual,
                "prediction": preds[0] if len(preds) > 0 else np.nan
            }
            for i in range(horizon):
                result_row[f"pred{i}d"] = preds[i]
            results.append(result_row)

            # Counted per origin, including origins without data to forecast.
            count_since_refit += 1

        self.rolling_df = pd.DataFrame(results)
        return self.rolling_df


# Učitavanje podataka

In [3]:
# Read the two CSV inputs used by the modelling cells below; the 'date'
# column of each file is parsed to datetime so the frames can be joined on it.
df = pd.read_csv(
    'spatiotemporal_kriging_predictions.csv',
    parse_dates=['date'],
)
weather = pd.read_csv(
    'meteo//meteo_df.csv',
    parse_dates=['date'],
)

# Primena SARIMAX modela

In [None]:
# Experiment grid: with / without meteorological regressors, per allergen
# and per location.
meteos = [True, False]
allergens = ['AMBROZIJA', 'JOVA', 'TRAVE']
locations = ['PANČEVO', 'POŽAREVAC', 'KRAGUJEVAC']

# The evaluation window is the same for every run.
start_date = '2024-01-01'
end_date = '2024-12-31'

for meteo in meteos:
    # File-name suffix and exogenous columns of this variant.
    if meteo:
        meteo_str = ''
        exog_cols = ['temperature', 'humidity']
    else:
        meteo_str = '_nometeo'
        exog_cols = []

    for allergen, location in itertools.product(allergens, locations):
        print(f"{allergen} - {location}")

        # Rows of this allergen/location pair, left-joined with the weather data.
        data = df[(df['allergen'] == allergen) & (df['location'] == location)]
        data = data.merge(weather, how='left', on=['date', 'location'])

        # Build the pipeline and run its order search.
        model = SarimaPipeline(exog_cols=exog_cols)
        res = model.auto_arima(
            data,
            transforms=['log', 'boxcox'],
            fourier_orders=[0, 3],
        )

        # Lower threshold passed to the evaluation depends on the allergen.
        low = {'AMBROZIJA': 30}.get(allergen, 60)
        metrics_df = model.evaluate_model(
            start_date,
            end_date,
            horizon=30,
            refit_every=5,
            low=low,
            high=100,
        )

        # Persist the pipeline; the fitted estimator is dropped before pickling.
        model_path = f"models/sarimax/{allergen}_{location}{meteo_str}.pkl"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)

        model.fitted_model = None
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)

        print(f"Snimljen model za: {allergen} - {location}")

# Primena Prophet modela

In [None]:
# Experiment grid: with / without meteorological regressors, per allergen
# and per location.
meteos = [True, False]
allergens = ['AMBROZIJA', 'JOVA', 'TRAVE']
locations = ['PANČEVO', 'POŽAREVAC', 'KRAGUJEVAC']

# The evaluation window is the same for every run.
start_date = '2024-01-01'
end_date = '2024-12-31'

for meteo in meteos:
    # File-name suffix and exogenous columns of this variant.
    if meteo:
        meteo_str = ''
        exog_cols = ['temperature', 'humidity']
    else:
        meteo_str = '_nometeo'
        exog_cols = []

    for allergen, location in itertools.product(allergens, locations):
        print(f"{allergen} - {location}")

        # Rows of this allergen/location pair, left-joined with the weather data.
        data = df[(df['allergen'] == allergen) & (df['location'] == location)]
        data = data.merge(weather, how='left', on=['date', 'location'])

        # Build the pipeline; the grid is rebuilt for every run so each
        # search receives its own dictionary.
        model = ProphetPipeline(exog_cols=exog_cols)
        param_grid = {
            'changepoint_prior_scale': [0.1, 0.2, 0.5],
            'seasonality_prior_scale': [1.0, 2, 5.0],
            'seasonality_mode': ['additive', 'multiplicative'],
            'changepoint_range': [0.6, 0.8, 0.95],
            'mod': [0, 1],
        }

        # Hyper-parameter search on 2023.
        model.grid_search(
            data=data,
            param_grid=param_grid,
            start_date='2023-01-01',
            end_date="2023-12-31",
            transforms=[None, 'boxcox', 'log'],
        )

        # Lower threshold passed to the evaluation depends on the allergen.
        low = {'AMBROZIJA': 30}.get(allergen, 60)
        metrics_df = model.evaluate_model(
            start_date,
            end_date,
            horizon=30,
            low=low,
            high=100,
        )

        # Persist the pipeline; the fitted estimator is dropped before pickling.
        model_path = f"models/Prophet/{allergen}_{location}{meteo_str}.pkl"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)

        model.fitted_model = None
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)

        print(f"Snimljen model za: {allergen} - {location}")


# Primena Random Forest modela

In [None]:
# Experiment grid: with / without meteorological regressors, per allergen
# and per location.
meteos = [True, False]
allergens = ['AMBROZIJA', 'JOVA', 'TRAVE']
locations = ['PANČEVO', 'POŽAREVAC', 'KRAGUJEVAC']

# The evaluation window is the same for every run.
start_date = '2024-01-01'
end_date = '2024-12-31'

for meteo in meteos:
    # File-name suffix and exogenous columns of this variant.
    if meteo:
        meteo_str = ''
        exog_cols = ['temperature', 'humidity', 'wind', 'precipitation', 'wind_direction']
    else:
        meteo_str = '_nometeo'
        exog_cols = []

    for allergen, location in itertools.product(allergens, locations):
        print(f"{allergen} - {location}")

        # Rows of this allergen/location pair, left-joined with the weather data.
        data = df[(df['allergen'] == allergen) & (df['location'] == location)]
        data = data.merge(weather, how='left', on=['date', 'location'])

        # Build the pipeline; the grid is rebuilt for every run so each
        # search receives its own dictionary.
        model = RandomForestPipeline(exog_cols=exog_cols)
        param_grid = {
            'n_estimators': [200, 500],
            'max_depth': [5, 10, None],
            'max_features': ['log2', 'sqrt'],
            'min_samples_split': [2, 5],
            'max_lags': [3, 5],
            'fourier_order': [0, 3],
        }

        # Hyper-parameter search on 2023.
        model.grid_search(
            data=data,
            param_grid=param_grid,
            start_date='2023-01-01',
            end_date="2023-12-31",
            transforms=[None, 'boxcox', 'log'],
        )

        # Lower threshold passed to the evaluation depends on the allergen.
        low = {'AMBROZIJA': 30}.get(allergen, 60)
        metrics_df = model.evaluate_model(
            start_date,
            end_date,
            horizon=30,
            low=low,
            high=100,
        )

        # Persist the pipeline; the fitted estimator is dropped before pickling.
        model_path = f"models/RF/{allergen}_{location}{meteo_str}.pkl"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)

        model.fitted_model = None
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)

        print(f"Snimljen model za: {allergen} - {location}")
