In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# -------------------------
# Config / Setări
# -------------------------
# Default for main(save_csv=...): write intermediate/final tables to CSV when True.
SALVEAZA_CSV = True
# Seed passed as random_state to the estimators that accept one.
RANDOM_STATE = 42
ANI = 5  # historical window, in years, used for the price/macro download

# -------------------------
# Utilitare
# -------------------------
def asigura_folder(path):
    """Create the directory ``path`` (and any missing parents).

    An already existing directory is not an error.
    """
    os.makedirs(name=path, exist_ok=True)

# -------------------------
# 1. Descărcare date
# -------------------------
def descarca_btc_si_macro(ani=ANI):
    """Download daily prices for Bitcoin and a set of macro tickers.

    Parameters
    ----------
    ani : int
        Length of the history window in years (converted as ``ani * 365`` days
        counted back from today).

    Returns
    -------
    pandas.DataFrame
        One column per instrument, labelled with the friendly names from the
        ticker map ('Bitcoin', 'S&P 500', ...), in that order, sorted by index.

    Raises
    ------
    ValueError
        If the download contains neither 'Adj Close' nor 'Close', or if no
        column for BTC-USD is present.
    """
    sfarsit = datetime.today()
    start = sfarsit - timedelta(days=ani*365)

    # ticker symbol -> friendly column name
    # NOTE(review): 'TIPS' is used verbatim as a ticker symbol; confirm it is
    # the intended instrument.
    tickere = {
        'BTC-USD': 'Bitcoin',
        '^GSPC': 'S&P 500',
        'GC=F': 'Aur',
        'CL=F': 'WTI',
        '^VIX': 'VIX',
        'DX-Y.NYB': 'DXY',
        'TIPS': 'TIPS'
    }

    print(f"Se descarcă {len(tickere)} tickere din {start.date()} până în {sfarsit.date()}...")
    raw = yf.download(list(tickere.keys()), start=start, end=sfarsit, interval="1d", progress=False)

    # Prefer 'Adj Close' when the download provides it.
    if 'Adj Close' in raw.columns:
        df_raw = raw['Adj Close'].copy()
    elif 'Close' in raw.columns:
        df_raw = raw['Close'].copy()
    else:
        raise ValueError('Nu există coloana Close sau Adj Close în datele descărcate')

    # Rename by ticker symbol rather than by position: the downloaded columns
    # are not guaranteed to be in request order, so a positional rename can
    # attach the wrong label to a series.
    df_raw = df_raw.rename(columns=tickere)
    if 'Bitcoin' not in df_raw.columns:
        raise ValueError('Nu există date pentru BTC-USD în datele descărcate')

    # Restore the order of the ticker map for the columns that were returned.
    ordine = [nume for nume in tickere.values() if nume in df_raw.columns]
    df_raw = df_raw[ordine].sort_index()
    return df_raw

# -------------------------
# 2. Preprocesare & curățare
# -------------------------
def preproceseaza(df):
    """Clean the merged price table without modifying the input.

    Steps, in order: drop rows where every column is NaN, forward-fill every
    column except 'Bitcoin', drop rows whose 'Bitcoin' value is missing, and
    finally coerce all columns to numeric (unparseable values become NaN).

    Returns the cleaned copy.
    """
    rezultat = df.copy()
    # Everything that is not the target is treated as a macro series.
    coloane_macro = rezultat.columns[rezultat.columns != 'Bitcoin'].tolist()

    rezultat = rezultat.dropna(how='all')
    umplut = rezultat[coloane_macro].ffill()
    rezultat[coloane_macro] = umplut
    rezultat = rezultat.dropna(subset=['Bitcoin'])
    return rezultat.apply(pd.to_numeric, errors='coerce')

# -------------------------
# 3. Eliminare valori aberante (IQR)
# -------------------------
def elimina_outlieri_iqr(df, coloane_numerice=None):
    """Drop rows that fall outside the 1.5*IQR fence in any selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    coloane_numerice : list of column labels, optional
        Columns to test. Defaults to every numeric column of ``df``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        A copy of ``df`` without the flagged rows, and a boolean mask indexed
        like ``df`` that is True for the rows flagged as outliers.

    Notes
    -----
    The fence test ``x < Q1 - 1.5*IQR or x > Q3 + 1.5*IQR`` gives the same
    answer before and after a per-column standardisation, so the columns are
    tested on their original scale. NaN values are never flagged.
    """
    if coloane_numerice is None:
        coloane_numerice = df.select_dtypes(include=[np.number]).columns.tolist()
    coloane_numerice = list(coloane_numerice)

    # Nothing to test: keep every row instead of failing on an empty selection.
    if not coloane_numerice:
        masc_aberante = pd.Series(False, index=df.index)
        return df.copy(), masc_aberante

    date = df[coloane_numerice]
    q1 = date.quantile(0.25)
    q3 = date.quantile(0.75)
    iqr = q3 - q1
    limita_jos = q1 - 1.5*iqr
    limita_sus = q3 + 1.5*iqr

    # A row is an outlier when at least one of its columns breaks its fence.
    sub_limita = date.lt(limita_jos, axis='columns')
    peste_limita = date.gt(limita_sus, axis='columns')
    masc_aberante = (sub_limita | peste_limita).any(axis=1)

    df_curat = df.loc[~masc_aberante].copy()
    return df_curat, masc_aberante

# -------------------------
# 4. Feature engineering
# -------------------------
def adauga_caracteristici_btc(df):
    """Return a copy of ``df`` extended with features derived from 'Bitcoin'.

    Added columns: 1- and 2-step lags, 7- and 14-row rolling means, first
    difference, one-step percentage change, and the 3- and 7-row rolling
    standard deviation of the first difference. Rows containing any NaN
    (including the warm-up rows of the rolling windows) are dropped.
    """
    pret = df['Bitcoin']
    delta = pret.diff(1)

    extins = df.assign(
        BTC_lag1=pret.shift(1),
        BTC_lag2=pret.shift(2),
        BTC_ma7=pret.rolling(7).mean(),
        BTC_ma14=pret.rolling(14).mean(),
        BTC_diff1=delta,
        BTC_pct_change=pret.pct_change(1),
        BTC_vol_3=delta.rolling(3).std(),
        BTC_vol_7=delta.rolling(7).std(),
    )
    return extins.dropna()

# -------------------------
# 5. Split train/test
# -------------------------
def imparte_train_test(X, y, test_size=0.2, shuffle=False):
    """Split ``X`` and ``y`` into train and test parts.

    Delegates to ``train_test_split``. With the default ``shuffle=False`` the
    rows keep their original order, so the test part is the tail of the data.

    Returns whatever ``train_test_split`` returns for two inputs:
    X_train, X_test, y_train, y_test.
    """
    parti = train_test_split(X, y, shuffle=shuffle, test_size=test_size)
    return parti

# -------------------------
# 6. Evaluare modele clasice
# -------------------------
def evalueaza_modele_clasice(splituri):
    """Fit a fixed panel of regressors on every split and score them.

    Parameters
    ----------
    splituri : dict
        Maps a dataset name to a dict with the keys 'X_train', 'X_test',
        'y_train' and 'y_test'.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, model) with the columns 'Set', 'Model', 'R2'
        and 'MSE', both metrics computed on the test part. The same lines are
        also printed while the evaluation runs.
    """
    modele = {
        'LinearRegression': LinearRegression(),
        'Lasso': Lasso(alpha=0.01),
        'RandomForest': RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE),
        'XGBoost': XGBRegressor(n_estimators=200, random_state=RANDOM_STATE, eval_metric='rmse'),
        'SVR': SVR(),
        'KNN': KNeighborsRegressor(n_neighbors=5)
    }
    chei_split = ('X_train', 'X_test', 'y_train', 'y_test')

    randuri = []
    for nume_set, split in splituri.items():
        X_antr, X_eval, y_antr, y_eval = (split[cheie] for cheie in chei_split)
        print(f"\n=== Set de date: {nume_set} ===")
        for nume, model in modele.items():
            # The same estimator object is re-fitted for every dataset.
            model.fit(X_antr, y_antr)
            estimat = model.predict(X_eval)
            r2 = r2_score(y_eval, estimat)
            mse = mean_squared_error(y_eval, estimat)
            randuri.append(dict(Set=nume_set, Model=nume, R2=r2, MSE=mse))
            print(f"{nume}: R2={r2:.4f}, MSE={mse:.4f}")

    return pd.DataFrame(randuri)

# -------------------------
# 7. Multi-target
# -------------------------
def analiza_multi_target(df, coloane_feat, coloane_target):
    """Fit one random forest per target on the last time-series fold and plot it.

    For every target the model is trained on the training rows of the last
    ``TimeSeriesSplit(n_splits=5)`` fold and evaluated on its test rows. Three
    panels are drawn per target: real vs predicted series, a scatter plot with
    the Pearson correlation, and the cumulative sums of both series.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both feature and target columns; its index is used as
        the x axis of the time plots.
    coloane_feat : list
        Feature column labels.
    coloane_target : list
        Target column labels; one model and one row of plots per entry.

    Returns
    -------
    pandas.DataFrame
        One row per target with 'Target', 'Pearson', 'Pearson_cumsum' and
        'Acc_sign'.
    """
    # The split and the feature matrix do not depend on the target, so they
    # are computed once instead of once per target.
    tscv = TimeSeriesSplit(n_splits=5)
    X = df[coloane_feat].values
    train_idx, test_idx = list(tscv.split(X))[-1]

    # Fit the feature scaler on the training rows only so that test-period
    # minima/maxima do not leak into the preprocessing.
    scalerX = MinMaxScaler()
    X_train = scalerX.fit_transform(X[train_idx])
    X_test = scalerX.transform(X[test_idx])
    date_test = df.index[test_idx]

    rezumat = []
    # squeeze=False always yields a 2-D array of axes, also for a single target.
    _, axes = plt.subplots(len(coloane_target), 3, figsize=(18, 4*len(coloane_target)), squeeze=False)

    for i, target in enumerate(coloane_target):
        y = df[target].values
        y_test = y[test_idx]

        # Target scaler: again fitted on the training rows only.
        scalerY = MinMaxScaler()
        y_train_s = scalerY.fit_transform(y[train_idx].reshape(-1,1)).ravel()

        model = RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE)
        model.fit(X_train, y_train_s)
        y_pred_s = model.predict(X_test)
        y_pred = scalerY.inverse_transform(y_pred_s.reshape(-1,1)).ravel()

        # Line plot: real vs predicted over the test period.
        ax_line = axes[i,0]
        ax_line.plot(date_test, y_test, label='Real')
        ax_line.plot(date_test, y_pred, label='Pred', linestyle='--')
        ax_line.set_title(f"{target} - Serie temporală")
        ax_line.legend(); ax_line.grid(True)

        # Scatter plot with the identity line and the Pearson correlation.
        ax_sc = axes[i,1]
        ax_sc.scatter(y_test, y_pred, alpha=0.6)
        mmin, mmax = min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())
        ax_sc.plot([mmin,mmax],[mmin,mmax],'r--')
        r = np.corrcoef(y_test, y_pred)[0,1]
        ax_sc.set_title(f"{target} - Scatter (Pearson={r:.3f})")

        # Cumulative sums of the real and the predicted series.
        ax_cs = axes[i,2]
        c_r = np.cumsum(y_test)
        c_p = np.cumsum(y_pred)
        ax_cs.plot(date_test, c_r, label='Cumsum Real')
        ax_cs.plot(date_test, c_p, label='Cumsum Pred', linestyle='--')
        r_cs = np.corrcoef(c_r, c_p)[0,1]
        # Share of test points (first one excluded) whose predicted sign
        # matches the real sign.
        # NOTE(review): the reason for skipping the first point is not visible
        # here; confirm it is intended.
        acc_sign = np.mean(np.sign(y_test[1:]) == np.sign(y_pred[1:]))
        ax_cs.set_title(f"{target} - Cumsum (Pearson={r_cs:.3f}, Acc_sign={acc_sign:.3f})")
        ax_cs.legend(); ax_cs.grid(True)

        rezumat.append({'Target': target, 'Pearson': r, 'Pearson_cumsum': r_cs, 'Acc_sign': acc_sign})

    plt.tight_layout()
    return pd.DataFrame(rezumat)

# -------------------------
# 8. LSTM univariabil
# -------------------------
def creeaza_secvente_univariabile(serie, lookback=30):
    """Turn a 1-D series into supervised (window, next value) pairs.

    Window ``k`` is ``serie[k:k+lookback]`` and its label is
    ``serie[k+lookback]``. Returns ``(X, y)`` as NumPy arrays; both are empty
    when the series is not longer than ``lookback``.
    """
    nr_ferestre = len(serie) - lookback
    ferestre = [serie[k:k + lookback] for k in range(nr_ferestre)]
    etichete = [serie[k + lookback] for k in range(nr_ferestre)]
    return np.array(ferestre), np.array(etichete)

def antreneaza_lstm_univariabil(serie, lookback=30, epochs=20, batch_size=32, verbose=0):
    """Train a single-layer LSTM to predict the next value of a 1-D series.

    The series is cut into sliding windows of ``lookback`` values; the first
    80% of the windows are used for training and the rest for evaluation.

    Parameters
    ----------
    serie : array-like
        One-dimensional numeric series in chronological order.
    lookback : int
        Number of past values in each input window.
    epochs, batch_size, verbose
        Passed to ``model.fit``.

    Returns
    -------
    dict
        'y_test' and 'y_pred' (both on the original scale), 'RMSE', 'R2' and
        'Acc_dir' (share of steps where the predicted change has the same
        sign as the real change).

    Raises
    ------
    ValueError
        If the series is too short to yield at least one training window and
        one test window.
    """
    valori = np.asarray(serie, dtype=float).reshape(-1, 1)

    nr_secvente = len(valori) - lookback
    split = int(nr_secvente * 0.8)
    if split < 1:
        raise ValueError(
            f"Seria are {len(valori)} valori; sunt necesare cel puțin {lookback + 2} pentru lookback={lookback}"
        )

    # Fit the scaler only on the values that the training windows and their
    # labels cover, so that test-period extremes do not leak into the scaling.
    scaler = MinMaxScaler()
    scaler.fit(valori[:split + lookback])
    s = scaler.transform(valori)

    X, y = creeaza_secvente_univariabile(s.flatten(), lookback)
    X = X.reshape((X.shape[0], lookback, 1))

    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    model = Sequential([LSTM(50, input_shape=(lookback,1)), Dense(1)])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

    # Report the metrics on the original scale of the series.
    y_pred = model.predict(X_test).flatten()
    y_test_res = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()
    y_pred_res = scaler.inverse_transform(y_pred.reshape(-1,1)).flatten()

    rmse = np.sqrt(mean_squared_error(y_test_res, y_pred_res))
    r2 = r2_score(y_test_res, y_pred_res)
    acc_dir = np.mean(np.sign(np.diff(y_test_res)) == np.sign(np.diff(y_pred_res)))
    return {'y_test': y_test_res, 'y_pred': y_pred_res, 'RMSE': rmse, 'R2': r2, 'Acc_dir': acc_dir}

# -------------------------
# 9. LSTM multivariabil multi-horizon
# -------------------------
def antreneaza_lstm_multi_orizont(df, caracteristici, coloana_target='BTC_Close', lookback=30, epochs=30, batch_size=16, horizons=(1,5,7)):
    """Train one multivariate LSTM per forecast horizon.

    For each horizon ``h`` the input is a window of ``lookback`` consecutive
    rows of the feature columns and the label is the target value ``h`` rows
    after the window's last row. The first 80% of the windows are used for
    training and the remainder for evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order; must contain ``caracteristici`` and
        ``coloana_target``.
    caracteristici : list
        Feature column labels.
    coloana_target : str
        Target column label.
    lookback : int
        Window length in rows.
    epochs, batch_size
        Passed to ``model.fit``.
    horizons : iterable of int
        Forecast horizons, in rows.

    Returns
    -------
    dict
        Maps each horizon to a dict with 'y_real', 'y_pred' (original scale),
        'RMSE', 'Acc_dir' and 'Pearson'. The metrics are also printed.

    Raises
    ------
    ValueError
        If there are too few rows to build at least one training window and
        one test window for a horizon.
    """
    X_brut = df[caracteristici].to_numpy(dtype=float)
    Y_brut = df[[coloana_target]].to_numpy(dtype=float)

    def creeaza_seq_multi(X,Y,lb,h):
        # Window i covers rows [i, i+lb); its label is h rows after the
        # window's last row.
        Xs, Ys = [], []
        for i in range(len(X)-lb-h+1):
            Xs.append(X[i:i+lb])
            Ys.append(Y[i+lb+h-1])
        return np.array(Xs), np.array(Ys)

    rezultate = {}
    for h in horizons:
        nr_secvente = len(X_brut) - lookback - h + 1
        split = int(nr_secvente * 0.8)
        if split < 1:
            raise ValueError(
                f"Prea puține rânduri ({len(X_brut)}) pentru lookback={lookback} și orizont={h}"
            )

        # Rows up to and including the last training label. The scalers are
        # fitted on this span only, so test-period extremes do not leak into
        # the preprocessing.
        nr_randuri_train = split + lookback + h - 1
        scalerX = MinMaxScaler().fit(X_brut[:nr_randuri_train])
        scalerY = MinMaxScaler().fit(Y_brut[:nr_randuri_train])

        Xs, Ys = creeaza_seq_multi(scalerX.transform(X_brut), scalerY.transform(Y_brut), lookback, h)
        X_train, X_test = Xs[:split], Xs[split:]
        y_train, y_test = Ys[:split], Ys[split:]

        model = Sequential([LSTM(64, input_shape=(lookback, X_train.shape[2])), Dropout(0.2), Dense(1)])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Metrics are computed on the original scale of the target.
        y_pred_s = model.predict(X_test).flatten()
        y_pred = scalerY.inverse_transform(y_pred_s.reshape(-1,1)).flatten()
        y_real = scalerY.inverse_transform(y_test.reshape(-1,1)).flatten()

        rmse = np.sqrt(mean_squared_error(y_real, y_pred))
        acc_dir = np.mean(np.sign(np.diff(y_real)) == np.sign(np.diff(y_pred)))
        corr = np.corrcoef(y_real, y_pred)[0,1]

        rezultate[h] = {'y_real': y_real, 'y_pred': y_pred, 'RMSE': rmse, 'Acc_dir': acc_dir, 'Pearson': corr}
        print(f"Orizont {h} zile: RMSE={rmse:.3f}, Acc_dir={acc_dir:.3f}, Pearson={corr:.3f}")

    return rezultate

# -------------------------
# 10. Plot multi-scenariu
# -------------------------
def plot_multi_scenariu_continuu(dates, preturi, predictii_dict):
    """Plot recent real prices followed by one prediction curve per horizon.

    The last (up to) 100 real prices are drawn in black. Each prediction curve
    starts at the last real price and continues to the right of it, one step
    per predicted value. The x axis is in time steps (positions within
    ``preturi``), not calendar dates.

    Parameters
    ----------
    dates : array-like
        Accepted for interface compatibility; not used by the plot.
    preturi : array-like
        Real prices in chronological order; must not be empty.
    predictii_dict : dict
        Maps a horizon to a dict that has a 'y_pred' array.

    Raises
    ------
    ValueError
        If ``preturi`` is empty.
    """
    preturi_plat = np.ravel(preturi)
    if preturi_plat.size == 0:
        raise ValueError('preturi nu poate fi gol')

    nr_total = preturi_plat.size
    istoric = preturi_plat[-100:]
    # Draw the history at its true positions so that the prediction curves,
    # which start at position nr_total - 1, attach to its last point even
    # when more than 100 prices are passed in.
    x_istoric = range(nr_total - len(istoric), nr_total)

    plt.figure(figsize=(14,6))
    plt.plot(x_istoric, istoric, label='BTC real', color='black', alpha=0.5)

    # The anchor point is the same for every horizon.
    last_val = float(preturi_plat[-1])
    start_idx = nr_total - 1

    culori = {1:'blue',5:'green',7:'red'}
    for h,v in predictii_dict.items():
        y_pred = np.ravel(v['y_pred'])
        y_plot = np.concatenate(([last_val], y_pred))
        x_plot = range(start_idx, start_idx+len(y_plot))
        plt.plot(x_plot, y_plot, label=f'Predicție +{h} zile', color=culori.get(h,'C0'))

    plt.title("Predicții BTC multi-scenariu (continuu)")
    plt.xlabel("Pași de timp")
    plt.ylabel("BTC Close (USD)")
    plt.legend()
    plt.show()


In [None]:
# -------------------------
# PIPELINE PRINCIPAL
# -------------------------
def main(save_csv=SALVEAZA_CSV):
    """Run the full pipeline: download, clean, model, plot and save.

    Steps: download BTC and macro prices, clean them, flag IQR outliers, add
    BTC features, evaluate the classic regressors on a 'Complet' and a
    'Curat' (outlier-free) dataset, run the multi-target analysis, train the
    univariate and the multi-horizon LSTM models, plot the multi-horizon
    predictions and write the result tables.

    Parameters
    ----------
    save_csv : bool
        When True, write the cleaned data to 'btc_merged_clean.csv' and the
        result tables to the 'outputs' folder.
    """
    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # pipeline also runs as a plain script.
    try:
        afiseaza = display
    except NameError:
        afiseaza = print

    # 1. Download data
    df_raw = descarca_btc_si_macro(ANI)

    # 2. Preprocessing
    df = preproceseaza(df_raw)

    # Optional save of the cleaned table
    if save_csv:
        df.to_csv('btc_merged_clean.csv')
        print('Salvat btc_merged_clean.csv')

    # 3. Outlier removal
    df_curat, masc_aberante = elimina_outlieri_iqr(df)
    print(f"Rânduri inițiale: {len(df)}, după eliminarea valorilor aberante: {len(df_curat)}")

    # 4. BTC features, computed on the original (unscaled) prices
    df_feat = adauga_caracteristici_btc(df)

    # 5. Standardised table for the classic models
    # NOTE(review): the scaler is fitted on all rows, i.e. before the
    # train/test split below; consider fitting it on the training rows only.
    coloane_numerice = df_feat.select_dtypes(include=[np.number]).columns.tolist()
    df_scaled = df_feat.copy()
    scaler = StandardScaler()
    df_scaled[coloane_numerice] = scaler.fit_transform(df_feat[coloane_numerice])

    # Datasets: all rows vs rows not flagged as outliers. The mask was built
    # on `df`, which has more rows than `df_scaled`, so align it explicitly.
    masc_feat = masc_aberante.reindex(df_scaled.index, fill_value=False)
    seturi = {'Complet': df_scaled.copy(), 'Curat': df_scaled.loc[~masc_feat].copy()}

    # 6. Feature/target split and train/test split for every dataset
    coloana_target = 'Bitcoin'
    splituri = {}
    imputer = SimpleImputer(strategy='mean')

    for nume, d in seturi.items():
        coloane_feat = [c for c in d.columns if c != coloana_target]
        X = d[coloane_feat]
        y = d[coloana_target]
        X_imp = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
        # Chronological split: a shuffled split would put rows from the
        # future into the training set of a time series.
        X_train, X_test, y_train, y_test = imparte_train_test(X_imp, y, test_size=0.2)
        splituri[nume] = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}

    # 7. Classic models
    rezultate_df = evalueaza_modele_clasice(splituri)
    print('\nRezumat modele clasice:')
    afiseaza(rezultate_df.sort_values(['Set','R2'], ascending=[True, False]))

    # 8. Multi-target analysis on the unscaled feature table
    coloane_target = ['BTC_lag1','BTC_lag2','BTC_ma7','BTC_ma14','BTC_diff1','BTC_pct_change']
    coloane_feat = [c for c in df_feat.columns if c not in coloane_target]

    rez_multi_target = analiza_multi_target(df_feat, coloane_feat, coloane_target)
    print('\nRezumat multi-target:')
    afiseaza(rez_multi_target)

    # 9. Univariate LSTM baseline (example on BTC_diff1)
    print('\nBaseline LSTM univariabil pe BTC_diff1 (exemplu)')
    if 'BTC_diff1' in df_feat.columns:
        res_uni = antreneaza_lstm_univariabil(df_feat['BTC_diff1'].values, lookback=30, epochs=20)
        print('Rezultate LSTM univariabil:', res_uni['RMSE'], res_uni['R2'], res_uni['Acc_dir'])

    # 10. Multivariate multi-horizon LSTM on one year of BTC prices
    print('\nPregătire date pentru LSTM multi-orizont (1 an)')
    sfarsit = datetime.today()
    start = sfarsit - timedelta(days=365)
    btc_raw = yf.download('BTC-USD', start=start, end=sfarsit, interval='1d', progress=False)
    # Depending on the yfinance version a single-ticker download may carry
    # two-level (field, ticker) columns; keep only the field level so that
    # 'Close' is an ordinary column.
    if isinstance(btc_raw.columns, pd.MultiIndex):
        btc_raw.columns = btc_raw.columns.get_level_values(0)
    btc_df = btc_raw[['Close']].rename(columns={'Close':'BTC_Close'})
    btc_df = btc_df.reset_index()
    btc_df['BTC_diff1'] = btc_df['BTC_Close'].diff()
    btc_df['BTC_pct_change'] = btc_df['BTC_Close'].pct_change()
    btc_df['BTC_ma7'] = btc_df['BTC_Close'].rolling(7).mean()
    btc_df['BTC_ma14'] = btc_df['BTC_Close'].rolling(14).mean()
    btc_df['BTC_vol'] = btc_df['BTC_diff1'].rolling(3).std()
    btc_df = btc_df.dropna().reset_index(drop=True)

    caracteristici = ['BTC_Close','BTC_ma7','BTC_ma14','BTC_diff1','BTC_pct_change','BTC_vol']
    rez_lstm_multi = antreneaza_lstm_multi_orizont(
        btc_df[caracteristici],
        caracteristici,
        coloana_target='BTC_Close',
        lookback=30,
        epochs=30,
        horizons=[1,5,7]
    )

    # 11. Plot the predictions anchored on the last real value
    history_dates = btc_df['Date'].values[-100:]
    history_prices = btc_df['BTC_Close'].values[-100:]
    preds_plot = {h:{'y_pred':v['y_pred']} for h,v in rez_lstm_multi.items()}
    plot_multi_scenariu_continuu(history_dates, history_prices, preds_plot)

    # 12. Save results
    asigura_folder('outputs')
    if save_csv:
        rezultate_df.to_csv('outputs/rezultate_modele_clasice.csv', index=False)
        rez_multi_target.to_csv('outputs/rezumat_multi_target.csv', index=False)
        # Keep only the scalar metrics of every horizon; the prediction
        # arrays are not written.
        metrici = ['RMSE','Acc_dir','Pearson']
        randuri_lstm = [
            {'orizont': h, **{k: v for k, v in rez.items() if k in metrici}}
            for h, rez in rez_lstm_multi.items()
        ]
        pd.DataFrame(randuri_lstm).to_csv('outputs/lstm_multi_orizont.csv', index=False)
        print('Rezultatele au fost salvate în folderul outputs/')

    print('\nPipeline finalizat.')

# Run the pipeline only when the file is executed directly, not when imported.
if __name__ == '__main__':
    main()