### Importazione librerie

In [215]:
import ccxt
import pandas as pd
from datetime import datetime
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Per modelli di machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Esempio con un modello di reti neurali (Keras)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

import lightgbm as lgb

### Scaricamento dati

In [216]:
def fetch_crypto_data(market, timeframe, symbols, start_date, end_date):
    # Configura l'exchange
    try:
        exchange = getattr(ccxt, market)()
        exchange.load_markets()
        print(f"[INFO] Collegato all'exchange {market}.")
    except Exception as e:
        print(f"[ERROR] Errore nella connessione all'exchange {market}: {e}")
        return

    # Trasforma le date in timestamp
    try:
        start_timestamp = int(pd.Timestamp(start_date).timestamp() * 1000)
        end_timestamp = int(pd.Timestamp(end_date).timestamp() * 1000)
        print(f"[INFO] Periodo scelto: da {start_date} a {end_date}.")
    except Exception as e:
        print(f"[ERROR] Errore nella conversione delle date: {e}")
        return

    # Lista per i dati scaricati
    all_data = {}

    for symbol in symbols:
        pair = f"{symbol}/USDT"
        if pair not in exchange.symbols:
            print(f"[WARNING] {pair} non è disponibile su {market}.")
            continue

        print(f"[INFO] Inizio il download dei dati per {pair} con timeframe {timeframe}.")

        ohlcv = []
        since = start_timestamp

        # Scarica i dati in blocchi
        while since < end_timestamp:
            try:
                batch = exchange.fetch_ohlcv(pair, timeframe, since)
                if not batch:
                    print(f"[INFO] Nessun dato aggiuntivo trovato per {pair}.")
                    break
                ohlcv.extend(batch)
                since = batch[-1][0] + 1  # Avanza al prossimo blocco
                print(f"[INFO] Scaricati {len(batch)} record per {pair} (fino a {datetime.utcfromtimestamp(batch[-1][0] / 1000)}).")
                time.sleep(exchange.rateLimit / 1000)  # Rispetta il rate limit
            except Exception as e:
                print(f"[ERROR] Errore durante il download dei dati per {pair}: {e}")
                break

        if not ohlcv:
            print(f"[WARNING] Nessun dato trovato per {pair}.")
            continue

        # Crea un DataFrame dai dati
        df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        df.set_index('timestamp', inplace=True)
        all_data[pair] = df
        print(f"[INFO] Dati per {pair} scaricati con successo.")

    print("[INFO] Download completato.")
    return all_data

In [217]:
# Esempio di utilizzo
market_name = "bybit"  # Exchange
timeframe_download = '1h'  # Timeframe (ad esempio: '1m', '1h', '1d')
symbols_list = ['BTC']  # Lista delle criptovalute
start_date = '2022-01-01'  # Data di inizio
end_date = '2024-12-27'  # Data di fine

crypto_data = fetch_crypto_data(market_name, timeframe_download, symbols_list, start_date, end_date)
crypto_data

[INFO] Collegato all'exchange bybit.
[INFO] Periodo scelto: da 2022-01-01 a 2024-12-27.
[INFO] Inizio il download dei dati per BTC/USDT con timeframe 1h.
[INFO] Scaricati 200 record per BTC/USDT (fino a 2022-01-09 07:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-01-17 14:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-01-25 21:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-02-03 04:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-02-11 11:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-02-19 18:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-02-28 01:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-03-08 08:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-03-16 15:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-03-24 22:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-04-02 05:00:00).
[INFO] Scaricati 199 record per BTC/USDT (fino a 2022-04-10 12:00

{'BTC/USDT':                          open      high       low     close      volume
 timestamp                                                              
 2022-01-01 00:00:00  46198.56  46739.64  46198.56  46656.12   79.096779
 2022-01-01 01:00:00  46656.12  46939.20  46574.66  46779.46   75.932256
 2022-01-01 02:00:00  46779.46  46944.60  46732.37  46805.99   51.650896
 2022-01-01 03:00:00  46805.99  46893.85  46759.27  46814.07   37.269293
 2022-01-01 04:00:00  46814.07  46872.58  46629.43  46708.83   46.730933
 ...                       ...       ...       ...       ...         ...
 2024-12-29 09:00:00  95123.09  95213.38  95041.00  95177.69  224.981250
 2024-12-29 10:00:00  95177.69  95282.80  95090.57  95155.99  350.332240
 2024-12-29 11:00:00  95155.99  95179.77  95010.06  95027.14  201.290616
 2024-12-29 12:00:00  95027.14  95141.26  94893.26  94905.47  517.712562
 2024-12-29 13:00:00  94905.47  95012.89  94827.93  95002.97  291.787333
 
 [26246 rows x 5 columns]}

In [218]:
dati = pd.DataFrame({'Timestamp': crypto_data[symbols_list[0] + '/USDT'].index})
for crypto in symbols_list:
    dati['Price'] = crypto_data[crypto + '/USDT']['close'].values

# Filtra il dataframe per includere solo le date fino a end_date
dati = dati[dati['Timestamp'] <= pd.Timestamp(end_date)]
    
dati

Unnamed: 0,Timestamp,Price
0,2022-01-01 00:00:00,46656.12
1,2022-01-01 01:00:00,46779.46
2,2022-01-01 02:00:00,46805.99
3,2022-01-01 03:00:00,46814.07
4,2022-01-01 04:00:00,46708.83
...,...,...
26180,2024-12-26 20:00:00,95591.40
26181,2024-12-26 21:00:00,95812.04
26182,2024-12-26 22:00:00,95812.73
26183,2024-12-26 23:00:00,95783.86


## Inizio lavori

### Copia del dataframe e conversione timestamp

In [219]:
df = dati.copy()

# Assicuriamoci che il Timestamp sia in datetime (se non lo è già)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce') 
# Se il tuo Timestamp non è in secondi, regola l'argomento 'unit' o ometti 'unit'

# Ordiniamo il dataframe in base al tempo
df.sort_values('Timestamp', inplace=True)
df.reset_index(drop=True, inplace=True)

### Creazione del target (label massimi/minimi)

In [220]:
def find_local_extrema(prices):
    """
    Dato un array di prezzi, ritorna un array di stringhe:
      'max' per massimo locale
      'min' per minimo locale
      '-'   altrimenti
    """
    labels = np.array([None]*len(prices), dtype=object)
    
    # Per evitare di sforare agli estremi, partiamo da t=1 e arriviamo fino a len(prices) - 1
    for t in range(1, len(prices) - 1):
        if prices[t] > prices[t - 1] and prices[t] > prices[t + 1]:
            labels[t] = 'max'
        elif prices[t] < prices[t - 1] and prices[t] < prices[t + 1]:
            labels[t] = 'min'
        else:
            labels[t] = '-'
    
    return labels

df['target'] = find_local_extrema(df['Price'].values)
df

Unnamed: 0,Timestamp,Price,target
0,2022-01-01 00:00:00,46656.12,
1,2022-01-01 01:00:00,46779.46,-
2,2022-01-01 02:00:00,46805.99,-
3,2022-01-01 03:00:00,46814.07,max
4,2022-01-01 04:00:00,46708.83,min
...,...,...,...
26180,2024-12-26 20:00:00,95591.40,min
26181,2024-12-26 21:00:00,95812.04,-
26182,2024-12-26 22:00:00,95812.73,max
26183,2024-12-26 23:00:00,95783.86,min


### Feature engineering

In [221]:
# A titolo di esempio, calcoliamo:
# - 1ª, 2ª, 3ª derivata
# - medie mobili
# - deviazioni standard mobili
# - variazioni percentuali
# Nota: potresti aggiungere molte altre features.

# a) 1ª derivata (differenza)
df['diff1_price'] = df['Price'].diff()

# b) 2ª derivata
df['diff2_price'] = df['diff1_price'].diff()

# c) 3ª derivata
df['diff3_price'] = df['diff2_price'].diff()

# d) medie mobili (SMA) a diverse finestre
window_sizes =  [3, 6, 10, 12, 20, 24]
for w in window_sizes:
    col_name = f'sma_price_{w}'
    df[col_name] = df['Price'].rolling(window=w).mean()

# e) deviazione standard mobile
for w in window_sizes:
    col_name = f'std_price_{w}'
    df[col_name] = df['Price'].rolling(window=w).std()

# f) variazione percentuale (Price e altre derivate)
df['pct_price'] = df['Price'].pct_change()
# df['pct_diff1_price'] = df['diff1_price'].pct_change()
# df['pct_diff2_price'] = df['diff2_price'].pct_change()
# df['pct_diff3_price'] = df['diff3_price'].pct_change()

# g) medie mobili delle variazioni percentuali
for w in window_sizes:
    col_name = f'sma_pct_price_{w}'
    df[col_name] = df['pct_price'].rolling(window=w).mean()

# h) minimi e massimi mobili di varie colonne
min_max_windows = [3, 6, 10, 12, 20, 24]
for w in min_max_windows:
    col_name_min = f'min_price_{w}'
    col_name_max = f'max_price_{w}'
    df[col_name_min] = df['Price'].rolling(window=w).min()
    df[col_name_max] = df['Price'].rolling(window=w).max()

# i) tempo trascorso dal precedente min e dal precedente max
time_since_last_min = []
time_since_last_max = []

last_min_idx = None
last_max_idx = None

for i in range(len(df)):
    # Calcola tempo dall'ultimo min (se esiste)
    if last_min_idx is None:
        time_since_last_min.append(None)
    else:
        time_since_last_min.append(i - last_min_idx)
    
    # Calcola tempo dall'ultimo max (se esiste)
    if last_max_idx is None:
        time_since_last_max.append(None)
    else:
        time_since_last_max.append(i - last_max_idx)
    
    # Dopo aver calcolato il tempo, aggiorna last_min_idx / last_max_idx
    # solo se oggi è un min/max
    if df.loc[i, 'target'] == 'min':
        last_min_idx = i
    elif df.loc[i, 'target'] == 'max':
        last_max_idx = i

df['time_since_last_min'] = time_since_last_min
df['time_since_last_max'] = time_since_last_max


# j) rapporto tra tutte le possibili coppie di colonne (escluso Timestamp),
#    evitando i reciproci (es. Price/diff1_price sì, diff1_price/Price no).

# 1) Identifichiamo le colonne numeric da combinare (escludendo 'Timestamp' se c'è)
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = [col for col in numeric_cols if col != 'Timestamp']

# 2) Creiamo i rapporti col_i / col_j per i < j (per evitare i reciproci)
for i in range(len(numeric_cols)):
    for j in range(i + 1, len(numeric_cols)):
        col_i = numeric_cols[i]
        col_j = numeric_cols[j]
        
        new_col_name = f'{col_i}/{col_j}'
        
        # Calcolo del rapporto, gestendo eventuali divisioni per zero
        # Genererà inf o -inf se col_j è zero, che poi potrai sostituire con NaN
        df[new_col_name] = df[col_i] / df[col_j]

# Nota: puoi davvero sbizzarrirti e creare combinazioni non lineari, rapporti tra colonne, ecc.

df

  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_col_name] = df[col_i] / df[col_j]
  df[new_co

Unnamed: 0,Timestamp,Price,target,diff1_price,diff2_price,diff3_price,sma_price_3,sma_price_6,sma_price_10,sma_price_12,...,max_price_20/min_price_24,max_price_20/max_price_24,max_price_20/time_since_last_min,max_price_20/time_since_last_max,min_price_24/max_price_24,min_price_24/time_since_last_min,min_price_24/time_since_last_max,max_price_24/time_since_last_min,max_price_24/time_since_last_max,time_since_last_min/time_since_last_max
0,2022-01-01 00:00:00,46656.12,,,,,,,,,...,,,,,,,,,,
1,2022-01-01 01:00:00,46779.46,-,123.34,,,,,,,...,,,,,,,,,,
2,2022-01-01 02:00:00,46805.99,-,26.53,-96.81,,46747.190000,,,,...,,,,,,,,,,
3,2022-01-01 03:00:00,46814.07,max,8.08,-18.45,78.36,46799.840000,,,,...,,,,,,,,,,
4,2022-01-01 04:00:00,46708.83,min,-105.24,-113.32,-94.87,46776.296667,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26180,2024-12-26 20:00:00,95591.40,min,-509.07,-871.64,-1311.37,95809.923333,95966.443333,95833.378,95773.471667,...,1.037948,0.996187,49530.845000,99061.690000,0.959765,47719.955000,95439.910000,49720.435000,99440.870000,2.000000
26181,2024-12-26 21:00:00,95812.04,-,220.64,729.71,1601.35,95834.636667,95929.046667,95843.719,95804.482500,...,1.037205,0.995474,98990.770000,49495.385000,0.959765,95439.910000,47719.955000,99440.870000,49720.435000,0.500000
26182,2024-12-26 22:00:00,95812.73,max,0.69,-219.95,-949.66,95738.723333,95811.600000,95856.995,95829.879167,...,1.037205,0.995474,49495.385000,32996.923333,0.959765,47719.955000,31813.303333,49720.435000,33146.956667,0.666667
26183,2024-12-26 23:00:00,95783.86,min,-28.87,-29.56,190.39,95802.876667,95806.400000,95866.291,95836.148333,...,1.030753,0.993068,32791.663333,98374.990000,0.963439,31813.303333,95439.910000,33020.563333,99061.690000,3.000000


### Pulizia dei dati e rimozione NaN

In [222]:
# Tutte le rolling o diff generano valori NaN iniziali.
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Timestamp,Price,target,diff1_price,diff2_price,diff3_price,sma_price_3,sma_price_6,sma_price_10,sma_price_12,...,max_price_20/min_price_24,max_price_20/max_price_24,max_price_20/time_since_last_min,max_price_20/time_since_last_max,min_price_24/max_price_24,min_price_24/time_since_last_min,min_price_24/time_since_last_max,max_price_24/time_since_last_min,max_price_24/time_since_last_max,time_since_last_min/time_since_last_max
0,2022-01-02 00:00:00,47641.02,-,-88.79,-386.99,-580.37,47600.813333,47479.045000,47469.330,47387.733333,...,1.022294,1.000000,15916.716667,47750.150000,0.978192,15569.610000,46708.830000,15916.716667,47750.150000,3.000000
1,2022-01-02 01:00:00,47374.11,-,-266.91,-178.12,208.87,47581.646667,47485.863333,47484.952,47416.159167,...,1.022294,1.000000,11937.537500,23875.075000,0.978192,11677.207500,23354.415000,11937.537500,23875.075000,2.000000
2,2022-01-02 02:00:00,47372.18,-,-1.93,264.98,443.10,47462.436667,47479.253333,47494.149,47453.299167,...,1.022294,1.000000,9550.030000,15916.716667,0.978192,9341.766000,15569.610000,9550.030000,15916.716667,1.666667
3,2022-01-02 03:00:00,47323.52,-,-48.66,-46.73,-311.71,47356.603333,47478.708333,47451.486,47462.101667,...,1.022294,1.000000,7958.358333,11937.537500,0.978192,7784.805000,11677.207500,7958.358333,11937.537500,1.500000
4,2022-01-02 04:00:00,46887.59,min,-435.93,-387.27,-340.54,47194.430000,47388.038333,47383.167,47429.383333,...,1.021252,1.000000,6821.450000,9550.030000,0.979190,6679.494286,9351.292000,6821.450000,9550.030000,1.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26148,2024-12-26 19:00:00,96100.47,max,362.57,439.73,-185.46,95884.476667,95943.813333,95825.035,95796.720833,...,1.037948,0.996187,99061.690000,33020.563333,0.959765,95439.910000,31813.303333,99440.870000,33146.956667,0.333333
26149,2024-12-26 20:00:00,95591.40,min,-509.07,-871.64,-1311.37,95809.923333,95966.443333,95833.378,95773.471667,...,1.037948,0.996187,49530.845000,99061.690000,0.959765,47719.955000,95439.910000,49720.435000,99440.870000,2.000000
26150,2024-12-26 21:00:00,95812.04,-,220.64,729.71,1601.35,95834.636667,95929.046667,95843.719,95804.482500,...,1.037205,0.995474,98990.770000,49495.385000,0.959765,95439.910000,47719.955000,99440.870000,49720.435000,0.500000
26151,2024-12-26 22:00:00,95812.73,max,0.69,-219.95,-949.66,95738.723333,95811.600000,95856.995,95829.879167,...,1.037205,0.995474,49495.385000,32996.923333,0.959765,47719.955000,31813.303333,49720.435000,33146.956667,0.666667


In [223]:
# # Filtra solo le colonne numeriche
# numeric_cols = df.select_dtypes(include=[np.number])

# # Individua righe con np.inf o -np.inf nelle colonne numeriche
# mask = numeric_cols.applymap(np.isinf).any(axis=1)

# # Filtra il DataFrame originale
# result = df[mask]

# result

In [224]:
# # Filtra solo le colonne numeriche
# numeric_cols = df.select_dtypes(include=[np.number])

# # Crea una maschera booleana per individuare infiniti
# mask = numeric_cols.applymap(np.isinf)

# # Trova le colonne che contengono almeno un infinito
# columns_with_infs = mask.any(axis=0)  # Restituisce una Serie booleana

# # Filtra i nomi delle colonne con True
# columns_with_infs_list = columns_with_infs[columns_with_infs].index.tolist()

# print("Colonne con valori infiniti:", columns_with_infs_list)

In [225]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Timestamp,Price,target,diff1_price,diff2_price,diff3_price,sma_price_3,sma_price_6,sma_price_10,sma_price_12,...,max_price_20/min_price_24,max_price_20/max_price_24,max_price_20/time_since_last_min,max_price_20/time_since_last_max,min_price_24/max_price_24,min_price_24/time_since_last_min,min_price_24/time_since_last_max,max_price_24/time_since_last_min,max_price_24/time_since_last_max,time_since_last_min/time_since_last_max
0,2022-01-02 00:00:00,47641.02,-,-88.79,-386.99,-580.37,47600.813333,47479.045000,47469.330,47387.733333,...,1.022294,1.000000,15916.716667,47750.150000,0.978192,15569.610000,46708.830000,15916.716667,47750.150000,3.000000
1,2022-01-02 01:00:00,47374.11,-,-266.91,-178.12,208.87,47581.646667,47485.863333,47484.952,47416.159167,...,1.022294,1.000000,11937.537500,23875.075000,0.978192,11677.207500,23354.415000,11937.537500,23875.075000,2.000000
2,2022-01-02 02:00:00,47372.18,-,-1.93,264.98,443.10,47462.436667,47479.253333,47494.149,47453.299167,...,1.022294,1.000000,9550.030000,15916.716667,0.978192,9341.766000,15569.610000,9550.030000,15916.716667,1.666667
3,2022-01-02 03:00:00,47323.52,-,-48.66,-46.73,-311.71,47356.603333,47478.708333,47451.486,47462.101667,...,1.022294,1.000000,7958.358333,11937.537500,0.978192,7784.805000,11677.207500,7958.358333,11937.537500,1.500000
4,2022-01-02 04:00:00,46887.59,min,-435.93,-387.27,-340.54,47194.430000,47388.038333,47383.167,47429.383333,...,1.021252,1.000000,6821.450000,9550.030000,0.979190,6679.494286,9351.292000,6821.450000,9550.030000,1.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26146,2024-12-26 19:00:00,96100.47,max,362.57,439.73,-185.46,95884.476667,95943.813333,95825.035,95796.720833,...,1.037948,0.996187,99061.690000,33020.563333,0.959765,95439.910000,31813.303333,99440.870000,33146.956667,0.333333
26147,2024-12-26 20:00:00,95591.40,min,-509.07,-871.64,-1311.37,95809.923333,95966.443333,95833.378,95773.471667,...,1.037948,0.996187,49530.845000,99061.690000,0.959765,47719.955000,95439.910000,49720.435000,99440.870000,2.000000
26148,2024-12-26 21:00:00,95812.04,-,220.64,729.71,1601.35,95834.636667,95929.046667,95843.719,95804.482500,...,1.037205,0.995474,98990.770000,49495.385000,0.959765,95439.910000,47719.955000,99440.870000,49720.435000,0.500000
26149,2024-12-26 22:00:00,95812.73,max,0.69,-219.95,-949.66,95738.723333,95811.600000,95856.995,95829.879167,...,1.037205,0.995474,49495.385000,32996.923333,0.959765,47719.955000,31813.303333,49720.435000,33146.956667,0.666667


### Scelta di input e target

In [226]:
# Le feature saranno tutte le colonne create, tranne 'Timestamp' e 'target'
excluded_cols = ['Timestamp', 'target']
feature_cols = [c for c in df.columns if c not in excluded_cols]

X = df[feature_cols].values
y = df['target'].values

# Poiché è un problema a 3 classi (-1, 0, +1), convertiamo in classi 0,1,2
# Mappiamo: -1 -> 0, 0 -> 1, +1 -> 2
# (Oppure puoi tenerlo in formato -1,0,+1 e usare un approccio differente)
# mapping = { -1: 0, 0: 1, 1: 2 }
# y_mapped = np.array([mapping[label] for label in y])

### Suddivisione in train, validation e test

In [227]:
# Esempio: 70% train, 15% validation, 15% test
# Per una separazione temporale più rigorosa, potresti tagliare i dati a mano
# in base a index o a date, invece di usare train_test_split casuale.
train_size = 0.7
val_size = 0.15
test_size = 0.15

# Primo split train + remainder
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=(1 - train_size), shuffle=False  # shuffle=False per mantenere ordine temporale
)

# Split del remainder in validation e test
ratio = test_size / (val_size + test_size)  # frazione del remainder che andrà al test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=ratio, shuffle=False
)

print("Forma X_train:", X_train.shape)
print("Forma X_val:", X_val.shape)
print("Forma X_test:", X_test.shape)

Forma X_train: (18305, 703)
Forma X_val: (3923, 703)
Forma X_test: (3923, 703)


### Normalizzazione (StandardScaler)

In [228]:
scaler = StandardScaler()
scaler.fit(X_train)  # calcoliamo media e std sul train

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### Modello di classificazione

#### Reti neurali

In [229]:
# # Creiamo le dummies sulle stringhe 'min', '-', 'max'
# # Otteniamo 3 colonne (una per classe)
# y_train_dum = pd.get_dummies(y_train)
# y_val_dum   = pd.get_dummies(y_val)
# y_test_dum  = pd.get_dummies(y_test)

# print("Columns in y_train_dum:", y_train_dum.columns)
# print("Columns in y_val_dum  :", y_val_dum.columns)
# print("Columns in y_test_dum :", y_test_dum.columns)

# # Assicuriamoci che l'ordine delle colonne delle dummies sia identico nei 3 set
# # Di solito, se compaiono le stesse 3 categorie in train/val/test, l'ordine coincide. 
# # Se alcune categorie non compaiono in un subset, puoi fare:
# # all_cats = ['-', 'max', 'min']  # o l'ordine che preferisci
# # y_train_dum = y_train_dum.reindex(columns=all_cats, fill_value=0)
# # y_val_dum   = y_val_dum.reindex(columns=all_cats, fill_value=0)
# # y_test_dum  = y_test_dum.reindex(columns=all_cats, fill_value=0)

# # Da dataframe passiamo a numpy array
# y_train_oh = y_train_dum.values
# y_val_oh   = y_val_dum.values
# y_test_oh  = y_test_dum.values

In [230]:
# # Costruiamo la rete neurale (3 neuroni in output -> 3 classi)
# model = Sequential()
# model.add(Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]))
# # model.add(Dropout(0.3))
# model.add(Dense(32, activation='relu'))
# # model.add(Dropout(0.3))
# model.add(Dense(3, activation='softmax'))

# model.compile(
#     optimizer=Adam(learning_rate=0.001),
#     loss='categorical_crossentropy',
#     metrics=['accuracy']
# )

# early_stop = EarlyStopping(
#     monitor='val_loss',
#     patience=20,              # numero di epoch "tollerate" senza miglioramenti
#     restore_best_weights=True
# )

# history = model.fit(
#     X_train_scaled,
#     y_train_oh,
#     validation_data=(X_val_scaled, y_val_oh),
#     epochs=1000,          # un limite ampio per lasciare lavorare EarlyStopping
#     batch_size=32,
#     callbacks=[early_stop],
#     verbose=2
# )

#### LGBM

In [231]:
# =========================================================================
# 1) Mappiamo le stringhe ('min', '-', 'max') in valori numerici (0,1,2).
# =========================================================================
class_mapping = {'min': 0, '-': 1, 'max': 2}
y_train_fac = pd.Series(y_train).map(class_mapping).values
y_val_fac   = pd.Series(y_val).map(class_mapping).values
y_test_fac  = pd.Series(y_test).map(class_mapping).values

# Creiamo i dataset di LightGBM
train_data = lgb.Dataset(X_train_scaled, label=y_train_fac)
val_data   = lgb.Dataset(X_val_scaled, label=y_val_fac, reference=train_data)

# Parametri LightGBM per una multi-classe a 3 classi
params = {
    'learning_rate': 0.01,
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'verbosity': -1
}

# Numero massimo di boosting round
num_boost_round = 100000

# Usiamo le callback:
# 1) early_stopping con pazienza = 20
# 2) log_evaluation per stampare i progressi ogni X round
callbacks_list = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=50)
]

# Alleniamo il modello con lgb.train
gbm = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_boost_round,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    callbacks=callbacks_list
)

Training until validation scores don't improve for 1000 rounds
[50]	train's multi_logloss: 0.870795	valid's multi_logloss: 0.896146
[100]	train's multi_logloss: 0.770427	valid's multi_logloss: 0.820354
[150]	train's multi_logloss: 0.708851	valid's multi_logloss: 0.778192
[200]	train's multi_logloss: 0.667182	valid's multi_logloss: 0.754148
[250]	train's multi_logloss: 0.636242	valid's multi_logloss: 0.738475
[300]	train's multi_logloss: 0.611798	valid's multi_logloss: 0.728382
[350]	train's multi_logloss: 0.591435	valid's multi_logloss: 0.722338
[400]	train's multi_logloss: 0.57417	valid's multi_logloss: 0.71905
[450]	train's multi_logloss: 0.558803	valid's multi_logloss: 0.717079
[500]	train's multi_logloss: 0.544848	valid's multi_logloss: 0.716349
[550]	train's multi_logloss: 0.531971	valid's multi_logloss: 0.716195
[600]	train's multi_logloss: 0.520117	valid's multi_logloss: 0.716896
[650]	train's multi_logloss: 0.508786	valid's multi_logloss: 0.718362
[700]	train's multi_logloss: 0

### Valutazione del modello su Test

#### Reti neurali

In [232]:
# test_loss, test_acc = model.evaluate(X_test_scaled, y_test_oh, verbose=0)
# print(f"Test Accuracy: {test_acc:.4f}")

# y_pred_prob = model.predict(X_test_scaled)
# y_pred = y_pred_prob.argmax(axis=1)  # indice della classe predetta

# # Per costruire la ground truth in termini di indice
# # (Se y_test_dum ha colonne in ordine: ['-', 'max', 'min'], ad es.)
# # possiamo fare argmax su y_test_oh:
# y_true = y_test_oh.argmax(axis=1)

# print("\nClassification Report (Test):")
# print(classification_report(y_true, y_pred, digits=4))

# print("Confusion Matrix (Test):")
# cm = confusion_matrix(y_true, y_pred)
# cm

#### LGBM

In [233]:
# =========================================================================
# 5) Valutazione sul test
# =========================================================================
# Usiamo il modello fino al best_iteration calcolato in validazione
y_pred_prob = gbm.predict(X_test_scaled, num_iteration=gbm.best_iteration)

# y_pred_prob è un array di forma (n_test, 3) con le probabilità per ogni classe
# Ricaviamo la classe con argmax
y_pred = np.argmax(y_pred_prob, axis=1)

# =========================================================================
# 6) Metriche di valutazione
# =========================================================================
print("\nClassification Report (Test):")
print(classification_report(y_test_fac, y_pred, digits=4))

print("Confusion Matrix (Test):")
cm = confusion_matrix(y_test_fac, y_pred)
cm

# Per capire quali indici corrispondono a 'min', '-', 'max', ricordiamo che:
# 0 -> 'min', 1 -> '-', 2 -> 'max'


Classification Report (Test):
              precision    recall  f1-score   support

           0     0.5543    0.3853    0.4546      1046
           1     0.4735    0.5273    0.4990      1832
           2     0.5311    0.5876    0.5579      1045

    accuracy                         0.5055      3923
   macro avg     0.5197    0.5000    0.5038      3923
weighted avg     0.5104    0.5055    0.5028      3923

Confusion Matrix (Test):


array([[403, 643,   0],
       [324, 966, 542],
       [  0, 431, 614]])