In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
df = pd.read_parquet("../data/finance_ml_dataset_clean.parquet", engine = "fastparquet")
# On supprime la target en valeur
df_target = df.copy()
df_target = df_target.drop(columns=["target_returns_plus_1_days"])
# Vérification
df_target.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,headline_concat,reddit_concat,F_1,F_2,F_3,F_4,F_5,F_6,F_7,target_updown_plus_1_days
0,2008-08-08,11432.089844,11759.959961,11388.040039,11734.320312,11734.320312,212830000,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.'. b'Rus...,-0.3066,0.007364,0.0,0.040111,0.029154,0.063476,0.078912,1
1,2008-08-11,11729.669922,11867.110352,11675.530273,11782.349609,11782.349609,183190000,b'Why wont America and Nato help us? If they w...,"b""So this is what it's come to: trading sex fo...",-0.014951,0.008143,113.924792,0.006797,0.029727,0.251849,-0.226695,0
2,2008-08-12,11781.700195,11782.349609,11601.519531,11642.469727,11642.469727,173590000,b'Remember that adorable 9-year-old who sang a...,"b""I'm Trying to Get a Sense of This Whole Geor...",0.156989,0.008199,109.168935,-0.041267,0.015027,0.196334,-0.237175,0
3,2008-08-13,11632.80957,11633.780273,11453.339844,11532.959961,11532.959961,182550000,b' U.S. refuses Israel weapons to attack Iran:...,b'Witness: Russian forces head towards Tbilisi...,0.097857,0.004724,99.079503,-0.001693,-0.010083,0.175528,-0.019893,1
4,2008-08-14,11532.070312,11718.280273,11450.889648,11615.929688,11615.929688,159790000,b'All the experts admit that we should legalis...,b'Taliban wages war on humanitarian aid worker...,0.239243,0.006529,176.78898,-0.051593,-0.047145,0.154382,0.051692,1


In [None]:
# Séparation X et Y
# On ne prends pas en compte les variables textes

# Colonnes numériques
num_cols = ["Low", "Close", "Adj Close", "Volume", "F_1", "F_2", "F_3", "F_4", "F_5", "F_6", "F_7"]

# Target
y = df["target_updown_plus_1_days"].values 

X = (df[num_cols])

In [None]:
# --- Création de séquences temporelles ---
# Les modèles LSTM ont besoin de séquences temporelles en entrée : (batch, time_steps, features)
# X normal --> le LSTM ne sait pas que les données sont ordonnées dans le temps.
# Permet au LSTM/Attention d’apprendre l’influence du passé sur le futur
# On peut tester différentes tailles de fenêtre pour trouver le meilleur compromis
def create_sequences(X, y, window):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])  # vecteurs de features sur plusieurs jours consécutifs
        y_seq.append(y[i])           # target du jour suivant
    return np.array(X_seq), np.array(y_seq)

# --- Définition d'une couche d'attention personnalisée ---
# Chaque timestep reçoit un poids relatif selon son importance pour la prédiction
# La couche renvoie un vecteur de contexte : résumé pondéré des timesteps
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()
        self.dense = Dense(1)  # couche Dense qui calcule un score d’importance pour chaque timestep

    def call(self, inputs, return_attention=False):
        score = tf.nn.softmax(self.dense(inputs), axis=1)  # normalise les scores sur la dimension temporelle
        context = tf.reduce_sum(score * inputs, axis=1)    # vecteur résumé pondéré
        if return_attention:
            return context, score
        return context

# --- Définition du modèle LSTM + Attention ---
def create_model(n_timesteps, n_features):
    inputs = Input(shape=(n_timesteps, n_features))
    x = LSTM(64, return_sequences=True)(inputs)  # LSTM nécessaire pour combiner les timesteps
    x = Dropout(0.2)(x)                          # régularisation pour réduire l'overfitting
    att = Attention()(x)                          # attention sur tous les timesteps
    x = Dense(32, activation="selu")(att)        # dense pour combiner les informations (on pourrait tester selu, tanh, gelu)
    output = Dense(1, activation="sigmoid")(x)   # sigmoid pour obtenir une probabilité entre 0 et 1
    model = Model(inputs, output)
    model.compile(optimizer="adam", 
                  loss="binary_crossentropy",  # perte adaptée pour classification binaire
                  metrics=["accuracy"])
    return model

In [49]:
# --- Liste de fenêtres temporelles à tester ---
windows = [10, 20, 30, 50, 60]  
results = {}

for window in windows:
    print(f"\n=== Fenêtre temporelle = {window} ===")
    X_seq, y_seq = create_sequences(X, y, window)  # Création des séquences selon la fenêtre testée
    n_timesteps = X_seq.shape[1]
    n_features = X_seq.shape[2]

    tscv = TimeSeriesSplit(n_splits=5)
    y_pred_all = []  # liste pour stocker toutes les prédictions
    y_true_all = []  # pour stocker les vraies valeurs correspondantes
    acc_list = []
    roc_list = []
    precision_list = []
    recall_list = []
    f1_list = []

    for train_idx, test_idx in tscv.split(X_seq):
        X_train, X_test = X_seq[train_idx], X_seq[test_idx]
        y_train, y_test = y_seq[train_idx], y_seq[test_idx]

        # --- Scaling sur le fold ---
        scaler = StandardScaler()
        X_train_2d = X_train.reshape(-1, n_features)
        X_test_2d = X_test.reshape(-1, n_features)
        X_train_scaled = scaler.fit_transform(X_train_2d).reshape(-1, n_timesteps, n_features)
        X_test_scaled = scaler.transform(X_test_2d).reshape(-1, n_timesteps, n_features)

        # --- Création et entraînement du modèle ---
        model = create_model(n_timesteps, n_features)
        model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test),
                  epochs=10, batch_size=32, verbose=0)

        # --- Prédiction sur le fold ---
        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

        # --- Stockage des prédictions et des vraies valeurs ---
        y_pred_all.extend(y_pred.flatten())  # On accumule dans la liste
        y_true_all.extend(y_test.flatten())  # Idem pour les vraies valeurs

        # --- Calcul des métriques du fold ---
        acc_list.append(accuracy_score(y_test, y_pred))
        roc_list.append(roc_auc_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred, average="macro"))
        recall_list.append(recall_score(y_test, y_pred, average="macro"))
        f1_list.append(f1_score(y_test, y_pred, average="macro"))

    # --- Conversion en arrays après avoir accumulé toutes les prédictions ---
    y_pred_all = np.array(y_pred_all)
    y_true_all = np.array(y_true_all)

    # --- Calcul des métriques globales pour la fenêtre ---
    print("=== Métriques globales pour la fenêtre ===")
    print("Accuracy:", accuracy_score(y_true_all, y_pred_all))
    print("Precision:", precision_score(y_true_all, y_pred_all, average="macro"))
    print("Recall:", recall_score(y_true_all, y_pred_all, average="macro"))
    print("F1-score:", f1_score(y_true_all, y_pred_all, average="macro"))
    print("ROC-AUC:", roc_auc_score(y_true_all, y_pred_all))

    # --- Matrice de confusion globale ---
    cm = confusion_matrix(y_true_all, y_pred_all)
    print("Confusion Matrix:")
    print(cm)

    # --- Rapport de classification complet ---
    print("\nClassification Report:")
    print(classification_report(y_true_all, y_pred_all))



=== Fenêtre temporelle = 10 ===
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step
=== Métriques globales pour la fenêtre ===
Accuracy: 0.49969604863221884
Precision: 0.4852097672894133
Recall: 0.48656622219840606
F1-score: 0.4792439577820782
ROC-AUC: 0.48656622219840606
Confusion Matrix:
[[248 506]
 [317 574]]

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.33      0.38       754
           1       0.53      0.64      0.58       891

    accuracy                           0.50      1645
   macro avg       0.49      0.49      0.48      1645
weighted avg       0.49      0.50      0.49      1645


=== Fenêtre tem

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step
=== Métriques globales pour la fenêtre ===
Accuracy: 0.4873065015479876
Precision: 0.4896198830409356
Recall: 0.48958937854223517
F1-score: 0.4872496871088861
ROC-AUC: 0.4895893785422352
Confusion Matrix:
[[385 358]
 [470 402]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.52      0.48       743
           1       0.53      0.46      0.49       872

    accuracy                           0.49      1615
   macro avg       0.49      0.49      0.49      1615
weighted avg       0.49      0.49      0.49      1615


=== Fenêtre temporelle = 60 ===
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
=== Métriques globales pour la fenêtre ===
Accuracy: 0.5152647975077882
Precision: 0.4869358442997116
Recall: 0.49144794216108256
F1-score: 0.4536478606031755
ROC-AUC: 0.4914479421610825
Confusion Matrix:
[[144 594]
 [184 683]]

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.20      0.27       738
           1       0.53      0.79      0.64       867

    accuracy                           0.52      1605
   macro avg       0.49      0.49      0.45      1605
weighted avg       0.49      0.52      0.47      1605

