In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout

In [4]:
df = pd.read_parquet("../data/finance_ml_dataset_clean.parquet", engine = "fastparquet")
# On supprime la target en valeur
df_target = df.copy()
df_target = df_target.drop(columns=["target_returns_plus_1_days"])
# Vérification
df_target.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,headline_concat,reddit_concat,F_1,F_2,F_3,F_4,F_5,F_6,F_7,target_updown_plus_1_days
0,2008-08-08,11432.089844,11759.959961,11388.040039,11734.320312,11734.320312,212830000,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.'. b'Rus...,-0.3066,0.007364,0.0,0.040111,0.029154,0.063476,0.078912,1
1,2008-08-11,11729.669922,11867.110352,11675.530273,11782.349609,11782.349609,183190000,b'Why wont America and Nato help us? If they w...,"b""So this is what it's come to: trading sex fo...",-0.014951,0.008143,113.924792,0.006797,0.029727,0.251849,-0.226695,0
2,2008-08-12,11781.700195,11782.349609,11601.519531,11642.469727,11642.469727,173590000,b'Remember that adorable 9-year-old who sang a...,"b""I'm Trying to Get a Sense of This Whole Geor...",0.156989,0.008199,109.168935,-0.041267,0.015027,0.196334,-0.237175,0
3,2008-08-13,11632.80957,11633.780273,11453.339844,11532.959961,11532.959961,182550000,b' U.S. refuses Israel weapons to attack Iran:...,b'Witness: Russian forces head towards Tbilisi...,0.097857,0.004724,99.079503,-0.001693,-0.010083,0.175528,-0.019893,1
4,2008-08-14,11532.070312,11718.280273,11450.889648,11615.929688,11615.929688,159790000,b'All the experts admit that we should legalis...,b'Taliban wages war on humanitarian aid worker...,0.239243,0.006529,176.78898,-0.051593,-0.047145,0.154382,0.051692,1


In [None]:
# Pré-traitement texte
#tfidf_head = TfidfVectorizer(max_features=300)
#tfidf_reddit = TfidfVectorizer(max_features=300)
#X_head = tfidf_head.fit_transform(df_target["headline_concat"]).toarray()
#X_reddit = tfidf_reddit.fit_transform(df_target["reddit_concat"]).toarray()

In [23]:
# Séparation X et Y
# On ne prends pas en compte les variables texte

# Colonnes numériques
num_cols = ["Low", "Close", "Adj Close", "Volume", "F_1", "F_2", "F_3", "F_4", "F_5", "F_6", "F_7"]

# Target
y = df["target_updown_plus_1_days"].values  # numpy array

X = (df[num_cols])

In [None]:
# Création de séquences temporelles
# Les modèles LSTM ont besoin de séquences temporelles en entrée : (batch, time_steps, features)
# X normal --> le LSTM ne sait pas que les données sont ordonnées dans le temps.
# Permet au LSTM/Attention d’apprendre l’influence du passé sur le futur

# sur une fenêtre de 30 jours car : 
# si trop petit (ex. 5 jours) : le LSTM n’a pas assez de contexte historique
# si trop grand (ex. 365 jours) : plus de contexte, mais risque de surcharger le modèle et ralentir l’entraînement
# Possible aussi de faire un grid search sur la fenêtre, ou de tester en modifiant
def create_sequences(X, y, window=30):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

X_seq, y_seq = create_sequences(X, y, window=30)
# X_seq : vecteurs de features sur plusieurs jours consécutifs (input pour LSTM)
# y_seq : target du jour suivant

# Définition d'une couche d'attention Keras
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()
        self.dense = Dense(1) # couche Dense qui calcule un score d’importance pour chaque timestep

    def call(self, inputs, return_attention=False):
        score = tf.nn.softmax(self.dense(inputs), 
                              # softmax : normalise ces scores sur la dimension temporelle : somme = 1 pour chaque séquence
                                # Chaque timestep reçoit un poids relatif
                              axis=1)  # shape: (batch, time, 1) : score par timestep
        context = tf.reduce_sum(score * inputs, axis=1)    # shape: (batch, features)
        if return_attention == True:
            return context, score # retourne le vecteur de contexte + les poids d’attention
        return context # Sinon, retourne juste le vecteur résumé pour le modèle

# Définition du modèle avec LSTM + Attention
def create_model(n_features):
    inputs = Input(shape=(30, # changer en fonction de la fenêtre temporelle
                          n_features))
    x = LSTM( # LSTM nécessaire pour l'attention car combine les timesteps
        64, # taille du vecteur de sortie (plus grand = plus de capacité mais risque d'overfitting)
             return_sequences=True # garde la sortie de tous les timesteps
             )(inputs)
    x = Dropout(0.2)(x) # régularisation pour réduire l'overfitting (plus on augmente, plus ça régularise)
    att = Attention()(x)
    x = Dense(32, # nombre de neuronnes (plus grand = plus de complexité)
              activation="selu")(att) # tester selu, tanh, gelu
    output = Dense(1, activation="sigmoid")(x) # sigmoid : permet de retrouver une probabilité entre 0 et 1
    model = Model(inputs, output)
    model.compile(optimizer="adam", 
                  loss="binary_crossentropy", # perte adaptée pour classification binaire
                  metrics=["accuracy"])
    return model

model = create_model(n_features=X_seq.shape[2])
model.summary()

In [None]:
# Entraînement
tscv = TimeSeriesSplit(n_splits=5) 
# les données sont découpées en K=5 expériences successives
# Chaque split utilise les données antérieures pour entraîner et les données postérieures pour tester
# à chaque fold le train s'aggrandit
# pratique standard : performance finale = moyenne ± écart-type sur les folds

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq)):
    print(f"Fold {fold+1}")

    X_train, X_test = X_seq[train_idx], X_seq[test_idx]
    y_train, y_test = y_seq[train_idx], y_seq[test_idx]

    # Reshape pour le scaling : (batch*time, features)
    n_timesteps = X_train.shape[1]
    n_features = X_train.shape[2]

    scaler = StandardScaler()

    X_train_2d = X_train.reshape(-1, n_features)
    X_test_2d = X_test.reshape(-1, n_features)

    X_train_scaled = scaler.fit_transform(X_train_2d)
    X_test_scaled = scaler.transform(X_test_2d)

    # Retour en 3D
    X_train_scaled = X_train_scaled.reshape(-1, n_timesteps, n_features)
    X_test_scaled = X_test_scaled.reshape(-1, n_timesteps, n_features)

    model = create_model(n_features=n_features)

    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_test_scaled, y_test),
        epochs=10,
        batch_size=32,
        verbose=1
    )

# Prédiction sur le dernier fold
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)


Fold 1
Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 105ms/step - accuracy: 0.5213 - loss: 0.6928 - val_accuracy: 0.5521 - val_loss: 0.6848
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5701 - loss: 0.6816 - val_accuracy: 0.5613 - val_loss: 0.6800
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5976 - loss: 0.6739 - val_accuracy: 0.5583 - val_loss: 0.6828
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5945 - loss: 0.6748 - val_accuracy: 0.5644 - val_loss: 0.6802
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5701 - loss: 0.6720 - val_accuracy: 0.5399 - val_loss: 0.6861
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5945 - loss: 0.6692 - val_accuracy: 0.5583 - val_loss: 0.6812
Epoch 7/10
[1m11/11[0m 

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
# Matrice de confusion
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
# Rapport complet
print(classification_report(y_test, y_pred))
# ROC-AUC
roc = roc_auc_score(y_test, y_pred)
print("ROC-AUC:", roc)

Accuracy: 0.50920245398773
Confusion Matrix:
 [[ 46 113]
 [ 47 120]]
              precision    recall  f1-score   support

           0       0.49      0.29      0.37       159
           1       0.52      0.72      0.60       167

    accuracy                           0.51       326
   macro avg       0.50      0.50      0.48       326
weighted avg       0.51      0.51      0.49       326

ROC-AUC: 0.503935525176063


In [None]:
import tensorflow as tf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score

# Nombre de folds
tscv = TimeSeriesSplit(n_splits=5)

# Listes pour stocker les métriques par fold
acc_list = []
roc_list = []
precision_list = []
recall_list = []
f1_list = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq)):
    print(f"Fold {fold + 1}")

    # Split temporel
    X_train, X_test = X_seq[train_idx], X_seq[test_idx]
    y_train, y_test = y_seq[train_idx], y_seq[test_idx]

    n_timesteps = X_train.shape[1]
    n_features = X_train.shape[2]

    # Scaling
    scaler = StandardScaler()
    X_train_2d = X_train.reshape(-1, n_features)
    X_test_2d = X_test.reshape(-1, n_features)
    X_train_scaled = scaler.fit_transform(X_train_2d)
    X_test_scaled = scaler.transform(X_test_2d)
    X_train_scaled = X_train_scaled.reshape(-1, n_timesteps, n_features)
    X_test_scaled = X_test_scaled.reshape(-1, n_timesteps, n_features)

    # Création d'un modèle par fold
    model = create_model(n_features=n_features)

    # Entraînement
    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_test_scaled, y_test),
        epochs=10, # le modèle passe 10 fois sur toutes les données d'entrainement pour chaque fold
        batch_size=32,
        verbose=1
    )

    # Prédiction sur le test
    y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

    # Calcul des métriques
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    acc_list.append(acc)
    roc_list.append(roc)
    precision_list.append(precision_score(y_test, y_pred, average=None))
    recall_list.append(recall_score(y_test, y_pred, average=None))
    f1_list.append(f1_score(y_test, y_pred, average=None))

    print(f"Accuracy Fold {fold+1}: {acc:.4f}")
    print(f"ROC-AUC Fold {fold+1}: {roc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Moyenne et écart-type des métriques
print("Performance globale :")
print(f"Accuracy moyenne: {np.mean(acc_list):.4f} ± {np.std(acc_list):.4f}")
print(f"Precision moyenne: {np.mean(precision_list):.4f} ± {np.std(precision_list):.4f}")
print(f"Recall moyen: {np.mean(recall_list):.4f} ± {np.std(recall_list):.4f}")
print(f"F1-score moyen: {np.mean(f1_list):.4f} ± {np.std(f1_list):.4f}")
print(f"ROC-AUC moyen: {np.mean(roc_list):.4f} ± {np.std(roc_list):.4f}")


Fold 1
Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 92ms/step - accuracy: 0.5000 - loss: 0.6977 - val_accuracy: 0.5736 - val_loss: 0.6866
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5549 - loss: 0.6827 - val_accuracy: 0.5521 - val_loss: 0.6836
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5823 - loss: 0.6781 - val_accuracy: 0.5767 - val_loss: 0.6854
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5701 - loss: 0.6789 - val_accuracy: 0.5706 - val_loss: 0.6870
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5732 - loss: 0.6744 - val_accuracy: 0.5491 - val_loss: 0.6942
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5945 - loss: 0.6748 - val_accuracy: 0.5706 - val_loss: 0.6865
Epoch 7/10
[1m11/11[0m [

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.5270 - loss: 0.6931 - val_accuracy: 0.5123 - val_loss: 0.6996
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5429 - loss: 0.6905 - val_accuracy: 0.4877 - val_loss: 0.6979
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5398 - loss: 0.6876 - val_accuracy: 0.4969 - val_loss: 0.6940
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5404 - loss: 0.6876 - val_accuracy: 0.5092 - val_loss: 0.6962
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5564 - loss: 0.6850 - val_accuracy: 0.4939 - val_loss: 0.7003
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5490 - loss: 0.6855 - val_accuracy: 0.4939 - val_loss: 0.6986
Epoch 7/10
[1m51/51[0m [32m━━━━

In [40]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense, Layer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# --- Création de séquences temporelles ---
# Les modèles LSTM ont besoin de séquences temporelles en entrée : (batch, time_steps, features)
# X normal --> le LSTM ne sait pas que les données sont ordonnées dans le temps.
# Permet au LSTM/Attention d’apprendre l’influence du passé sur le futur
# On peut tester différentes tailles de fenêtre pour trouver le meilleur compromis
def create_sequences(X, y, window):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])  # vecteurs de features sur plusieurs jours consécutifs
        y_seq.append(y[i])           # target du jour suivant
    return np.array(X_seq), np.array(y_seq)

# --- Définition d'une couche d'attention personnalisée ---
# Chaque timestep reçoit un poids relatif selon son importance pour la prédiction
# La couche renvoie un vecteur de contexte : résumé pondéré des timesteps
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()
        self.dense = Dense(1)  # couche Dense qui calcule un score d’importance pour chaque timestep

    def call(self, inputs, return_attention=False):
        score = tf.nn.softmax(self.dense(inputs), axis=1)  # normalise les scores sur la dimension temporelle
        context = tf.reduce_sum(score * inputs, axis=1)    # vecteur résumé pondéré
        if return_attention:
            return context, score
        return context

# --- Définition du modèle LSTM + Attention ---
def create_model(n_timesteps, n_features):
    inputs = Input(shape=(n_timesteps, n_features))
    x = LSTM(64, return_sequences=True)(inputs)  # LSTM nécessaire pour combiner les timesteps
    x = Dropout(0.2)(x)                          # régularisation pour réduire l'overfitting
    att = Attention()(x)                          # attention sur tous les timesteps
    x = Dense(32, activation="selu")(att)        # dense pour combiner les informations (selu, tanh, gelu)
    output = Dense(1, activation="sigmoid")(x)   # sigmoid pour obtenir une probabilité entre 0 et 1
    model = Model(inputs, output)
    model.compile(optimizer="adam", 
                  loss="binary_crossentropy",  # perte adaptée pour classification binaire
                  metrics=["accuracy"])
    return model

# --- Liste de fenêtres temporelles à tester ---
windows = [10, 20, 30, 50, 60]  
results = {}

for window in windows:
    print(f"\n=== Fenêtre temporelle = {window} ===")
    X_seq, y_seq = create_sequences(X, y, window)
    n_timesteps = X_seq.shape[1]
    n_features = X_seq.shape[2]

    tscv = TimeSeriesSplit(n_splits=5)
    acc_list = []
    roc_list = []
    precision_list = []
    recall_list = []
    f1_list = []

    for train_idx, test_idx in tscv.split(X_seq):
        X_train, X_test = X_seq[train_idx], X_seq[test_idx]
        y_train, y_test = y_seq[train_idx], y_seq[test_idx]

        # --- Scaling sur le fold ---
        scaler = StandardScaler()
        X_train_2d = X_train.reshape(-1, n_features)
        X_test_2d = X_test.reshape(-1, n_features)
        X_train_scaled = scaler.fit_transform(X_train_2d).reshape(-1, n_timesteps, n_features)
        X_test_scaled = scaler.transform(X_test_2d).reshape(-1, n_timesteps, n_features)

        # --- Création et entraînement du modèle ---
        model = create_model(n_timesteps, n_features)
        model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test),
                  epochs=10, batch_size=32, verbose=0)

        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)
        acc_list.append(accuracy_score(y_test, y_pred))

    results[window] = np.mean(acc_list)
    print(f"Accuracy moyenne: {results[window]:.4f}")

# --- Sélection de la fenêtre optimale - par rapport à l'accuracy ---
print("\nFenêtre optimale:", max(results, key=results.get), 
      "avec accuracy =", max(results.values()))


=== Fenêtre temporelle = 10 ===
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
Accuracy moyenne: 0.5046

=== Fenêtre temporelle = 20 ===
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step
Accuracy moyenne: 0.4915

=== Fenêtre temporelle = 30 ===
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━