In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout

In [80]:
df = pd.read_parquet("data/finance_ml_dataset_clean.parquet", engine = "fastparquet")
# On supprime la target en valeur
df_target = df.copy()
df_target = df_target.drop(columns=["target_returns_plus_1_days"])
# Vérification
df_target.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume,headline_concat,reddit_concat,target_updown_plus_1_days
0,2008-08-08,11432.089844,11759.959961,11388.040039,11734.320312,11734.320312,212830000,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.'. b'Rus...,1
1,2008-08-11,11729.669922,11867.110352,11675.530273,11782.349609,11782.349609,183190000,b'Why wont America and Nato help us? If they w...,"b""So this is what it's come to: trading sex fo...",0
2,2008-08-12,11781.700195,11782.349609,11601.519531,11642.469727,11642.469727,173590000,b'Remember that adorable 9-year-old who sang a...,"b""I'm Trying to Get a Sense of This Whole Geor...",0
3,2008-08-13,11632.80957,11633.780273,11453.339844,11532.959961,11532.959961,182550000,b' U.S. refuses Israel weapons to attack Iran:...,b'Witness: Russian forces head towards Tbilisi...,1
4,2008-08-14,11532.070312,11718.280273,11450.889648,11615.929688,11615.929688,159790000,b'All the experts admit that we should legalis...,b'Taliban wages war on humanitarian aid worker...,1


In [None]:
# Pré-traitement texte
#tfidf_head = TfidfVectorizer(max_features=300)
#tfidf_reddit = TfidfVectorizer(max_features=300)
#X_head = tfidf_head.fit_transform(df_target["headline_concat"]).toarray()
#X_reddit = tfidf_reddit.fit_transform(df_target["reddit_concat"]).toarray()

In [81]:
# Séparation X et Y
# On ne prends pas en compte les variables texte

# Colonnes numériques
num_cols = ["Low", "Close", "Adj Close", "Volume"]

# Target
y = df["target_updown_plus_1_days"].values  # numpy array

# Normalisation
scaler = StandardScaler()
X = scaler.fit_transform(df[num_cols])

In [82]:
# Création de séquences temporelles
# Les modèles LSTM ont besoin de séquences temporelles en entrée : (batch, time_steps, features)
# X normal --> le LSTM ne sait pas que les données sont ordonnées dans le temps.
# Permet au LSTM/Attention d’apprendre l’influence du passé sur le futur

# sur une fenêtre de 30 jours car : 
# si trop petit (ex. 5 jours) : le LSTM n’a pas assez de contexte historique
# si trop grand (ex. 365 jours) : plus de contexte, mais risque de surcharger le modèle et ralentir l’entraînement
# Possible aussi de faire un grid search sur la fenêtre, ou de tester en modifiant
def create_sequences(X, y, window=30):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

X_seq, y_seq = create_sequences(X, y, window=30)
# X_seq : vecteurs de features sur plusieurs jours consécutifs (input pour LSTM)
# y_seq : target du jour suivant

In [83]:
# Définition d'une couche d'attention Keras
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()
        self.dense = Dense(1) # couche Dense qui calcule un score d’importance pour chaque timestep

    def call(self, inputs, return_attention=False):
        score = tf.nn.softmax(self.dense(inputs), 
                              # softmax : normalise ces scores sur la dimension temporelle : somme = 1 pour chaque séquence
                                # Chaque timestep reçoit un poids relatif
                              axis=1)  # shape: (batch, time, 1) : score par timestep
        context = tf.reduce_sum(score * inputs, axis=1)    # shape: (batch, features)
        if return_attention:
            return context, score # retourne le vecteur de contexte + les poids d’attention
        return context # Sinon, retourne juste le vecteur résumé pour le modèle

In [None]:
def create_model(n_features):
    inputs = Input(shape=(30, # changer en fonction de la fenêtre temporelle
                          n_features))
    x = LSTM( # LSTM nécessaire pour l'attention car combine les timesteps
        64, # taille du vecteur de sortie (plus grand = plus de capacité mais risque d'overfitting)
             return_sequences=True # garde la sortie de tous les timesteps
             )(inputs)
    x = Dropout(0.2)(x) # régularisation pour réduire l'overfitting (plus on augmente, plus ça régularise)
    att = Attention()(x)
    x = Dense(32, # nombre de neuronnes (plus grand = plus de complexité)
              activation="selu")(att) # tester selu, tanh, gelu
    output = Dense(1, activation="sigmoid")(x) # sigmoid : permet de retrouver une probabilité entre 0 et 1
    model = Model(inputs, output)
    model.compile(optimizer="adam", 
                  loss="binary_crossentropy", # perte adaptée pour classification binaire
                  metrics=["accuracy"])
    return model

model = create_model(n_features=X_seq.shape[2])
model.summary()

In [85]:
# Entraînement
tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq)):
    print(f"Fold {fold+1}")

    X_train, X_test = X_seq[train_idx], X_seq[test_idx]
    y_train, y_test = y_seq[train_idx], y_seq[test_idx]

    model = create_model(n_features=X_seq.shape[2])

    model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=10,
        batch_size=32,
        verbose=1
    )

y_pred = (model.predict(X_test) > 0.5).astype(int)

# Fenêtre accuracy
# 15 : 0.5121
# 30 : 0.5122
# 60 : 0.5171
# 90 : 0.5189
# 180 : 0.5116


Fold 1
Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 91ms/step - accuracy: 0.4756 - loss: 0.7094 - val_accuracy: 0.4908 - val_loss: 0.6930
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.4970 - loss: 0.6957 - val_accuracy: 0.5736 - val_loss: 0.6837
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5061 - loss: 0.6987 - val_accuracy: 0.5736 - val_loss: 0.6815
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5274 - loss: 0.6911 - val_accuracy: 0.5920 - val_loss: 0.6858
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5518 - loss: 0.6896 - val_accuracy: 0.5736 - val_loss: 0.6825
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.5152 - loss: 0.6906 - val_accuracy: 0.5736 - val_loss: 0.6824
Epoch 7/10
[1m11/11[0m [

In [79]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
# Matrice de confusion
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
# Rapport complet
print(classification_report(y_test, y_pred))
# ROC-AUC
roc = roc_auc_score(y_test, y_pred)
print("ROC-AUC:", roc)

Accuracy: 0.5116279069767442
Confusion Matrix:
 [[  0 147]
 [  0 154]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       147
           1       0.51      1.00      0.68       154

    accuracy                           0.51       301
   macro avg       0.26      0.50      0.34       301
weighted avg       0.26      0.51      0.35       301

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [60]:
# Créer un sous-modèle pour récupérer l'attention
attention_layer = model.get_layer(index=2)  # index de la couche Attention dans ton modèle
attention_model = tf.keras.Model(inputs=model.input, outputs=attention_layer(model.input, return_attention=True))
# Récupérer les poids d'attention pour X_test
_, attention_scores = attention_model.predict(X_test)  # shape: (batch, time, 1)
attention_scores = attention_scores.squeeze(-1)         # shape: (batch, time)

TypeError: got an unexpected keyword argument 'return_attention'

In [None]:
# Visualisation des scores
# Exemple pour la première séquence
plt.figure(figsize=(10,4))
plt.plot(attention_scores[0])
plt.title("Scores d'attention sur la première séquence")
plt.xlabel("Time step")
plt.ylabel("Attention weight")
plt.show()