# 🤖 Entraînement des Modèles ML - MarketPulse

## Objectif

Ce notebook entraîne les modèles ML pour la prédiction des prix (LSTM) et l'analyse de sentiment (FinBERT).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
import torch
import logging
import os

# Reproducibility: seed every numeric framework this notebook relies on.
# NumPy and TensorFlow drive the LSTM part; PyTorch backs the FinBERT part.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)

# Plot appearance
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Partie 1: Entraînement du modèle LSTM pour la prédiction des prix

In [None]:
class LSTMModel:
    """Stacked-LSTM regressor predicting the next value of the first feature.

    Typical use: ``prepare_data`` -> ``train`` -> ``evaluate``.

    Attributes:
        sequence_length: Number of past time steps in each input window.
        model: The compiled Keras model, or ``None`` until ``train`` is called.
        scaler: Min-max scaler fitted by ``prepare_data``.
        n_features: Number of feature columns seen by the last
            ``prepare_data`` call (1 until then).
    """

    def __init__(self, sequence_length=60):
        self.sequence_length = sequence_length
        self.model = None
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        # Kept in sync with the scaler so the network input shape and the
        # inverse scaling always agree on the feature count.
        self.n_features = 1

    def prepare_data(self, data, feature_columns=None):
        """Scale the features and cut them into sliding training windows.

        Args:
            data: DataFrame containing at least ``feature_columns``.
            feature_columns: Column name or list of column names to use as
                features. The first one is the prediction target. Defaults
                to ``['Close']``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the X arrays have
            shape ``(samples, sequence_length, n_features)`` and the y arrays
            hold the scaled target that follows each window. The first 80% of
            the windows, in order, form the training set.

        Raises:
            ValueError: If ``data`` has too few rows to build one window.
        """
        if feature_columns is None:
            feature_columns = ['Close']
        elif isinstance(feature_columns, str):
            feature_columns = [feature_columns]
        else:
            feature_columns = list(feature_columns)

        values = data[feature_columns].to_numpy(dtype=float)

        n_windows = len(values) - self.sequence_length
        if n_windows < 1:
            raise ValueError(
                f"Need more than {self.sequence_length} rows to build a "
                f"sequence, got {len(values)}"
            )

        self.n_features = values.shape[1]
        split_idx = int(0.8 * n_windows)

        # Fit the scaler only on the rows that training windows and their
        # targets can touch, so test-period extremes do not leak into training.
        train_rows = self.sequence_length + split_idx
        self.scaler.fit(values[:train_rows])
        scaled_data = self.scaler.transform(values)

        # Window i covers rows [i - sequence_length, i) and predicts row i.
        X = np.stack([
            scaled_data[end - self.sequence_length:end]
            for end in range(self.sequence_length, len(scaled_data))
        ])
        y = scaled_data[self.sequence_length:, 0]

        return X[:split_idx], X[split_idx:], y[:split_idx], y[split_idx:]

    def build_model(self):
        """Build and compile the three-layer LSTM network.

        Returns:
            A compiled ``Sequential`` model taking inputs of shape
            ``(sequence_length, n_features)`` and producing one value.
        """
        model = Sequential([
            LSTM(50, return_sequences=True,
                 input_shape=(self.sequence_length, self.n_features)),
            Dropout(0.2),
            LSTM(50, return_sequences=True),
            Dropout(0.2),
            LSTM(50),
            Dropout(0.2),
            Dense(1)
        ])

        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    def train(self, X_train, y_train, epochs=50, batch_size=32,
              validation_split=0.1):
        """Build a fresh model and fit it on the training windows.

        Args:
            X_train: Input windows, ``(samples, sequence_length, n_features)``.
            y_train: Scaled targets, one per window.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            validation_split: Fraction of the training data held out by Keras
                for validation.

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        self.model = self.build_model()

        history = self.model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            verbose=1
        )

        return history

    def predict(self, X):
        """Return the model's scaled predictions for the windows in ``X``.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("Model is not trained; call train() first")
        return self.model.predict(X)

    def _inverse_target(self, scaled_target):
        """Map scaled target values back to the original target units."""
        target = np.asarray(scaled_target, dtype=float).reshape(-1)
        # The scaler expects every feature column; only column 0 (the target)
        # matters here, so the remaining columns are zero-filled placeholders.
        padded = np.zeros((target.shape[0], self.n_features))
        padded[:, 0] = target
        return self.scaler.inverse_transform(padded)[:, 0]

    def evaluate(self, X_test, y_test):
        """Score the model on held-out windows in original target units.

        Args:
            X_test: Input windows, ``(samples, sequence_length, n_features)``.
            y_test: Scaled targets, one per window.

        Returns:
            ``(mse, mae, pred_actual, y_test_actual)`` where the two arrays
            are the predictions and true values after inverse scaling.
        """
        predictions = self.predict(X_test)

        pred_actual = self._inverse_target(predictions)
        y_test_actual = self._inverse_target(y_test)

        mse = mean_squared_error(y_test_actual, pred_actual)
        mae = mean_absolute_error(y_test_actual, pred_actual)

        return mse, mae, pred_actual, y_test_actual

In [None]:
# Read the prepared ML dataset for the chosen ticker and show a quick summary
symbol = "AAPL"
data_path = f'data/processed/{symbol}_ml_data.csv'
data = pd.read_csv(data_path)

print(f"Donn√©es charg√©es: {data.shape}")
first_rows = data.head()
print(first_rows)

In [None]:
# Create the LSTM wrapper with a 60-step look-back window
lstm_model = LSTMModel(sequence_length=60)

# Window the closing prices and split them, in order, into train and test sets
close_prices = data[['Close']]
X_train, X_test, y_train, y_test = lstm_model.prepare_data(close_prices)

# The LSTM layers expect inputs shaped (samples, time steps, features)
X_train = np.reshape(X_train, (len(X_train), X_train.shape[1], 1))
X_test = np.reshape(X_test, (len(X_test), X_test.shape[1], 1))

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Fit the network: 20 epochs over the training windows, 32 samples per batch
history = lstm_model.train(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
)

In [None]:
# Score the model on the held-out windows; evaluate() reports the errors
# after mapping predictions and targets back through the scaler
evaluation = lstm_model.evaluate(X_test, y_test)
mse, mae, pred_actual, y_test_actual = evaluation

print(f"MSE: {mse}")
print(f"MAE: {mae}")

In [None]:
# Overlay the predicted series on the true series for the test period
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(y_test_actual, label='Valeurs r√©elles', color='blue')
ax.plot(pred_actual, label='Pr√©dictions', color='red')
ax.set_title(f'Pr√©dictions vs R√©alit√© - {symbol}')
ax.set_xlabel('Temps')
ax.set_ylabel('Prix')
ax.legend()
plt.show()

In [None]:
# Persist the trained network as an HDF5 file under models/
model_dir = "models"
# exist_ok avoids the check-then-create race and is a no-op on re-runs
os.makedirs(model_dir, exist_ok=True)

# Build the path once so the saved file and the printed message cannot diverge
model_path = f"{model_dir}/lstm_model_{symbol}.h5"
lstm_model.model.save(model_path)
print(f"Mod√®le LSTM sauvegard√©: {model_path}")

## Partie 2: Entraînement du modèle FinBERT pour l'analyse de sentiment

In [None]:
class SentimentModel:
    """Financial sentiment classifier built on a pretrained FinBERT checkpoint.

    Attributes:
        model_name: Hub id or local path of the checkpoint.
        tokenizer: Tokenizer loaded from ``model_name``.
        model: Sequence-classification model loaded from ``model_name``.
        classifier: Inference pipeline created by ``train``; ``None`` before.
    """

    # Label ids used when the checkpoint config does not name these classes.
    _DEFAULT_LABEL2ID = {"negative": 0, "neutral": 1, "positive": 2}

    def __init__(self, model_name="ProsusAI/finbert"):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.classifier = None

    def _label2id(self):
        """Return the negative/neutral/positive -> class id mapping to train with.

        The checkpoint's own ``config.label2id`` wins when it names all three
        classes, so fine-tuning targets line up with the pretrained head and
        with the label names the inference pipeline reports. Otherwise the
        0/1/2 default above is used.
        """
        config = getattr(self.model, "config", None)
        raw = getattr(config, "label2id", None) or {}
        by_name = {str(name).lower(): int(idx) for name, idx in raw.items()}
        if all(name in by_name for name in self._DEFAULT_LABEL2ID):
            return {name: by_name[name] for name in self._DEFAULT_LABEL2ID}
        return dict(self._DEFAULT_LABEL2ID)

    def create_sample_data(self):
        """Create a tiny hand-labelled example dataset.

        This is placeholder data; a real run would load an actual corpus.

        Returns:
            DataFrame with a ``text`` column and an integer ``label`` column
            whose ids follow ``_label2id()``.
        """
        samples = [
            ("The company reported strong quarterly earnings, exceeding analyst expectations.", "positive"),
            ("Market volatility increases as trade tensions escalate between major economies.", "negative"),
            ("New regulatory changes could impact the financial sector significantly.", "neutral"),
            ("Stock prices surge following positive FDA approval for new drug.", "positive"),
            ("Economic indicators suggest a potential slowdown in the coming quarters.", "negative"),
            ("Company announces major acquisition that could transform its market position.", "positive"),
            ("Investors show caution amid uncertainty about future economic policies.", "neutral"),
            ("Technology sector shows robust growth with new innovation breakthroughs.", "positive"),
            ("Oil prices drop due to oversupply concerns in the global market.", "negative"),
            ("Consumer spending increases, indicating strong economic confidence.", "positive"),
        ]

        label2id = self._label2id()
        texts = [text for text, _ in samples]
        labels = [label2id[sentiment] for _, sentiment in samples]

        return pd.DataFrame({"text": texts, "label": labels})

    def tokenize_data(self, texts):
        """Tokenize texts into padded, truncated PyTorch tensors.

        Args:
            texts: A single string or any iterable of strings (list, Series).

        Returns:
            The tokenizer's batch encoding with ``return_tensors="pt"``.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self.tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

    @staticmethod
    def _compute_metrics(eval_pred):
        """Compute accuracy and weighted precision/recall/F1 for the Trainer."""
        from sklearn.metrics import accuracy_score, precision_recall_fscore_support

        labels = eval_pred.label_ids
        predictions = np.argmax(eval_pred.predictions, axis=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average="weighted", zero_division=0
        )
        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    def train(self, data, output_dir="./finbert_sentiment", epochs=3, batch_size=8,
              model_dir="models/finbert_sentiment"):
        """Fine-tune the model on labelled texts and build an inference pipeline.

        Args:
            data: DataFrame with ``text`` and integer ``label`` columns.
            output_dir: Directory for the Trainer's checkpoints.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            model_dir: Directory where the final model and tokenizer are
                saved and from which ``self.classifier`` is loaded.

        Returns:
            The ``Trainer`` instance after training.
        """
        import inspect
        from sklearn.model_selection import train_test_split

        # Hold out 20% of the rows for validation
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            data["text"], data["label"], test_size=0.2, random_state=42
        )

        train_encodings = self.tokenize_data(train_texts)
        val_encodings = self.tokenize_data(val_texts)

        class FinancialNewsDataset(torch.utils.data.Dataset):
            """Pairs pre-tokenized encodings with their integer labels."""

            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = torch.as_tensor(np.asarray(labels), dtype=torch.long)

            def __getitem__(self, idx):
                # The encodings already hold tensors, so index them directly
                # rather than re-wrapping each row in torch.tensor().
                item = {key: val[idx] for key, val in self.encodings.items()}
                item['labels'] = self.labels[idx]
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = FinancialNewsDataset(train_encodings, train_labels)
        val_dataset = FinancialNewsDataset(val_encodings, val_labels)

        # Warm up over about 10% of the optimizer steps, capped at 500. A fixed
        # 500 steps would outlast the whole run on small datasets and keep the
        # learning rate near zero throughout.
        steps_per_epoch = -(-len(train_dataset) // batch_size)
        warmup_steps = min(500, (steps_per_epoch * epochs) // 10)

        # Newer transformers releases call this argument `eval_strategy`;
        # older ones only know `evaluation_strategy`.
        known_args = inspect.signature(TrainingArguments.__init__).parameters
        strategy_arg = "eval_strategy" if "eval_strategy" in known_args else "evaluation_strategy"

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            logging_dir='./logs',
            save_strategy="epoch",
            load_best_model_at_end=True,
            **{strategy_arg: "epoch"},
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self._compute_metrics,
        )

        trainer.train()

        # Save the final model and tokenizer side by side
        os.makedirs(model_dir, exist_ok=True)
        trainer.save_model(model_dir)
        self.tokenizer.save_pretrained(model_dir)

        # Load the saved artefacts into an inference pipeline
        self.classifier = pipeline(
            "sentiment-analysis",
            model=model_dir,
            tokenizer=model_dir
        )

        return trainer

In [None]:
# Load FinBERT through the wrapper and build the example dataset
sentiment_model = SentimentModel()
data = sentiment_model.create_sample_data()

print("Donn√©es d'entra√Ænement:")
print(data)

# Fine-tuning is left disabled here because it needs substantial compute.
# To run it, call sentiment_model.train(data).

# Use the pretrained weights as-is. The pipeline reuses the model and tokenizer
# that SentimentModel already loaded instead of loading the checkpoint twice.
classifier = pipeline(
    "sentiment-analysis",
    model=sentiment_model.model,
    tokenizer=sentiment_model.tokenizer,
)

# Try the classifier on a few unseen headlines
test_texts = [
    "The company's earnings exceeded expectations, showing strong growth.",
    "Market uncertainty continues to affect investor confidence.",
    "New product launch expected to drive revenue growth."
]

print("\nR√©sultats du mod√®le de sentiment:")
for text in test_texts:
    result = classifier(text)
    print(f"Texte: {text}")
    print(f"Sentiment: {result}\n")

## Conclusion

Ce notebook a entraîné deux modèles ML importants pour MarketPulse:

1. **Modèle LSTM** pour prédire les prix des actions
2. **Modèle FinBERT** pour l'analyse de sentiment

Les modèles sont maintenant prêts à être intégrés dans le pipeline de traitement de données.