# 📊 Exploration des Données - MarketPulse: Real-Time Sentiment & Price Anomaly Detection System

## Objectif

Ce notebook explore les données financières pour comprendre les tendances, détecter les anomalies et préparer l'entraînement des modèles ML pour le système de détection d'anomalies.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

# Display setup: matplotlib style sheet first, then the seaborn palette
# (same call order as before), and no cap on the DataFrame columns shown.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.options.display.max_columns = None

In [None]:
# Download sample data: two years of history per ticker, keyed by symbol.
symbols = ["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN"]
data = {}

for symbol in symbols:
    print(f"T√©l√©chargement des donn√©es pour {symbol}...")
    stock = yf.Ticker(symbol)
    history = stock.history(period="2y")
    # Tag every row with its ticker so the frames can be stacked later.
    history['Symbol'] = symbol
    data[symbol] = history
    print(f"  - {len(data[symbol])} enregistrements t√©l√©charg√©s")

In [None]:
# Stack the per-symbol frames into one long table, move the index into a
# regular column, and rename that column to 'Date' if it came out as 'index'.
all_data = (
    pd.concat(data.values(), ignore_index=False)
    .reset_index()
    .rename(columns={'index': 'Date'})
)

print(f"Donn√©es totales: {all_data.shape}")
print(f"P√©riode: {all_data['Date'].min()} √† {all_data['Date'].max()}")
print(f"Symboles: {all_data['Symbol'].unique()}")
all_data.head()

In [None]:
# Summary statistics for the price and volume columns (all symbols pooled)
print("Statistiques descriptives:")
ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
all_data.loc[:, ohlcv_columns].describe()

In [None]:
# Closing price over time, one line per symbol
fig = px.line(
    data_frame=all_data,
    x='Date',
    y='Close',
    color='Symbol',
    title='Prix de Cl√¥ture des Actions',
)
fig.show()

In [None]:
# Traded volume over time, one bar series per symbol
fig = px.bar(
    data_frame=all_data,
    x='Date',
    y='Volume',
    color='Symbol',
    title='Volume des Transactions',
)
fig.show()

In [None]:
# Period-over-period returns, computed within each symbol so that the first
# row of one ticker is never compared with the last row of another.
close_by_symbol = all_data.groupby('Symbol')['Close']
all_data['Returns'] = close_by_symbol.pct_change()

# Histogram of the returns, coloured by symbol
fig = px.histogram(
    data_frame=all_data,
    x='Returns',
    color='Symbol',
    title='Distribution des Rendements',
)
fig.show()

In [None]:
# Correlation analysis: reshape to one 'Close' column per symbol, then
# correlate the row-over-row percentage changes. Note that the matrix is
# built from returns, not from price levels, although the title says prices.
pivot_data = all_data.pivot(index='Date', columns='Symbol', values='Close')
daily_returns = pivot_data.pct_change()
correlation_matrix = daily_returns.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Matrice de Corr√©lation des Prix de Cl√¥ture')
plt.show()

In [None]:
# Volatility: 30-row rolling standard deviation of returns, per symbol.
# The grouped rolling result is indexed by (Symbol, original row label), so
# the Symbol level is dropped to let the assignment align on the row labels.
rolling_std = all_data.groupby('Symbol')['Returns'].rolling(window=30).std()
all_data['Volatility'] = rolling_std.reset_index(level=0, drop=True)

fig = px.line(
    data_frame=all_data,
    x='Date',
    y='Volatility',
    color='Symbol',
    title='Volatilit√© (30 jours)',
)
fig.show()

In [None]:
# Anomaly detection with a rolling Z-score
def detect_anomalies_zscore(df, symbol, threshold=2.0, window=30):
    """Flag closes that sit far from their rolling mean, for one symbol.

    The rows of ``df`` whose ``'Symbol'`` equals ``symbol`` are copied and
    scored in their existing row order, so the input is expected to be in
    chronological order already.

    Args:
        df: DataFrame with at least ``'Symbol'`` and ``'Close'`` columns.
        symbol: Value of ``'Symbol'`` to select.
        threshold: A row is anomalous when ``abs(Z_Score) > threshold``.
        window: Number of trailing rows (current row included) used for the
            rolling mean and standard deviation. Defaults to 30.

    Returns:
        A copy of the selected rows with four extra columns:
        ``'Rolling_Mean'``, ``'Rolling_Std'``, ``'Z_Score'`` and the boolean
        ``'Is_Anomaly'``. ``df`` itself is not modified.
    """
    symbol_data = df[df['Symbol'] == symbol].copy()
    close = symbol_data['Close']

    # min_periods=1 lets the statistics start on the first row instead of
    # waiting for a full window; the std of a single value is still NaN.
    rolling = close.rolling(window=window, min_periods=1)
    symbol_data['Rolling_Mean'] = rolling.mean()
    symbol_data['Rolling_Std'] = rolling.std()

    # A flat window has zero spread. Dividing a (possibly rounding-noise)
    # residual by zero would yield +/-inf and a spurious anomaly flag, so
    # non-positive std values are masked to NaN before the division.
    usable_std = symbol_data['Rolling_Std'].where(symbol_data['Rolling_Std'] > 0)
    symbol_data['Z_Score'] = (close - symbol_data['Rolling_Mean']) / usable_std

    # NaN scores compare False, so rows without a usable score are not flagged.
    symbol_data['Is_Anomaly'] = symbol_data['Z_Score'].abs() > threshold

    return symbol_data

# Run the Z-score detector on every symbol, then report the flagged counts
anomaly_data = {
    ticker: detect_anomalies_zscore(all_data, ticker) for ticker in symbols
}
for symbol in symbols:
    anomaly_count = anomaly_data[symbol]['Is_Anomaly'].sum()
    print(f"Anomalies d√©tect√©es pour {symbol}: {anomaly_count}")

In [None]:
# Plot the Z-score anomalies for a single symbol
symbol = "AAPL"
symbol_df = anomaly_data[symbol]
anomaly_points = symbol_df[symbol_df['Is_Anomaly']]

# Three layers, built up front and added in drawing order:
# the close line, its rolling mean, and red markers on the flagged rows.
close_trace = go.Scatter(
    x=symbol_df['Date'],
    y=symbol_df['Close'],
    mode='lines',
    name='Prix de cl√¥ture',
)
mean_trace = go.Scatter(
    x=symbol_df['Date'],
    y=symbol_df['Rolling_Mean'],
    mode='lines',
    name='Moyenne mobile (30j)',
    line={'color': 'orange'},
)
anomaly_trace = go.Scatter(
    x=anomaly_points['Date'],
    y=anomaly_points['Close'],
    mode='markers',
    name='Anomalies',
    marker={'color': 'red', 'size': 8, 'symbol': 'x'},
    # The rounded Z-score is exposed to the hover template as %{text}.
    text=anomaly_points['Z_Score'].round(2),
    hovertemplate='Date: %{x}<br>Prix: %{y}<br>Z-Score: %{text}<extra></extra>',
)

fig = go.Figure()
for trace in (close_trace, mean_trace, anomaly_trace):
    fig.add_trace(trace)

fig.update_layout(
    title=f'Prix de cl√¥ture avec d√©tection d\'anomalies - {symbol}',
    xaxis_title='Date',
    yaxis_title='Prix',
)
fig.show()

In [None]:
# Other anomaly detection methods
def detect_anomalies_iqr(df, symbol):
    """Flag closes outside the 1.5 * IQR fences, for one symbol.

    The quartiles are taken over all ``'Close'`` values of the selected
    symbol at once (no rolling window).

    Args:
        df: DataFrame with at least ``'Symbol'`` and ``'Close'`` columns.
        symbol: Value of ``'Symbol'`` to select.

    Returns:
        A copy of the selected rows with an added boolean column
        ``'Is_Anomaly_IQR'``. ``df`` itself is not modified.
    """
    subset = df[df['Symbol'] == symbol].copy()
    closes = subset['Close']

    # Quartiles and the fences derived from them
    q1, q3 = closes.quantile(0.25), closes.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # Strictly below the low fence or strictly above the high fence
    subset['Is_Anomaly_IQR'] = closes.lt(low_fence) | closes.gt(high_fence)
    return subset

# Compare the two detection methods: flagged-row counts per symbol
for symbol in symbols:
    iqr_data = detect_anomalies_iqr(all_data, symbol)
    iqr_count = iqr_data['Is_Anomaly_IQR'].sum()
    # Z-score results are reused from the earlier detection pass.
    zscore_count = anomaly_data[symbol]['Is_Anomaly'].sum()
    print(f"{symbol}: {zscore_count} anomalies (Z-score), {iqr_count} anomalies (IQR)")

In [None]:
# ML training data preparation with anomaly indicators
def prepare_ml_data_with_anomalies(df, symbol):
    """Build the training table for one symbol.

    Adds moving averages, RSI, MACD, rolling Z-score anomaly columns and a
    next-row target to a copy of the symbol's rows, then drops every row
    that contains a missing value in any column.

    Args:
        df: DataFrame with at least ``'Symbol'`` and ``'Close'`` columns.
        symbol: Value of ``'Symbol'`` to select.

    Returns:
        A new DataFrame with the added columns ``'SMA_20'``, ``'SMA_50'``,
        ``'RSI'``, ``'MACD'``, ``'Rolling_Mean'``, ``'Rolling_Std'``,
        ``'Z_Score'``, ``'Is_Anomaly'`` and ``'Target'``.
    """
    frame = df[df['Symbol'] == symbol].copy()
    close = frame['Close']

    # Technical indicators
    frame['SMA_20'] = close.rolling(window=20).mean()
    frame['SMA_50'] = close.rolling(window=50).mean()
    frame['RSI'] = calculate_rsi(close)
    frame['MACD'] = calculate_macd(close)

    # Anomaly indicators: 30-row rolling Z-score, flagged beyond +/-2.0
    trailing = close.rolling(window=30)
    frame['Rolling_Mean'] = trailing.mean()
    frame['Rolling_Std'] = trailing.std()
    frame['Z_Score'] = (close - frame['Rolling_Mean']) / frame['Rolling_Std']
    frame['Is_Anomaly'] = frame['Z_Score'].abs() > 2.0

    # Target is the close of the following row (shifted up by one)
    frame['Target'] = close.shift(-1)

    # Warm-up rows of the rolling features and the last row (which has no
    # following close) contain NaN and are removed here.
    return frame.dropna()

def calculate_rsi(prices, window=14):
    """Compute an RSI based on simple rolling means of gains and losses.

    Args:
        prices: pandas Series of prices in chronological order.
        window: Number of consecutive price changes averaged per value.

    Returns:
        A Series aligned with ``prices``. The first ``window`` entries are
        NaN, because ``window`` price changes require ``window + 1`` prices.
        A window with gains and no losses gives 100; a window with neither
        gains nor losses gives NaN (0 / 0).
    """
    delta = prices.diff()
    # clip() keeps the leading NaN produced by diff(), so the undefined first
    # change is not counted as a flat day inside the first rolling window.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    # A zero average loss yields rs = inf, which maps to an RSI of 100 below.
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Return the MACD line: fast EMA of ``prices`` minus slow EMA.

    Args:
        prices: pandas Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Accepted but not used here; no signal line is computed.

    Returns:
        A Series aligned with ``prices``.
    """
    def ema(span):
        # Recursive (adjust=False) exponential moving average
        return prices.ewm(span=span, adjust=False).mean()

    return ema(fast) - ema(slow)

# Build the training table for every symbol and report its size
ml_data = {}
for symbol in symbols:
    prepared = prepare_ml_data_with_anomalies(all_data, symbol)
    ml_data[symbol] = prepared
    anomaly_count = prepared['Is_Anomaly'].sum()
    print(f"Donn√©es ML pour {symbol}: {ml_data[symbol].shape}, Anomalies: {anomaly_count}")

In [None]:
# Plot the technical indicators for a single symbol
symbol = "AAPL"
symbol_df = ml_data[symbol]

# One line per price series: the close and both simple moving averages
fig = go.Figure()
for column, label in (('Close', 'Close'), ('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50')):
    fig.add_trace(go.Scatter(x=symbol_df['Date'], y=symbol_df[column], name=label))

# Highlight the rows flagged as anomalies with red markers
anomaly_points = symbol_df[symbol_df['Is_Anomaly']]
fig.add_trace(
    go.Scatter(
        x=anomaly_points['Date'],
        y=anomaly_points['Close'],
        mode='markers',
        name='Anomalies',
        marker={'color': 'red', 'size': 8, 'symbol': 'x'},
    )
)

fig.update_layout(
    title=f'Prix et Moyennes Mobiles - {symbol} (avec anomalies)',
    xaxis_title='Date',
    yaxis_title='Prix',
)
fig.show()

In [None]:
# Save the prepared data, one CSV per symbol
from pathlib import Path

output_dir = Path('data/processed')
# to_csv does not create missing parent directories, so make sure the
# target folder exists before the first write (no-op when already present).
output_dir.mkdir(parents=True, exist_ok=True)

for symbol, df in ml_data.items():
    df.to_csv(output_dir / f'{symbol}_ml_data.csv', index=False)
    anomaly_count = df['Is_Anomaly'].sum()
    print(f"Donn√©es ML sauvegard√©es pour {symbol} (Anomalies: {anomaly_count})")

## Conclusion

Ce notebook a exploré les données financières, détecté des anomalies en utilisant plusieurs méthodes (Z-score, IQR) et préparé les données pour l'entraînement des modèles ML. Les prochaines étapes sont:

1. Entraîner les modèles de prédiction de prix (LSTM) avec détection d'anomalies
2. Entraîner les modèles d'analyse de sentiment (FinBERT)
3. Créer un modèle hybride qui combine les signaux de prix et de sentiment
4. Tester les modèles et évaluer leurs performances dans la détection d'anomalies