# 1. Entrainement + Score / Matrice de confusion 

In [23]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import os

# Répertoire pour sauvegarder les modèles ONNX
onnx_dir = os.path.dirname(os.getcwd()) + "/models"
os.makedirs(onnx_dir, exist_ok=True)

# Sauvegarder les modèles ONNX
def save_model_onnx(name, model, path, initial_type):
    # initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
    onnx_model = convert_sklearn(model, initial_types=initial_type)
    onnx_path = os.path.join(path, f"{name}.onnx")
    with open(onnx_path, "wb") as f:
        f.write(onnx_model.SerializeToString())
    print(f"Model {name} saved to {onnx_path}")

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from skl2onnx.common.data_types import FloatTensorType
import os 

# ------------------------------------------------------------------
# 1. Paramètres généraux
# ------------------------------------------------------------------
TICKERS = ["AAPL", "MSFT", "GOOGL"]
LOOK_AHEAD = 5          # horizon de prédiction (jours)
TRAIN_RATIO = 0.80      # fraction chronologique pour le train-set
SEED = 42

# Préparer les répertoires de sauvegarde
prepared_data_path  = os.path.dirname(os.getcwd()) + "/data/modelisation/prepared_data"
results_path        = os.path.dirname(os.getcwd()) + "/data/modelisation/results"
training_data_path  = os.path.dirname(os.getcwd()) + "/data/modelisation/training_data"
os.makedirs(prepared_data_path, exist_ok=True)
os.makedirs(results_path, exist_ok=True)
os.makedirs(training_data_path, exist_ok=True)

# ------------------------------------------------------------------
# 2. Fonction utilitaire : features + label
# ------------------------------------------------------------------
def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # Label : prix dans 5 jours supérieur au prix courant ?
    df["future_close"] = df["Close"].shift(-LOOK_AHEAD)
    df["target"] = (df["future_close"] > df["Close"]).astype(int)

    # Features (exemples simples, extensibles) ────────────────
    df["ret1"]  = np.log(df["Close"] / df["Close"].shift(1))
    df["ret3"]  = np.log(df["Close"] / df["Close"].shift(3))
    df["sma5"]  = df["Close"].rolling(5).mean() / df["Close"] - 1
    df["vol5"]  = df["ret1"].rolling(5).std()

    df = df.dropna().reset_index(drop=True)  # retirer NaN
    return df[["Date", "target", "ret1", "ret3", "sma5", "vol5"]]

# ------------------------------------------------------------------
# 3. Modèles à tester
#    (la logistique bénéficie d’une standardisation automatique)
# ------------------------------------------------------------------
MODELS = {
    "Logistic": make_pipeline(StandardScaler(),
                              LogisticRegression(random_state=SEED)),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "GradientBoosting": GradientBoostingClassifier(random_state=SEED),
}


# ------------------------------------------------------------------
# 4. Boucle principale par ticker
# ------------------------------------------------------------------
results_df = pd.DataFrame(columns=["ticker", "model", "accuracy", "precision", "recall", "f1_score"])
for tic in TICKERS:
    print(f"\n========================  {tic}  ========================\n")
    # 4-a. Télécharger et préparer
    price = yf.download(tic, start="2015-01-01", progress=False)
    price = price.reset_index()                # yfinance retourne DatetimeIndex
    price = price.droplevel(1, axis=1)  # supprimer le multi-index
    data  = prepare_data(price)

    # Save data to CSV for later use
    data.to_csv(f"{prepared_data_path}/{tic}_prepared.csv", index=False)
    X = data[["ret1", "ret3", "sma5", "vol5"]]
    y = data["target"].values
    
    # 4-b. Split chronologique
    cut = int(len(data) * TRAIN_RATIO)
    X_train, X_test = X.iloc[:cut], X.iloc[cut:]
    y_train, y_test = y[:cut], y[cut:]

    # Saving train and test data
    X_train.to_csv(f"{training_data_path}/{tic}_X_train.csv", index=False)
    X_test.to_csv(f"{training_data_path}/{tic}_X_test.csv", index=False)
    y_train = pd.DataFrame(y_train, columns=["target"])
    y_test = pd.DataFrame(y_test, columns=["target"])

    # 4-c. Entraînement + évaluation pour chaque modèle
    for name, model in MODELS.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()
        acc  = (tp + tn) / cm.sum()
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec  = tp / (tp + fn) if (tp + fn) else 0
        f1   = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

        print(f"--- {name} ---")
        print("Matrice de confusion\n", cm)
        print(f"Accuracy  : {acc:.3f}")
        print(f"Precision : {prec:.3f} | Recall : {rec:.3f} | F1 : {f1:.3f}\n")
        
        # Save results to DataFrame
        results_df = pd.concat([results_df, pd.DataFrame({
            "ticker": tic,
            "model": name,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        }, index=[0])], ignore_index=True)

        # Save model
        initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
        save_model_onnx(name, model, onnx_dir, initial_type)

# Save results to CSV
results_df.to_csv(f"{results_path}/results.csv", index=False)





  y = column_or_1d(y, warn=True)
  results_df = pd.concat([results_df, pd.DataFrame({
  return fit_method(estimator, *args, **kwargs)


--- Logistic ---
Matrice de confusion
 [[  4 224]
 [  5 285]]
Accuracy  : 0.558
Precision : 0.560 | Recall : 0.983 | F1 : 0.713

Model Logistic saved to /Users/davidzhu/Local/gestion-quantitative/models/Logistic.onnx
--- DecisionTree ---
Matrice de confusion
 [[ 41 187]
 [ 45 245]]
Accuracy  : 0.552
Precision : 0.567 | Recall : 0.845 | F1 : 0.679

Model DecisionTree saved to /Users/davidzhu/Local/gestion-quantitative/models/DecisionTree.onnx
--- RandomForest ---
Matrice de confusion
 [[  8 220]
 [ 16 274]]
Accuracy  : 0.544
Precision : 0.555 | Recall : 0.945 | F1 : 0.699

Model RandomForest saved to /Users/davidzhu/Local/gestion-quantitative/models/RandomForest.onnx


  y = column_or_1d(y, warn=True)


--- GradientBoosting ---
Matrice de confusion
 [[ 25 203]
 [ 38 252]]
Accuracy  : 0.535
Precision : 0.554 | Recall : 0.869 | F1 : 0.677

Model GradientBoosting saved to /Users/davidzhu/Local/gestion-quantitative/models/GradientBoosting.onnx




  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


--- Logistic ---
Matrice de confusion
 [[  1 223]
 [  2 292]]
Accuracy  : 0.566
Precision : 0.567 | Recall : 0.993 | F1 : 0.722

Model Logistic saved to /Users/davidzhu/Local/gestion-quantitative/models/Logistic.onnx
--- DecisionTree ---
Matrice de confusion
 [[ 37 187]
 [ 45 249]]
Accuracy  : 0.552
Precision : 0.571 | Recall : 0.847 | F1 : 0.682

Model DecisionTree saved to /Users/davidzhu/Local/gestion-quantitative/models/DecisionTree.onnx
--- RandomForest ---
Matrice de confusion
 [[  9 215]
 [ 20 274]]
Accuracy  : 0.546
Precision : 0.560 | Recall : 0.932 | F1 : 0.700

Model RandomForest saved to /Users/davidzhu/Local/gestion-quantitative/models/RandomForest.onnx


  y = column_or_1d(y, warn=True)


--- GradientBoosting ---
Matrice de confusion
 [[ 29 195]
 [ 43 251]]
Accuracy  : 0.541
Precision : 0.563 | Recall : 0.854 | F1 : 0.678

Model GradientBoosting saved to /Users/davidzhu/Local/gestion-quantitative/models/GradientBoosting.onnx




  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


--- Logistic ---
Matrice de confusion
 [[ 13 218]
 [  4 283]]
Accuracy  : 0.571
Precision : 0.565 | Recall : 0.986 | F1 : 0.718

Model Logistic saved to /Users/davidzhu/Local/gestion-quantitative/models/Logistic.onnx
--- DecisionTree ---
Matrice de confusion
 [[ 50 181]
 [ 64 223]]
Accuracy  : 0.527
Precision : 0.552 | Recall : 0.777 | F1 : 0.645

Model DecisionTree saved to /Users/davidzhu/Local/gestion-quantitative/models/DecisionTree.onnx
--- RandomForest ---
Matrice de confusion
 [[ 31 200]
 [ 28 259]]
Accuracy  : 0.560
Precision : 0.564 | Recall : 0.902 | F1 : 0.694

Model RandomForest saved to /Users/davidzhu/Local/gestion-quantitative/models/RandomForest.onnx


  y = column_or_1d(y, warn=True)


--- GradientBoosting ---
Matrice de confusion
 [[ 48 183]
 [ 53 234]]
Accuracy  : 0.544
Precision : 0.561 | Recall : 0.815 | F1 : 0.665

Model GradientBoosting saved to /Users/davidzhu/Local/gestion-quantitative/models/GradientBoosting.onnx


# 2. Loading du modèle + inférence

In [None]:
import onnxruntime as ort
import numpy as np

def predict_onnx(model_path, X_test):
    # Load the ONNX model
    onnx_session = ort.InferenceSession(model_path)

    # Prepare input data for inference
    input_name = onnx_session.get_inputs()[0].name
    output_name = onnx_session.get_outputs()[0].name

    # Perform inference
    predictions = onnx_session.run([output_name], {input_name: X_test})

    return predictions[0]

X_test_subset = X_test.to_numpy().astype(np.float32)[0:5]
display(predict_onnx(onnx_dir + "/Logistic.onnx", X_test_subset))
display(predict_onnx(onnx_dir + "/RandomForest.onnx", X_test_subset))
display(predict_onnx(onnx_dir + "/GradientBoosting.onnx", X_test_subset))
display(predict_onnx(onnx_dir + "/DecisionTree.onnx", X_test_subset))

array([1, 1, 1, 1, 1], dtype=int64)

array([1, 1, 0, 1, 0], dtype=int64)

array([1, 1, 0, 1, 1], dtype=int64)

array([1, 0, 0, 0, 1], dtype=int64)

# 3. Recherche d'hyperparamètres optimaux

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Définir les espaces d'hyperparamètres pour chaque modèle
# ------------------------------------------------------------------
param_grids = {
    "Logistic": {
        "model__penalty": ["l2"],
        "model__C": [0.1, 1, 10],
        "model__solver": ["lbfgs", "liblinear"],
        "model__max_iter": [100, 500],
    },
    "DecisionTree": {
        "model__max_depth": [3, 5, 8],
        "model__min_samples_leaf": [1, 5, 10],
        "model__criterion": ["gini", "entropy"],
    },
    "RandomForest": {
        "model__n_estimators": [200, 500],
        "model__max_depth": [None, 10],
        "model__min_samples_leaf": [1, 3],
        "model__bootstrap": [True, False],
    },
    "GradientBoosting": {
        "model__n_estimators": [100, 300, 500],
        "model__learning_rate": [0.05, 0.1, 0.2],
        "model__max_depth": [2, 3, 4],
        # "model__min_samples_leaf": [1, 3],
        # "model__subsample": [0.8, 1.0],
        # "model__max_features": ["sqrt", "log2"],
    },
}
# ------------------------------------------------------------------
# 2. Fonction utilitaire : créer un pipeline avec standardisation
# ------------------------------------------------------------------
def create_pipeline(model):
    if model == "Logistic":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", LogisticRegression(max_iter=1000, random_state=SEED))
        ])
    elif model == "DecisionTree":
        return Pipeline([
            ("model", DecisionTreeClassifier(random_state=SEED))
        ])
    elif model == "RandomForest":
        return Pipeline([
            ("model", RandomForestClassifier(random_state=SEED))
        ])
    elif model == "GradientBoosting":
        return Pipeline([
            ("model", GradientBoostingClassifier(random_state=SEED))
        ])
    else:
        raise ValueError(f"Unknown model: {model}")
# ------------------------------------------------------------------
# 3. Boucle principale par ticker
# ------------------------------------------------------------------
results_df = pd.DataFrame(columns=["ticker", "model", "accuracy", "precision", "recall", "f1_score"])
for tic in TICKERS:
    print(f"\n========================  {tic}  ========================\n")
    # 3-a. Télécharger et préparer
    price = yf.download(tic, start="2015-01-01", progress=False)
    price = price.reset_index()                # yfinance retourne DatetimeIndex
    price = price.droplevel(1, axis=1)  # supprimer le multi-index
    data  = prepare_data(price)

    # Save data to CSV for later use
    data.to_csv(f"{prepared_data_path}/{tic}_prepared.csv", index=False)
    X = data[["ret1", "ret3", "sma5", "vol5"]]
    y = data["target"].values
    
    # 3-b. Split chronologique
    cut = int(len(data) * TRAIN_RATIO)
    X_train, X_test = X.iloc[:cut], X.iloc[cut:]
    y_train, y_test = y[:cut], y[cut:]

    # Saving train and test data
    X_train.to_csv(f"{training_data_path}/{tic}_X_train.csv", index=False)
    X_test.to_csv(f"{training_data_path}/{tic}_X_test.csv", index=False)
    y_train = pd.DataFrame(y_train, columns=["target"])
    y_test = pd.DataFrame(y_test, columns=["target"])

    # 3-c. Entraînement + évaluation pour chaque modèle
    for name in MODELS.keys():
        print(f"--- {name} ---")
        model_pipeline = create_pipeline(name)

        # GridSearchCV pour l'optimisation des hyperparamètres
        grid_search = GridSearchCV(
            model_pipeline,
            param_grids[name],
            scoring=make_scorer(accuracy_score),
            cv=5,
            n_jobs=-1,
            verbose=1,
        )

        grid_search.fit(X_train, y_train.values.ravel())

        # Conserver uniquement le meilleur modèle
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()
        acc  = (tp + tn) / cm.sum()
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec  = tp / (tp + fn) if (tp + fn) else 0
        f1  = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

        # Save results to DataFrame
        results_df = pd.concat([results_df, pd.DataFrame({
            "ticker": tic,
            "model": name,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        }, index=[0])], ignore_index=True)

        # Save model
        initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
        save_model_onnx(name, best_model, onnx_dir, initial_type)

        # Print les meilleurs hyperparamètres
        print("Best hyperparameters:", grid_search.best_params_)
        print("Best score:", grid_search.best_score_)
        
# Save results to CSV
results_df.to_csv(f"{results_path}/results_optimized.csv", index=False)



--- Logistic ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  results_df = pd.concat([results_df, pd.DataFrame({


Model Logistic saved to /Users/davidzhu/Local/gestion-quantitative/models/Logistic.onnx
--- DecisionTree ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Model DecisionTree saved to /Users/davidzhu/Local/gestion-quantitative/models/DecisionTree.onnx
--- RandomForest ---
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Model RandomForest saved to /Users/davidzhu/Local/gestion-quantitative/models/RandomForest.onnx
--- GradientBoosting ---
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Model GradientBoosting saved to /Users/davidzhu/Local/gestion-quantitative/models/GradientBoosting.onnx


--- Logistic ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Model Logistic saved to /Users/davidzhu/Local/gestion-quantitative/models/Logistic.onnx
--- DecisionTree ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Model DecisionTree saved to /Users/davidzhu/Local/gestion-quantitative/models/DecisionTree.onnx
--- RandomForest ---


In [24]:
# grid search.best_params_