In [0]:
# Mapping from the terse column names stored in the Delta table to the
# descriptive names used by the rest of the notebook. Insertion order is the
# order in which the renames are applied.
column_renames = {
    "E": "event_time",
    "s": "symbol",
    "t_trade": "trade_id",
    "p": "price",
    "q": "quantity",
    "T": "trade_time",
    "m": "buyer_is_maker",
    "M_flag": "ignore_flag",
}

# Load the transactions Delta table, then apply the renames one at a time.
df = spark.read.format("delta").load('/mnt/databricks/kafka/transactions')
for short_name, readable_name in column_renames.items():
    df = df.withColumnRenamed(short_name, readable_name)

df.display()


In [0]:
from pyspark.sql.functions import col, when, lag, unix_timestamp, to_timestamp, hour
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# --- Feature engineering for the anomaly-detection model -------------------

# Cast price and quantity to double so they can be used in arithmetic
# (per the original note, they were stored as strings).
df = (
    df.withColumn("price", col("price").cast("double"))
      .withColumn("quantity", col("quantity").cast("double"))
)

# Total amount of the trade.
df = df.withColumn("amount", col("price") * col("quantity"))

# Convert trade_time to a Spark timestamp. The value is divided by 1000 and
# truncated to whole seconds before the conversion.
df = df.withColumn("trade_time_ts", to_timestamp((col("trade_time") / 1000).cast("long")))

# Hour of the day at which the trade happened.
df = df.withColumn("hour_of_day", hour(col("trade_time_ts")))

# Price difference relative to the previous trade (often 0).
# The window is partitioned by symbol so that a trade is only compared with
# the previous trade of the SAME symbol; without the partition, lag() would
# pick up the price of whichever symbol traded just before and Spark would
# have to move every row into a single partition. trade_id is a tie-breaker
# so that trades sharing the same trade_time are ordered deterministically.
window = Window.partitionBy("symbol").orderBy("trade_time", "trade_id")
df = df.withColumn("prev_price", lag("price").over(window))
df = df.withColumn("price_diff", col("price") - col("prev_price"))

# Turn the boolean column into a numeric 0/1 flag (anything other than True,
# including null, becomes 0).
df = df.withColumn("buyer_is_maker_num", when(col("buyer_is_maker") == True, 1).otherwise(0))

# All the columns fed to the model.
features_cols = [
    "price",
    "quantity",
    "buyer_is_maker_num",
    "amount",
    "hour_of_day",
    "prev_price",
    "price_diff"
]

# Assemble the feature columns into a single vector column.
# handleInvalid="skip" drops rows that contain a null/NaN feature; this
# includes the first trade of each symbol, whose prev_price is null.
assembler = VectorAssembler(inputCols=features_cols, outputCol="features", handleInvalid="skip")

# DataFrame holding only the assembled feature vector.
df_features = assembler.transform(df).select("features")

# Display the result to check that the assembly worked.
df_features.display()

In [0]:
from synapse.ml.isolationforest import IsolationForest
import mlflow
import mlflow.spark
results_path = "/mnt/databricks/delta/anomaly_results"
model_path = "/mnt/databricks/models/my_model"

# Single source of truth for the hyperparameters: the same values are logged
# to MLflow and applied to the estimator, so the two can never drift apart.
iforest_params = {
    "featuresCol": "features",
    "predictionCol": "anomaly_prediction",
    "scoreCol": "anomaly_score",
    "contamination": 0.01,
    "maxSamples": 1.0,
}

with mlflow.start_run():

    # Log the hyperparameters.
    mlflow.log_params(iforest_params)

    # Create and train the Isolation Forest model.
    isolation_forest = (
        IsolationForest()
        .setFeaturesCol(iforest_params["featuresCol"])
        .setPredictionCol(iforest_params["predictionCol"])
        .setScoreCol(iforest_params["scoreCol"])
        .setContamination(iforest_params["contamination"])
        .setMaxSamples(iforest_params["maxSamples"])
    )

    model = isolation_forest.fit(df_features)

    # Apply the model and persist the scored rows.
    result = model.transform(df_features)

    result.write.format("delta").mode("overwrite").save(results_path)

    # Detected anomaly ratio. `result` is lazy, so counting on it would score
    # the whole dataset again for every action; instead, aggregate the rows
    # that were just written, in a single pass.
    saved_results = spark.read.format("delta").load(results_path)
    prediction_counts = [
        (row[iforest_params["predictionCol"]], row["count"])
        for row in saved_results.groupBy(iforest_params["predictionCol"]).count().collect()
    ]
    total_rows = sum(n for _, n in prediction_counts)
    anomaly_rows = sum(n for label, n in prediction_counts if label == 1)
    # Guard against an empty result set instead of raising ZeroDivisionError.
    anomaly_ratio = anomaly_rows / total_rows if total_rows else 0.0
    mlflow.log_metric("anomaly_ratio", anomaly_ratio)

    # Log the model to MLflow.
    mlflow.spark.log_model(model, "isolation_forest_model")



In [0]:
# Persist the fitted model, replacing any model already stored at this path.
model_path = "/mnt/databricks/models/my_model"

model_writer = model.write().overwrite()
model_writer.save(model_path)

In [0]:
# Display the results: read back the Delta table written at results_path and
# print its first rows.
# NOTE: this rebinds `df`, which earlier cells use for the transactions data.

df = spark.read.load(results_path, format="delta")
df.show()