# Notebook 3 - Machine Learning Fraud Detection
**Objectif :** Entraîner le GBTClassifier, mieux adapté aux données déséquilibrées, et utiliser l'AUC-PR pour une évaluation pertinente.

In [2]:
# üîπ √âtape 1 : Initialisation Spark et Imports
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier # Changement : GBTClassifier au lieu de LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Build the notebook's Spark session (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Fraud-ML")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/05 01:43:49 WARN Utils: Your hostname, TUF-GAMING-FX504GD, resolves to a loopback address: 127.0.1.1; using 192.168.1.145 instead (on interface wlo1)
26/01/05 01:43:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/05 01:43:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Step 2: load the dataset and split it into train and test sets.
print("--- Chargement et Split du CSV original ---")

# Read the raw CSV from HDFS: the first row is treated as the header and the
# column types are inferred by Spark.
full_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs:///user/hadoop/BigDataFraude_ML-GraphX/creditcard.csv")
)

# The split is done here (instead of loading pre-split train/test Parquet files).
# 70% train / 30% test; the fixed seed is there for reproducibility.
train_df, test_df = full_df.randomSplit(weights=[0.7, 0.3], seed=42)

--- Chargement et Split du CSV original ---


                                                                                

In [4]:
# Step 3: assemble the input columns into a single 'features' vector column.

# Re-execution guard: if a DataFrame already carries a 'features' column, drop it
# BEFORE deriving the input column list. Doing it afterwards would leave 'features'
# listed as an assembler input even though the column has just been removed.
if "features" in train_df.columns:
    print("La colonne 'features' existe d√©j√† dans train_df. Suppression...")
    train_df = train_df.drop("features")
if "features" in test_df.columns:
    print("La colonne 'features' existe d√©j√† dans test_df. Suppression...")
    test_df = test_df.drop("features")

# Every remaining column except the target ("Class") and "Time" is a model input.
feature_cols = [c for c in train_df.columns if c not in ["Class", "Time"]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only what the classifier needs: the assembled vector and the target,
# with "Class" renamed to "label".
train_ml = assembler.transform(train_df).select("features", col("Class").alias("label"))
test_ml = assembler.transform(test_df).select("features", col("Class").alias("label"))

print("Assemblage des features effectu√© avec succ√®s.")

Assemblage des features effectu√© avec succ√®s.


In [5]:
# Step 4: configure the gradient-boosted trees classifier and fit it on the training set.
print("--- Entra√Ænement du GBTClassifier ---")

gbt = GBTClassifier(
    maxIter=10,
    labelCol="label",
    featuresCol="features",
)
gbt_model = gbt.fit(train_ml)

--- Entra√Ænement du GBTClassifier ---


26/01/05 01:43:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [6]:
# Step 5: score the held-out test set with the fitted model.
predictions = gbt_model.transform(test_ml)

# Show the first five scored rows without truncating the column contents.
print("--- Aper√ßu des pr√©dictions ---")
(
    predictions
    .select("features", "label", "prediction", "probability")
    .show(n=5, truncate=False)
)

--- Aper√ßu des pr√©dictions ---
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------+----------------------------------------+
|features                                                                                                                                                                                                                                                                                                                                                                                 

In [7]:
# Step 6: evaluate the predictions with two threshold-free metrics.
print("--- √âvaluation des M√©triques ---")

# Area under the ROC curve.
evaluator_roc = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
roc_auc = evaluator_roc.evaluate(predictions)
print(f"ROC AUC: {roc_auc:.4f}")

# Area under the precision-recall curve (the notebook treats this as the more
# relevant metric for the class imbalance).
evaluator_pr = BinaryClassificationEvaluator(
    metricName="areaUnderPR",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
pr_auc = evaluator_pr.evaluate(predictions)
print(f"PR AUC: {pr_auc:.4f}")

--- √âvaluation des M√©triques ---


                                                                                

ROC AUC: 0.9414


[Stage 118:====>                                                  (1 + 11) / 12]

PR AUC: 0.7156


                                                                                

In [14]:
# Save cell: bundle the feature assembler and the trained classifier, then persist the bundle.
import time

from pyspark.ml import PipelineModel

# Both stages are already transformers (the assembler and the fitted gbt_model),
# so they are wrapped in a PipelineModel directly. Going through Pipeline(...).fit()
# would not train anything here and would only add an unnecessary dependency on train_df.
pipeline_model = PipelineModel(stages=[assembler, gbt_model])

# A timestamped directory gives each run its own path, which avoids needing .overwrite().
# NOTE(review): the timestamp has one-second resolution, so two saves within the
# same second would target the same path.
timestamp = int(time.time())
# NOTE(review): this is a local file:// path while the input data is read from HDFS;
# confirm the streaming job runs on a machine where this path is visible.
MODEL_SAVE_PATH = f"file:///tmp/spark_models/fraude_gbt_final_{timestamp}"

print(f"--- Sauvegarde du mod√®le LOCALE dans : {MODEL_SAVE_PATH} ---")

try:
    pipeline_model.save(MODEL_SAVE_PATH)
    print(" Sauvegarde r√©ussie (Chemin Garanti Unique).")
except Exception as e:
    # Surface the failure in the cell output, then re-raise the same exception
    # (bare raise keeps the original traceback intact).
    print(f"√âCHEC CRITIQUE : {e}")
    raise

# Path to hand over to the streaming step.
FINAL_MODEL_PATH = MODEL_SAVE_PATH
print(f"\n Le chemin FINAL √† utiliser dans spark_streaming.py est : {FINAL_MODEL_PATH}")

--- Sauvegarde du mod√®le LOCALE dans : file:///tmp/spark_models/fraude_gbt_final_1767574836 ---
 Sauvegarde r√©ussie (Chemin Garanti Unique).

 Le chemin FINAL √† utiliser dans spark_streaming.py est : file:///tmp/spark_models/fraude_gbt_final_1767574836
