# Notebook 3 - Machine Learning Fraud Detection
Étape par étape avec PySpark ML

In [ ]:
# 🔹 Step 1: Spark initialization
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Obtain the Spark session used by every later cell. getOrCreate() returns
# the already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Fraud-ML")
    .getOrCreate()
)

In [ ]:
# 🔹 Step 2: Load the train and test datasets
# Both splits are read as Parquet from HDFS, train first and test second.
train_df, test_df = (
    spark.read.parquet(path)
    for path in (
        "hdfs:///user/hadoop/BigDataFraude_ML-GraphX/train",
        "hdfs:///user/hadoop/BigDataFraude_ML-GraphX/test",
    )
)

In [ ]:
# 🔹 Step 3: Prepare the features
# Every column of the training set except the "Class" target is a model input.
feature_cols = list(filter(lambda name: name != "Class", train_df.columns))

# A single assembler packs those columns into one "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Apply the same assembler to both splits and keep only the assembled vector
# plus the target, exposed under the name "label".
train_ml, test_ml = (
    assembler.transform(split_df).select("features", col("Class").alias("label"))
    for split_df in (train_df, test_df)
)

In [ ]:
# 🔹 Step 4: Create and train the model
# Logistic regression over the assembled vector, capped at 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol="label",
    featuresCol="features",
)

# Fit on the prepared training split only; the test split is kept for scoring.
lr_model = lr.fit(train_ml)

In [ ]:
# 🔹 Step 5: Predictions on the test set
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_ml)

# Print the first 5 rows of the inputs next to the model outputs.
predictions.select(["features", "label", "prediction", "probability"]).show(n=5)

In [ ]:
# 🔹 Step 6: Evaluate the model
# Area under the ROC curve, computed from the "rawPrediction" column against
# the "label" column of the scored test set.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
roc_auc = evaluator.evaluate(predictions)
print("ROC AUC:", roc_auc)