In [1]:
# Imports and initialization
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
import numpy as np
import pandas as pd

# Create the SparkSession (getOrCreate() reuses an already-running session).
spark = (
    SparkSession.builder
    .appName("Fraud-Data-Cleaning")
    .getOrCreate()
)

# Bare expression: the notebook displays the session as this cell's result.
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/05 00:30:45 WARN Utils: Your hostname, TUF-GAMING-FX504GD, resolves to a loopback address: 127.0.1.1; using 192.168.1.145 instead (on interface wlo1)
26/01/05 00:30:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/05 00:30:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [74]:
# Load the dataset from HDFS.
#
# The schema is declared explicitly instead of using inferSchema=True:
# inference makes Spark read the whole CSV one extra time just to guess the
# column types, and the guessed types can change silently if the file does.
# The declared types are the ones Spark inferred for this file:
# Time, V1..V28 and Amount are doubles, Class is an int.
creditcard_schema = ", ".join(
    ["Time DOUBLE"]
    + [f"V{i} DOUBLE" for i in range(1, 29)]
    + ["Amount DOUBLE", "Class INT"]
)

df = spark.read.csv(
    "hdfs:///user/hadoop/BigDataFraude_ML-GraphX/creditcard.csv",
    header=True,
    # DDL-formatted schema string; columns are matched to the file by position.
    schema=creditcard_schema,
)

# cache() returns the DataFrame itself, so it is also the cell's displayed result.
df.cache()

26/01/04 22:54:47 WARN CacheManager: Asked to cache already cached data.        


DataFrame[Time: double, V1: double, V2: double, V3: double, V4: double, V5: double, V6: double, V7: double, V8: double, V9: double, V10: double, V11: double, V12: double, V13: double, V14: double, V15: double, V16: double, V17: double, V18: double, V19: double, V20: double, V21: double, V22: double, V23: double, V24: double, V25: double, V26: double, V27: double, V28: double, Amount: double, Class: int]

In [75]:
# Dataset overview: column types, the first five rows, then the total row count.
df.printSchema()
df.show(n=5)

# Last expression of the cell, so the count is displayed as its result.
df.count()

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

284807

In [76]:
# Null-value check: one aggregate per column.
# when(...) without otherwise() yields NULL where the condition is false and
# count() ignores NULLs, so each aggregate is the number of null cells in
# that column.
df.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show(truncate=False)

+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|Time|V1 |V2 |V3 |V4 |V5 |V6 |V7 |V8 |V9 |V10|V11|V12|V13|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|V26|V27|V28|Amount|Class|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0     |0    |
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+



In [77]:
# Class distribution: number of rows for each value of the "Class" column.
(
    df.groupBy("Class")
    .count()
    .show()
)

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [78]:
# Descriptive statistics (count, mean, stddev, min, max) for the "Amount" column.
df.describe("Amount").show()

+-------+------------------+
|summary|            Amount|
+-------+------------------+
|  count|            284807|
|   mean| 88.34961925093077|
| stddev|250.12010924018867|
|    min|               0.0|
|    max|          25691.16|
+-------+------------------+



In [79]:
# Feature selection: keep every column except "Class", preserving column order.
feature_cols = list(filter(lambda name: name != "Class", df.columns))

# Display the resulting list as the cell's result.
feature_cols

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount']

In [80]:
# Vectorization with Spark ML: pack all feature columns into one vector column.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the assembled vector and the "Class" column, renamed to "label".
df_ml = (
    assembler
    .transform(df)
    .select("features", col("Class").alias("label"))
)
df_ml.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,-1.359807133...|    0|
|[0.0,1.1918571113...|    0|
|[1.0,-1.358354061...|    0|
|[1.0,-0.966271711...|    0|
|[2.0,-1.158233093...|    0|
+--------------------+-----+
only showing top 5 rows


In [81]:
# Train / test split with 0.8 / 0.2 weights; seed=42 fixes the random sampling.
train_df, test_df = df_ml.randomSplit(weights=[0.8, 0.2], seed=42)

# Each count() is a Spark action, so these two lines launch one job per split.
print("Train size:", train_df.count())
print("Test size:", test_df.count())

Train size: 228225
Test size: 56582


In [82]:
# Save both splits to HDFS as Parquet.
# mode="overwrite" replaces any data already present at the target path.
train_df.write.parquet(
    "hdfs:///user/hadoop/BigDataFraude_ML-GraphX/train",
    mode="overwrite",
)

test_df.write.parquet(
    "hdfs:///user/hadoop/BigDataFraude_ML-GraphX/test",
    mode="overwrite",
)

26/01/04 22:54:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
26/01/04 22:54:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
26/01/04 22:54:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
26/01/04 22:54:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
26/01/04 22:54:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
26/01/04 22:54:50 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
26/01/04 22:54:50 WARN MemoryManager: Total allocation exceeds 95.

In [83]:
# Quick check: read the saved train split back from HDFS and count its rows.
# Only the train split is read back here; the test split is not checked.
(
    spark.read
    .parquet("hdfs:///user/hadoop/BigDataFraude_ML-GraphX/train")
    .count()
)

228225