In [2]:
!pip install graphframes


Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m154.7/154.7 kB[0m [31m374.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [3]:
# Cellule 1: Initialisation Spark et Chargement
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as spark_sum, coalesce
from pyspark.sql.window import Window
from graphframes import GraphFrame

# Configuration Spark (pour l'environnement HDFS et GraphFrames)
spark = SparkSession.builder \
    .appName("Fraud-GraphX-Analysis-Final") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
    .getOrCreate()

# Chemin de base HDFS
hdfs_base_path = "hdfs://namenode:8020/fraud_data"

# Chargement du jeu de donn√©es original (comme confirm√© pr√©c√©demment)
print("--- Chargement des donn√©es originales ---")
try:
    df = spark.read.csv(
        f"{hdfs_base_path}/creditcard.csv", 
        header=True, 
        inferSchema=True
    )
    df.cache()
    print(f"Nombre total de transactions : {df.count()}")
    df.printSchema()
except Exception as e:
    print(f"ERREUR DE CHARGEMENT : {e}")
    # Ajoutez ici un chemin de secours si le HDFS n'est pas accessible

--- Chargement des donn√©es originales ---
Nombre total de transactions : 284807
root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: do

In [4]:
# Cellule 2/8 : Cr√©ation des Vertices (N≈ìuds)
print("--- Cr√©ation des Vertices (Clients V1) ---")

# Les N≈ìuds sont les identifiants V1 uniques
vertices_final = df.select(
    col("V1").alias("id"),  
    col("Class").alias("label"),
    col("Amount")
).distinct()

# La colonne 'label' doit √™tre num√©rique (0 ou 1) pour les calculs.
vertices_final = vertices_final.withColumn("label", col("label").cast("int"))

# Le PageRank initial (sera r√©initialis√© plus tard)
vertices_final = vertices_final.withColumn("pagerank", lit(1.0))
vertices_final.cache()

print(f"Nombre de n≈ìuds uniques (V1) : {vertices_final.count()}")
vertices_final.show(5)

--- Cr√©ation des Vertices (Clients V1) ---
Nombre de n≈ìuds uniques (V1) : 275663
+----------------+-----+------+--------+
|              id|label|Amount|pagerank|
+----------------+-----+------+--------+
|-1.2768303373631|    0| 110.4|     1.0|
|1.21205680491093|    0|  2.28|     1.0|
|1.08102680841932|    0| 17.24|     1.0|
|1.49157444507907|    0|   2.0|     1.0|
|1.09337038677875|    0|  49.9|     1.0|
+----------------+-----+------+--------+
only showing top 5 rows



In [5]:
# Cellule 3/8 : Cr√©ation des Edges (Liens Frauduleux Co-occurrents)
print("--- Cr√©ation des Edges (Liens Frauduleux Co-occurrents) ---")

edges = df.alias("t1").join(
    df.alias("t2"),
    (
        # 1. Les deux sont des fraudes
        (col("t1.Class") == 1) & (col("t2.Class") == 1) &
        
        # 2. Elles se produisent dans la m√™me heure
        ((col("t1.Time") / 3600).cast("int") == (col("t2.Time") / 3600).cast("int")) &
        
        # 3. Ce sont des entit√©s/clients diff√©rents (src != dst)
        (col("t1.V1") != col("t2.V1"))
    )
).select(
    col("t1.V1").alias("src"),
    col("t2.V1").alias("dst")
).distinct()

edges.cache()
print(f"Nombre total d'ar√™tes (liens de fraude) : {edges.count()}")
edges.show(5)

--- Cr√©ation des Edges (Liens Frauduleux Co-occurrents) ---
Nombre total d'ar√™tes (liens de fraude) : 7016
+-----------------+-----------------+
|              src|              dst|
+-----------------+-----------------+
|0.314596589729515|-1.58550536691994|
|-4.72771265581559|-2.58961719821269|
|-16.5986647432584|-25.2663550194138|
|-19.8563223334433|  -27.84818067198|
|-2.78724793061533|  -27.84818067198|
+-----------------+-----------------+
only showing top 5 rows



In [6]:
# Cellule 4/8 : Cr√©ation du GraphFrame et Analyse Structurale

# 1. Conversion des IDs en String (Obligatoire pour GraphFrames)
print("--- V√©rification et Ajustement du Sch√©ma pour GraphFrame ---")
vertices = vertices_final.withColumn("id", col("id").cast("string"))
edges = edges.withColumn("src", col("src").cast("string"))\
             .withColumn("dst", col("dst").cast("string"))

# 2. Cr√©ation du GraphFrame
g = GraphFrame(vertices, edges)
print("\n‚úÖ GraphFrame cr√©√© avec succ√®s.")

# 3. Afficher les 5 n≈ìuds les plus connect√©s (Degr√©)
try:
    print("\nTop 5 des N≈ìuds (V1) les plus connect√©s (Degr√©) :")
    g.degrees.orderBy(col("degree").desc()).show(5)
except Exception as e:
    print(f"Avertissement: Le calcul des degr√©s a √©chou√© ({e}). Le GraphFrame n'est peut-√™tre pas enti√®rement fonctionnel.")

--- V√©rification et Ajustement du Sch√©ma pour GraphFrame ---

‚úÖ GraphFrame cr√©√© avec succ√®s.

Top 5 des N≈ìuds (V1) les plus connect√©s (Degr√©) :




+-----------------+------+
|               id|degree|
+-----------------+------+
|-10.6457996485752|    84|
|-5.31417320646342|    84|
|-16.5265065691231|    84|
|-2.14441147422114|    84|
|-3.14025953779538|    84|
+-----------------+------+
only showing top 5 rows



In [7]:
# Cellule 5/8 : Algorithme des Composantes Connexes (CC)

# D√©finir le r√©pertoire de checkpoint (obligatoire pour CC)
try:
    spark.sparkContext.setCheckpointDir("/tmp/gftest_checkpoints")
    print("R√©pertoire de checkpoint d√©fini.")
except Exception as e:
    print(f"Avertissement: Impossible de d√©finir le checkpoint ({e}). L'algorithme CC peut √©chouer.")
    
print("\n--- Calcul des Composantes Connexes ---")

cc_results = g.connectedComponents().persist()

# Analyse des R√©sultats
component_sizes = cc_results.groupBy("component").count().withColumnRenamed("count", "componentSize")
cc_analysis = cc_results.join(component_sizes, "component")

# Afficher les statistiques du plus grand groupe
top_component = cc_analysis.orderBy(col("componentSize").desc()).limit(1).collect()[0]
top_component_id = top_component["component"]
total_size = top_component["componentSize"]

print(f"\nStatistiques pour la Plus Grande Composante (ID: {top_component_id}):")
print(f"- Taille Totale : {total_size}")
print(f"- Composition (Label 1) : {cc_analysis.filter(col('component') == top_component_id).filter(col('label') == 1).count()} (Puret√© : {cc_analysis.filter(col('component') == top_component_id).filter(col('label') == 1).count() / total_size * 100:.2f}%)")


R√©pertoire de checkpoint d√©fini.

--- Calcul des Composantes Connexes ---

Statistiques pour la Plus Grande Composante (ID: 20584):
- Taille Totale : 43
- Composition (Label 1) : 43 (Puret√© : 100.00%)


In [8]:
# Cellule 6/8 : Pr√©paration des Poids PageRank (Normalisation)

print("--- Initialisation des Poids PageRank (Normalisation des Ar√™tes) ---")

# 1. Joindre 'edges' avec 'df' pour r√©cup√©rer 'Amount'
edges_with_amount = edges.alias("e").join(
    df.select(col("V1").cast("string").alias("src_v1"), col("Amount")).distinct(), 
    col("e.src") == col("src_v1"),
    "left"
).select(col("e.src"), col("e.dst"), col("Amount")).distinct()

# 2. Calculer la somme totale des montants sortants par source
sum_weights = edges_with_amount.groupBy("src").agg(spark_sum("Amount").alias("total_outgoing_amount"))

# 3. Normalisation (Weight = Amount / Total Outgoing)
edges_with_sum = edges_with_amount.join(sum_weights, "src")
normalized_edges = edges_with_sum.withColumn(
    "weight", col("Amount") / col("total_outgoing_amount")
).select(col("src"), col("dst"), col("weight"))

# 4. Finaliser l'initialisation du PageRank (1/N)
N_vertices = vertices.count()
# Utiliser le DataFrame 'vertices' ajust√© des √©tapes pr√©c√©dentes
vertices = vertices.withColumn("pagerank", lit(1.0 / N_vertices)).cache()

print(f"Le DataFrame 'normalized_edges' est pr√™t avec {normalized_edges.count()} ar√™tes.")
print(f"PageRank initial (1/N) : {1.0 / N_vertices:.8f}")

--- Initialisation des Poids PageRank (Normalisation des Ar√™tes) ---
Le DataFrame 'normalized_edges' est pr√™t avec 7016 ar√™tes.
PageRank initial (1/N) : 0.00000363


In [9]:
# üîπ Cellule 7/8 : Algorithme PageRank (Simulation Native PySpark)

from pyspark.sql.functions import sum, lit, col, coalesce # S'assurer que les fonctions sont bien import√©es

MAX_ITER = 5
RESET_PROBABILITY = 0.15 

print(f"\n--- PageRank Simul√© ({MAX_ITER} it√©rations) ---")

# 'vertices' contient id, label, Amount, pagerank initial. On commence avec id, label, pagerank
current_vertices = vertices.select("id", "label", "pagerank").cache()

for i in range(MAX_ITER):
    # 1. Calcul du Score de contribution (pr * weight)
    contributions = current_vertices.join(
        normalized_edges, current_vertices.id == normalized_edges.src
    ).withColumn(
        "contribution", col("pagerank") * col("weight")
    ).select(col("dst").alias("id"), "contribution")
    
    # 2. Agr√©gation des contributions
    new_pageranks = contributions.groupBy("id").agg(sum("contribution").alias("sum_contribution"))
    
    # 3. Application de la formule PageRank
    current_vertices = current_vertices.drop("pagerank")\
                                       .join(new_pageranks, "id", "left_outer")\
                                       .withColumn(
                                           "sum_contribution_clean", 
                                           coalesce(col("sum_contribution"), lit(0))
                                       )
    
    N = float(current_vertices.count())
    
    # PR(new) = (1 - alpha) * PR(contribution) + alpha / N
    current_vertices = current_vertices.withColumn(
        "pagerank", 
        lit(1.0 - RESET_PROBABILITY) * col("sum_contribution_clean") + lit(RESET_PROBABILITY / N)
    ).select("id", "label", "pagerank").cache()
    
    print(f"It√©ration {i+1} compl√©t√©e.")
    
# R√©int√©grer la colonne Amount pour l'affichage final, en s√©lectionnant et renommant les colonnes clairement
# *********************************** CORRECTION ICI *********************************
results = current_vertices.join(
    vertices.select(
        "id", 
        "Amount", 
        col("label").alias("original_label") # Renommer pour √©viter l'ambigu√Øt√©
    ), 
    "id", 
    "left"
).drop("label").withColumnRenamed("original_label", "label") # Supprimer le label ambigu et renommer le bon

results = results.orderBy(col("pagerank").desc())
print("\nPageRank Simulation termin√©e. R√©sultats pr√™ts pour l'affichage.")


--- PageRank Simul√© (5 it√©rations) ---
It√©ration 1 compl√©t√©e.
It√©ration 2 compl√©t√©e.
It√©ration 3 compl√©t√©e.
It√©ration 4 compl√©t√©e.
It√©ration 5 compl√©t√©e.

PageRank Simulation termin√©e. R√©sultats pr√™ts pour l'affichage.


In [12]:
# Cellule 8/8 : Affichage et Interpr√©tation des R√©sultats Finaux

# Renommage de la colonne pour la clart√©
final_results = results.withColumnRenamed("pagerank", "PR_Score")

# 1. Top 10 des Hubs Potentiels
print("\n--- Top 10 des Clients/Cartes (V1) ayant le PageRank le plus √©lev√© (Hubs potentiels) ---")
final_results.select("id", "label", "Amount", "PR_Score").show(10, truncate=False)

# 2. Cas potentiels de Mules (transactions l√©gitimes avec un PageRank √©lev√©)
print("\n--- Top 5 des Transactions L√©gitimes (Label 0) avec PageRank √©lev√© (Mules potentielles) ---")
final_results.filter(col("label") == 0) \
    .orderBy(col("PR_Score").desc()) \
    .show(5, truncate=False)

# 3. Conclusion des r√©sultats
print("\n--- Interpr√©tation Finale ---")
print(f"CC : {cc_analysis.select('componentSize').distinct().count()} groupes de fraude d√©tect√©s.")
print("PageRank : Les scores PageRank permettent de classer le risque au sein du groupe principal.")
print("La combinaison CC + PageRank fournit une vision compl√®te de l'organisation de la fraude.")


--- Top 10 des Clients/Cartes (V1) ayant le PageRank le plus √©lev√© (Hubs potentiels) ---
+-----------------+-----+------+--------------------+
|id               |label|Amount|PR_Score            |
+-----------------+-----+------+--------------------+
|-7.50392623748137|1    |12.31 |3.627617779680262E-6|
|-2.4340041522657 |1    |362.55|3.627617779680262E-6|
|-2.98646550822273|1    |1.79  |3.627617779680262E-6|
|-17.5375916846763|1    |9.82  |3.627617779680262E-6|
|1.22761441351286 |1    |98.01 |3.627617779680262E-6|
|1.37855899734127 |1    |0.76  |3.627617779680262E-6|
|0.432554461820961|1    |1.0   |3.627617779680262E-6|
|-2.78386548658584|1    |156.0 |3.627617779680262E-6|
|0.467991939825149|1    |120.54|3.627617779680262E-6|
|1.24384844934819 |1    |1.0   |3.627617779680262E-6|
+-----------------+-----+------+--------------------+
only showing top 10 rows


--- Top 5 des Transactions L√©gitimes (Label 0) avec PageRank √©lev√© (Mules potentielles) ---
+------------------+----------