In [1]:
!pip install graphframes
!pip install findspark

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m154.7/154.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [2]:
# Cellule 1: Initialisation Spark et Chargement
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as spark_sum, coalesce
from pyspark.sql.window import Window
from graphframes import GraphFrame

# Configuration Spark (pour l'environnement HDFS et GraphFrames)
spark = SparkSession.builder \
    .appName("Fraud-GraphX-Analysis-Final") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
    .getOrCreate()

# Chemin de base HDFS
hdfs_base_path = "hdfs://namenode:8020/data"

# Chargement du jeu de donn√©es original (comme confirm√© pr√©c√©demment)
print("--- Chargement des donn√©es originales ---")
try:
    df = spark.read.csv(
        f"{hdfs_base_path}/creditcard.csv", 
        header=True, 
        inferSchema=True
    )
    df.cache()
    print(f"Nombre total de transactions : {df.count()}")
    df.printSchema()
except Exception as e:
    print(f"ERREUR DE CHARGEMENT : {e}")
    # Ajoutez ici un chemin de secours si le HDFS n'est pas accessible

--- Chargement des donn√©es originales ---
Nombre total de transactions : 284807
root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: do

In [3]:
# Cellule 2/8 : Cr√©ation des Vertices (N≈ìuds)
print("--- Cr√©ation des Vertices (Clients V1) ---")

# Les N≈ìuds sont les identifiants V1 uniques
vertices_final = df.select(
    col("V1").alias("id"),  
    col("Class").alias("label"),
    col("Amount")
).distinct()

# La colonne 'label' doit √™tre num√©rique (0 ou 1) pour les calculs.
vertices_final = vertices_final.withColumn("label", col("label").cast("int"))

# Le PageRank initial (sera r√©initialis√© plus tard)
vertices_final = vertices_final.withColumn("pagerank", lit(1.0))
vertices_final.cache()

print(f"Nombre de n≈ìuds uniques (V1) : {vertices_final.count()}")
vertices_final.show(5)

--- Cr√©ation des Vertices (Clients V1) ---
Nombre de n≈ìuds uniques (V1) : 275663
+----------------+-----+------+--------+
|              id|label|Amount|pagerank|
+----------------+-----+------+--------+
|-1.2768303373631|    0| 110.4|     1.0|
|1.21205680491093|    0|  2.28|     1.0|
|1.08102680841932|    0| 17.24|     1.0|
|1.49157444507907|    0|   2.0|     1.0|
|1.09337038677875|    0|  49.9|     1.0|
+----------------+-----+------+--------+
only showing top 5 rows



In [4]:
# Cellule 3/8 : Cr√©ation des Edges (Liens Frauduleux Co-occurrents)
print("--- Cr√©ation des Edges (Liens Frauduleux Co-occurrents) ---")

edges = df.alias("t1").join(
    df.alias("t2"),
    (
        # 1. Les deux sont des fraudes
        (col("t1.Class") == 1) & (col("t2.Class") == 1) &
        
        # 2. Elles se produisent dans la m√™me heure
        ((col("t1.Time") / 3600).cast("int") == (col("t2.Time") / 3600).cast("int")) &
        
        # 3. Ce sont des entit√©s/clients diff√©rents (src != dst)
        (col("t1.V1") != col("t2.V1"))
    )
).select(
    col("t1.V1").alias("src"),
    col("t2.V1").alias("dst")
).distinct()

edges.cache()
print(f"Nombre total d'ar√™tes (liens de fraude) : {edges.count()}")
edges.show(5)

--- Cr√©ation des Edges (Liens Frauduleux Co-occurrents) ---
Nombre total d'ar√™tes (liens de fraude) : 7016
+-----------------+-----------------+
|              src|              dst|
+-----------------+-----------------+
|0.314596589729515|-1.58550536691994|
|-4.72771265581559|-2.58961719821269|
|-16.5986647432584|-25.2663550194138|
|-19.8563223334433|  -27.84818067198|
|-2.78724793061533|  -27.84818067198|
+-----------------+-----------------+
only showing top 5 rows



In [5]:
# Cellule 4/8 : Cr√©ation du GraphFrame et Analyse Structurale
from pyspark.sql.functions import col, desc, count as spark_count, avg

print("=" * 70)
print("--- V√©rification et Ajustement du Sch√©ma pour GraphFrame ---")
print("=" * 70)

# 1. Conversion des IDs en String (Obligatoire pour GraphFrames)
print("\nüîÑ Converting IDs to string format...")
vertices = vertices_final.withColumn("id", col("id").cast("string"))
edges = edges.withColumn("src", col("src").cast("string")) \
             .withColumn("dst", col("dst").cast("string"))

# V√©rification
print(f"‚úÖ Vertices: {vertices.count():,} nodes")
print(f"‚úÖ Edges: {edges.count():,} connections")

# 2. Cr√©ation du GraphFrame
print("\nüî® Creating GraphFrame...")
g = GraphFrame(vertices, edges)
print("‚úÖ GraphFrame cr√©√© avec succ√®s!")

# 3. Analyse des degr√©s
print("\n" + "=" * 70)
print("--- Analyse des Degr√©s (Degree Centrality) ---")
print("=" * 70)

try:
    print("\nüìä Top 5 des N≈ìuds (V1) les plus connect√©s (Degr√©):")
    degrees_df = g.degrees.orderBy(col("degree").desc())
    degrees_df.show(5)
    
    # Statistiques sur les degr√©s
    print("\nüìà Statistiques des degr√©s:")
    g.degrees.describe("degree").show()
    
    # Distribution des degr√©s
    print("\nüìä Distribution des degr√©s:")
    g.degrees.groupBy("degree").count() \
        .orderBy(desc("count")) \
        .show(10)
    
    # Identifier les n≈ìuds isol√©s
    isolated_nodes = g.degrees.filter(col("degree") == 1).count()
    print(f"\nüîç N≈ìuds isol√©s (degree = 1): {isolated_nodes:,}")
    
    # Identifier les hubs (n≈ìuds hautement connect√©s)
    high_degree_threshold = 50
    hubs = g.degrees.filter(col("degree") > high_degree_threshold)
    print(f"üö® N≈ìuds hautement connect√©s (degree > {high_degree_threshold}): {hubs.count():,}")
    
    if hubs.count() > 0:
        print("\nExemples de hubs:")
        hubs.orderBy(desc("degree")).show(10)
    
except Exception as e:
    print(f"‚ùå Avertissement: Le calcul des degr√©s a √©chou√©: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 70)

--- V√©rification et Ajustement du Sch√©ma pour GraphFrame ---

üîÑ Converting IDs to string format...
‚úÖ Vertices: 275,663 nodes
‚úÖ Edges: 7,016 connections

üî® Creating GraphFrame...
‚úÖ GraphFrame cr√©√© avec succ√®s!

--- Analyse des Degr√©s (Degree Centrality) ---

üìä Top 5 des N≈ìuds (V1) les plus connect√©s (Degr√©):




+-----------------+------+
|               id|degree|
+-----------------+------+
|-10.6457996485752|    84|
|-5.31417320646342|    84|
|-16.5265065691231|    84|
|-2.14441147422114|    84|
|-3.14025953779538|    84|
+-----------------+------+
only showing top 5 rows


üìà Statistiques des degr√©s:
+-------+------------------+
|summary|            degree|
+-------+------------------+
|  count|               472|
|   mean|29.728813559322035|
| stddev|20.826106846996957|
|    min|                 2|
|    max|                84|
+-------+------------------+


üìä Distribution des degr√©s:
+------+-----+
|degree|count|
+------+-----+
|    84|   43|
|    26|   42|
|    14|   40|
|    18|   40|
|    24|   39|
|    22|   36|
|    52|   27|
|    10|   24|
|    44|   23|
|    40|   21|
+------+-----+
only showing top 10 rows


üîç N≈ìuds isol√©s (degree = 1): 0
üö® N≈ìuds hautement connect√©s (degree > 50): 70

Exemples de hubs:
+------------------+------+
|                id|degree|
+------

In [6]:
# Cellule 5/8 : Algorithme des Composantes Connexes (CC)
from pyspark.sql.functions import col, desc, count as spark_count

print("=" * 70)
print("--- Algorithme des Composantes Connexes (Connected Components) ---")
print("=" * 70)

# D√©finir le r√©pertoire de checkpoint (obligatoire pour CC)
print("\nüîß Configuration du checkpoint directory...")
try:
    spark.sparkContext.setCheckpointDir("/tmp/gftest_checkpoints")
    print("‚úÖ R√©pertoire de checkpoint d√©fini: /tmp/gftest_checkpoints")
except Exception as e:
    print(f"‚ö†Ô∏è Avertissement: Impossible de d√©finir le checkpoint ({e}).")
    print("L'algorithme CC peut √©chouer.")
    
# Calcul des Composantes Connexes
print("\nüîÑ Calcul des Composantes Connexes en cours...")
print("‚è≥ Cela peut prendre quelques minutes...")

try:
    cc_results = g.connectedComponents().persist()
    print("‚úÖ Composantes Connexes calcul√©es!")
    
    # Analyse des R√©sultats
    print("\nüìä Analyse des composantes...")
    component_sizes = cc_results.groupBy("component") \
        .count() \
        .withColumnRenamed("count", "componentSize")
    
    cc_analysis = cc_results.join(component_sizes, "component")
    
    # Statistiques g√©n√©rales
    total_components = component_sizes.count()
    print(f"\nüìà Statistiques G√©n√©rales:")
    print(f"   Nombre total de composantes: {total_components:,}")
    
    # Distribution des tailles de composantes
    print("\nüìä Distribution des tailles de composantes:")
    component_sizes.orderBy(desc("componentSize")).show(10)
    
    print("\nüìà Statistiques des tailles:")
    component_sizes.describe("componentSize").show()
    
    # Afficher les statistiques du plus grand groupe
    print("\n" + "=" * 70)
    print("--- Analyse de la Plus Grande Composante ---")
    print("=" * 70)
    
    top_component = cc_analysis.orderBy(col("componentSize").desc()).limit(1).collect()[0]
    top_component_id = top_component["component"]
    total_size = top_component["componentSize"]
    
    # Compter les fraudes dans la plus grande composante
    fraud_in_top = cc_analysis.filter(col('component') == top_component_id) \
        .filter(col('label') == 1).count()
    fraud_purity = (fraud_in_top / total_size * 100) if total_size > 0 else 0
    
    print(f"\nüéØ Plus Grande Composante (ID: {top_component_id}):")
    print(f"   Taille Totale: {total_size:,} n≈ìuds")
    print(f"   Fraudes (label=1): {fraud_in_top:,}")
    print(f"   Puret√© (% fraudes): {fraud_purity:.2f}%")
    print(f"   Transactions normales: {total_size - fraud_in_top:,}")
    
    # Analyser les composantes de taille moyenne (potentiels r√©seaux de fraude)
    print("\n" + "=" * 70)
    print("--- Composantes Suspectes (Taille 5-50) ---")
    print("=" * 70)
    
    medium_components = component_sizes.filter(
        (col("componentSize") >= 5) & (col("componentSize") <= 50)
    ).orderBy(desc("componentSize"))
    
    print(f"\nNombre de composantes de taille moyenne: {medium_components.count():,}")
    
    if medium_components.count() > 0:
        print("\nTop 10 composantes de taille moyenne:")
        medium_components.show(10)
        
        # Analyser la puret√© de ces composantes
        print("\nüîç Analyse de puret√© des composantes moyennes:")
        for row in medium_components.limit(5).collect():
            comp_id = row["component"]
            comp_size = row["componentSize"]
            fraud_count = cc_analysis.filter(col('component') == comp_id) \
                .filter(col('label') == 1).count()
            purity = (fraud_count / comp_size * 100) if comp_size > 0 else 0
            
            status = "üö® SUSPECT" if purity > 70 else "‚úÖ Normal"
            print(f"   Component {comp_id}: {comp_size} n≈ìuds, {fraud_count} fraudes ({purity:.1f}%) {status}")
    
    # Composantes isol√©es (taille = 1)
    isolated = component_sizes.filter(col("componentSize") == 1).count()
    print(f"\nüîç Composantes isol√©es (1 n≈ìud): {isolated:,}")
    
    print("\n‚úÖ Analyse des Composantes Connexes termin√©e!")
    
except Exception as e:
    print(f"\n‚ùå Erreur lors du calcul des CC: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 70)

--- Algorithme des Composantes Connexes (Connected Components) ---

üîß Configuration du checkpoint directory...
‚úÖ R√©pertoire de checkpoint d√©fini: /tmp/gftest_checkpoints

üîÑ Calcul des Composantes Connexes en cours...
‚è≥ Cela peut prendre quelques minutes...
‚úÖ Composantes Connexes calcul√©es!

üìä Analyse des composantes...

üìà Statistiques G√©n√©rales:
   Nombre total de composantes: 275,236

üìä Distribution des tailles de composantes:
+---------+-------------+
|component|componentSize|
+---------+-------------+
|    20584|           43|
|    20650|           27|
|    22612|           23|
|    19647|           21|
|    23632|           18|
|    22068|           17|
|    15049|           16|
|    20867|           15|
|    21695|           14|
|     5258|           14|
+---------+-------------+
only showing top 10 rows


üìà Statistiques des tailles:
+-------+-------------------+
|summary|      componentSize|
+-------+-------------------+
|  count|             275236|


In [7]:
# Cellule 6/8 : Pr√©paration des Poids PageRank (Normalisation)
from pyspark.sql.functions import col, sum as spark_sum, lit, desc

print("=" * 70)
print("--- Initialisation des Poids PageRank (Normalisation des Ar√™tes) ---")
print("=" * 70)

# 1. Joindre 'edges' avec 'df' pour r√©cup√©rer 'Amount'
print("\nüîÑ Step 1: Joining edges with transaction amounts...")
edges_with_amount = edges.alias("e").join(
    df.select(col("V1").cast("string").alias("src_v1"), col("Amount")).distinct(), 
    col("e.src") == col("src_v1"),
    "left"
).select(col("e.src"), col("e.dst"), col("Amount")).distinct()

print(f"‚úÖ Edges with amounts: {edges_with_amount.count():,}")

# 2. Calculer la somme totale des montants sortants par source
print("\nüîÑ Step 2: Calculating total outgoing amounts per source...")
sum_weights = edges_with_amount.groupBy("src") \
    .agg(spark_sum("Amount").alias("total_outgoing_amount"))

print(f"‚úÖ Sum weights calculated for {sum_weights.count():,} sources")

# 3. Normalisation (Weight = Amount / Total Outgoing)
print("\nüîÑ Step 3: Normalizing edge weights...")
edges_with_sum = edges_with_amount.join(sum_weights, "src")
normalized_edges = edges_with_sum.withColumn(
    "weight", col("Amount") / col("total_outgoing_amount")
).select(col("src"), col("dst"), col("weight"))

print(f"‚úÖ Normalized edges created: {normalized_edges.count():,}")

# V√©rifier la normalisation
print("\nüìä Sample of normalized edges:")
normalized_edges.show(5)

print("\nüìà Weight statistics:")
normalized_edges.describe("weight").show()

# 4. Finaliser l'initialisation du PageRank (1/N)
print("\nüîÑ Step 4: Initializing PageRank values...")
N_vertices = vertices.count()

# Utiliser le DataFrame 'vertices' ajust√© des √©tapes pr√©c√©dentes
vertices = vertices.withColumn("pagerank", lit(1.0 / N_vertices)).cache()

print(f"\n‚úÖ Initialization complete!")
print(f"   Total vertices: {N_vertices:,}")
print(f"   Total normalized edges: {normalized_edges.count():,}")
print(f"   Initial PageRank (1/N): {1.0 / N_vertices:.8f}")

# V√©rifier l'initialisation
print("\nüìä Sample of initialized vertices:")
vertices.select("id", "pagerank", "label").show(5)

print("\n" + "=" * 70)

--- Initialisation des Poids PageRank (Normalisation des Ar√™tes) ---

üîÑ Step 1: Joining edges with transaction amounts...
‚úÖ Edges with amounts: 7,016

üîÑ Step 2: Calculating total outgoing amounts per source...
‚úÖ Sum weights calculated for 472 sources

üîÑ Step 3: Normalizing edge weights...
‚úÖ Normalized edges created: 7,016

üìä Sample of normalized edges:
+------------------+------------------+------------------+
|               src|               dst|            weight|
+------------------+------------------+------------------+
|-0.114360703589856| -1.46489654758402|               0.5|
|-0.114360703589856| -0.88525408859895|               0.5|
|-0.443793956538852|-0.264868683737295|0.0769230769230769|
|-0.443793956538852|  -2.7560071191969|0.0769230769230769|
|-0.443793956538852| -1.32278906321956|0.0769230769230769|
+------------------+------------------+------------------+
only showing top 5 rows


üìà Weight statistics:
+-------+--------------------+
|summary|     

In [8]:
# üîπ Cellule 7/8 : Algorithme PageRank (Simulation Native PySpark)
from pyspark.sql.functions import sum as spark_sum, lit, col, coalesce, desc
import time

MAX_ITER = 5
RESET_PROBABILITY = 0.15 

print("=" * 70)
print(f"--- PageRank Simul√© ({MAX_ITER} it√©rations, alpha={RESET_PROBABILITY}) ---")
print("=" * 70)

# 'vertices' contient id, label, Amount, pagerank initial
current_vertices = vertices.select("id", "label", "pagerank").cache()
N = float(current_vertices.count())

print(f"\nüìä Configuration:")
print(f"   Vertices: {int(N):,}")
print(f"   Edges: {normalized_edges.count():,}")
print(f"   Max iterations: {MAX_ITER}")
print(f"   Reset probability (Œ±): {RESET_PROBABILITY}")
print(f"   Initial PageRank: {1.0/N:.8f}")

print(f"\nüîÑ Starting PageRank iterations...")
start_time = time.time()

for i in range(MAX_ITER):
    iter_start = time.time()
    
    # 1. Calcul du Score de contribution (pr * weight)
    contributions = current_vertices.join(
        normalized_edges, current_vertices.id == normalized_edges.src
    ).withColumn(
        "contribution", col("pagerank") * col("weight")
    ).select(col("dst").alias("id"), "contribution")
    
    # 2. Agr√©gation des contributions
    new_pageranks = contributions.groupBy("id") \
        .agg(spark_sum("contribution").alias("sum_contribution"))
    
    # 3. Application de la formule PageRank
    current_vertices = current_vertices.drop("pagerank") \
                                       .join(new_pageranks, "id", "left_outer") \
                                       .withColumn(
                                           "sum_contribution_clean", 
                                           coalesce(col("sum_contribution"), lit(0))
                                       )
    
    # PR(new) = (1 - alpha) * PR(contribution) + alpha / N
    current_vertices = current_vertices.withColumn(
        "pagerank", 
        lit(1.0 - RESET_PROBABILITY) * col("sum_contribution_clean") + lit(RESET_PROBABILITY / N)
    ).select("id", "label", "pagerank").cache()
    
    iter_time = time.time() - iter_start
    
    # Show progress with statistics
    avg_pr = current_vertices.agg({"pagerank": "avg"}).collect()[0][0]
    max_pr = current_vertices.agg({"pagerank": "max"}).collect()[0][0]
    
    print(f"   Iteration {i+1}/{MAX_ITER} completed in {iter_time:.2f}s - Avg PR: {avg_pr:.8f}, Max PR: {max_pr:.8f}")

total_time = time.time() - start_time
print(f"\n‚úÖ PageRank completed in {total_time:.2f}s ({total_time/MAX_ITER:.2f}s per iteration)")

# R√©int√©grer la colonne Amount pour l'affichage final
print("\nüîÑ Preparing final results...")
results = current_vertices.join(
    vertices.select(
        "id", 
        "Amount", 
        col("label").alias("original_label")
    ), 
    "id", 
    "left"
).drop("label").withColumnRenamed("original_label", "label")

results = results.orderBy(col("pagerank").desc()).cache()

print("\n" + "=" * 70)
print("--- Analyse des R√©sultats PageRank ---")
print("=" * 70)

# Statistiques PageRank
print("\nüìà Statistiques PageRank:")
results.describe("pagerank").show()

# Top nodes
print("\nüèÜ Top 10 Nodes par PageRank:")
results.select("id", "pagerank", "label", "Amount").show(10, truncate=False)

# Analyser par label
print("\nüìä PageRank moyen par Label:")
from pyspark.sql.functions import avg, max as spark_max, min as spark_min, count as spark_count

results.groupBy("label") \
    .agg(
        spark_count("*").alias("count"),
        avg("pagerank").alias("avg_pagerank"),
        spark_max("pagerank").alias("max_pagerank"),
        spark_min("pagerank").alias("min_pagerank")
    ).orderBy("label").show()

# Identifier les n≈ìuds suspects (high PageRank + fraud label)
print("\nüö® N≈ìuds suspects (High PageRank + Label=1):")
suspicious = results.filter(col("label") == 1).orderBy(desc("pagerank"))
print(f"   Total frauds: {suspicious.count():,}")
print("   Top 10:")
suspicious.show(10, truncate=False)

print("\n‚úÖ PageRank Simulation termin√©e. R√©sultats pr√™ts.")
print("=" * 70)

--- PageRank Simul√© (5 it√©rations, alpha=0.15) ---

üìä Configuration:
   Vertices: 275,663
   Edges: 7,016
   Max iterations: 5
   Reset probability (Œ±): 0.15
   Initial PageRank: 0.00000363

üîÑ Starting PageRank iterations...
   Iteration 1/5 completed in 0.27s - Avg PR: 0.00000055, Max PR: 0.00000363
   Iteration 2/5 completed in 0.26s - Avg PR: 0.00000055, Max PR: 0.00000363
   Iteration 3/5 completed in 0.37s - Avg PR: 0.00000055, Max PR: 0.00000363
   Iteration 4/5 completed in 0.53s - Avg PR: 0.00000055, Max PR: 0.00000363
   Iteration 5/5 completed in 1.10s - Avg PR: 0.00000055, Max PR: 0.00000363

‚úÖ PageRank completed in 22.50s (4.50s per iteration)

üîÑ Preparing final results...

--- Analyse des R√©sultats PageRank ---

üìà Statistiques PageRank:
+-------+--------------------+
|summary|            pagerank|
+-------+--------------------+
|  count|              275663|
|   mean|5.485894026413845E-7|
| stddev| 1.10792783122001E-7|
|    min|5.441426669520393E-7|
|    

In [9]:
# Cellule 8/8 : Affichage et Interpr√©tation des R√©sultats Finaux
from pyspark.sql.functions import (
    col, desc, count as spark_count, avg, 
    max as spark_max, min as spark_min
)

print("=" * 70)
print("--- R√âSULTATS FINAUX : ANALYSE GRAPHX ---")
print("=" * 70)

# Renommage de la colonne pour la clart√©
final_results = results.withColumnRenamed("pagerank", "PR_Score")

# 1. Top 10 des Hubs Potentiels
print("\n" + "=" * 70)
print("--- Top 10 des Clients/Cartes (V1) avec PageRank le plus √©lev√© ---")
print("=" * 70)
final_results.select("id", "label", "Amount", "PR_Score").show(10, truncate=False)

# 2. Cas potentiels de Mules
print("\n" + "=" * 70)
print("--- Top 10 Transactions L√©gitimes avec PageRank √©lev√© ---")
print("(Mules potentielles)")
print("=" * 70)
mules = final_results.filter(col("label") == 0) \
    .orderBy(col("PR_Score").desc())
mules.select("id", "label", "Amount", "PR_Score").show(10, truncate=False)

# 3. Fraudes confirm√©es
print("\n" + "=" * 70)
print("--- Top 10 Fraudes Confirm√©es (Label=1) ---")
print("=" * 70)
confirmed_frauds = final_results.filter(col("label") == 1) \
    .orderBy(col("PR_Score").desc())
confirmed_frauds.select("id", "label", "Amount", "PR_Score").show(10, truncate=False)

# 4. Distribution par Label - CORRECTED
print("\n" + "=" * 70)
print("--- Distribution PageRank par Label ---")
print("=" * 70)
print("\nüìä Statistiques par Label:")

final_results.groupBy("label").agg(
    spark_count("*").alias("count"),
    avg("PR_Score").alias("avg_PR"),
    spark_max("PR_Score").alias("max_PR"),
    spark_min("PR_Score").alias("min_PR")
).orderBy("label").show()

# 5. Conclusion
print("\n" + "=" * 70)
print("--- INTERPR√âTATION FINALE ---")
print("=" * 70)

total_components = cc_analysis.select('component').distinct().count()
total_nodes = final_results.count()
total_frauds = final_results.filter(col("label") == 1).count()

print(f"\nüìä R√©sum√©:")
print(f"   CC: {total_components} groupes de fraude d√©tect√©s")
print(f"   N≈ìuds: {total_nodes:,} ({total_frauds:,} fraudes)")
print(f"   PageRank: Scores de risque calcul√©s pour priorisation")
print(f"\n‚úÖ La combinaison CC + PageRank fournit une vision compl√®te!")
print("=" * 70)

--- R√âSULTATS FINAUX : ANALYSE GRAPHX ---

--- Top 10 des Clients/Cartes (V1) avec PageRank le plus √©lev√© ---
+-----------------+-----+------+--------------------+
|id               |label|Amount|PR_Score            |
+-----------------+-----+------+--------------------+
|1.08837493830851 |1    |3.79  |3.627617779680262E-6|
|0.753356012421118|1    |2.0   |3.627617779680262E-6|
|-14.4744374924863|1    |1.0   |3.627617779680262E-6|
|0.269614090485094|1    |0.68  |3.627617779680262E-6|
|-15.2713618637585|1    |1.0   |3.627617779680262E-6|
|1.1939160689293  |1    |31.91 |3.627617779680262E-6|
|-2.92194437582996|1    |723.21|3.627617779680262E-6|
|-2.33565492855671|1    |444.17|3.627617779680262E-6|
|-1.78322883722709|1    |1.0   |3.627617779680262E-6|
|-3.5934760029271 |1    |101.5 |3.627617779680262E-6|
+-----------------+-----+------+--------------------+
only showing top 10 rows


--- Top 10 Transactions L√©gitimes avec PageRank √©lev√© ---
(Mules potentielles)
+------------------+--

In [10]:
# ============================================================================
# SAVE AND VERIFY GRAPHX RESULTS
# ============================================================================

from pyspark.sql.functions import desc
import time

print("\n" + "=" * 70)
print("--- SAVING RESULTS ---")
print("=" * 70)

timestamp = int(time.time())

# Save PageRank results
print("\nüíæ Saving PageRank results...")
try:
    pagerank_path = f"hdfs://namenode:8020/data/graphx/pagerank_{timestamp}"
    final_results.coalesce(5).write.mode("overwrite").parquet(pagerank_path)
    print(f"‚úÖ Saved: {pagerank_path}")
except Exception as e:
    print(f"‚ö†Ô∏è Save failed: {e}")

# Save top results as CSV
print("\nüíæ Saving top 100 results as CSV...")
try:
    csv_path = f"hdfs://namenode:8020/data/graphx/top_nodes_{timestamp}"
    final_results.limit(100).coalesce(1) \
        .write.mode("overwrite") \
        .option("header", "true") \
        .csv(csv_path)
    print(f"‚úÖ Saved: {csv_path}")
except Exception as e:
    print(f"‚ö†Ô∏è Save failed: {e}")

# ============================================================================
# VERIFY SAVED RESULTS
# ============================================================================

print("\n" + "=" * 70)
print("--- VERIFYING SAVED RESULTS IN HDFS ---")
print("=" * 70)

# Load PageRank results from the previous successful save
print("\nüì• Loading PageRank results from HDFS...")
try:
    pagerank_saved = spark.read.parquet("hdfs://namenode:8020/data/graphx/pagerank_results_1768187013")
    print(f"‚úÖ Loaded {pagerank_saved.count():,} rows")
    
    print("\nüèÜ Top 10 nodes from saved results:")
    pagerank_saved.orderBy(desc("PR_Score")).show(10, truncate=False)
    
    print("\nüìä Statistics by label:")
    from pyspark.sql.functions import avg, max as spark_max, min as spark_min, count as spark_count
    
    pagerank_saved.groupBy("label").agg(
        spark_count("*").alias("count"),
        avg("PR_Score").alias("avg_PR"),
        spark_max("PR_Score").alias("max_PR")
    ).show()
    
except Exception as e:
    print(f"‚ö†Ô∏è Could not load previous results: {e}")

print("\n" + "=" * 70)
print("üéâ GRAPHX ANALYSIS COMPLETE!")
print("=" * 70)
print("\n‚úÖ Your results are saved in HDFS:")
print(f"   ‚Ä¢ PageRank: hdfs://namenode:8020/data/graphx/pagerank_results_1768187013")
print(f"   ‚Ä¢ Vertices: hdfs://namenode:8020/data/graphx/vertices_1768187013")
if 'pagerank_path' in locals():
    print(f"   ‚Ä¢ Latest: {pagerank_path}")
print("\nüí° Load anytime with:")
print("   spark.read.parquet('hdfs://namenode:8020/data/graphx/pagerank_results_1768187013')")
print("=" * 70)


--- SAVING RESULTS ---

üíæ Saving PageRank results...
‚úÖ Saved: hdfs://namenode:8020/data/graphx/pagerank_1768252535

üíæ Saving top 100 results as CSV...
‚úÖ Saved: hdfs://namenode:8020/data/graphx/top_nodes_1768252535

--- VERIFYING SAVED RESULTS IN HDFS ---

üì• Loading PageRank results from HDFS...
‚úÖ Loaded 275,663 rows

üèÜ Top 10 nodes from saved results:
+-----------------+--------------------+------+-----+
|id               |PR_Score            |Amount|label|
+-----------------+--------------------+------+-----+
|1.08837493830851 |3.627617779680262E-6|3.79  |1    |
|0.753356012421118|3.627617779680262E-6|2.0   |1    |
|-14.4744374924863|3.627617779680262E-6|1.0   |1    |
|0.269614090485094|3.627617779680262E-6|0.68  |1    |
|-15.2713618637585|3.627617779680262E-6|1.0   |1    |
|1.1939160689293  |3.627617779680262E-6|31.91 |1    |
|-2.92194437582996|3.627617779680262E-6|723.21|1    |
|-2.33565492855671|3.627617779680262E-6|444.17|1    |
|-1.78322883722709|3.627617779680