In [1]:
import os
os.environ["SPARK_HOME"] = "/home/djeghali/spark"
os.environ["JAVA_HOME"] = "/usr"

# Importation des bibliothéques

In [2]:
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkConf

# for dataframe and udf
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *


from pyspark.sql.functions import col
from pyspark.sql import functions as F

import igraph as ig

# Lancer Spark

In [3]:


# initialise environment variables for spark
findspark.init()

# Start spark session
# --------------------------
def start_spark():
  local = "local[*]"
  appName = "PLDAC"

  gf = "graphframes:graphframes:0.8.3-spark3.5-s_2.12"

  configLocale = SparkConf().setAppName(appName).setMaster(local).\
  set("spark.executor.memory", "6G").\
  set("spark.driver.memory","6G").\
  set("spark.sql.catalogImplementation","in-memory").\
  set("spark.jars.packages", gf)

  spark = SparkSession.builder.config(conf = configLocale).getOrCreate()
  sc = spark.sparkContext
  sc.setLogLevel("ERROR")

  spark.conf.set("spark.sql.autoBroadcastJoinThreshold","-1")

  # Adjust the query execution environment to the size of the cluster (4 cores)
  spark.conf.set("spark.sql.shuffle.partitions","4")
  print("session started, its id is ", sc.applicationId)
  return spark
spark = start_spark()

session started, its id is  local-1708955977766


# Lecture des fichiers du dossier BDLE_10K

In [4]:
# Initialisez une session Spark
spark = SparkSession.builder.appName("Proteines_graph").getOrCreate()

parquet_folder = "./local/data/BDLE_10K"

# Récuperer tous les fichiers Parquet compressé avec Snappy
parquet_files = parquet_folder + "/*.snappy.parquet"

#Création du data frame
df = spark.read.format("parquet").option("compression", "snappy").load(parquet_files)

# Affichez les 10 premiére ligne du DataFrame
df.show(10)


+-----------+-----------+-----+
|     seqID1|     seqID2|  sim|
+-----------+-----------+-----+
|117761605:5|152890023:5| 97.8|
|152904885:3|155591878:2| 88.5|
|152887848:4|153682181:0|100.0|
|152937692:5| 80009514:2| 82.3|
|152990923:2|154549183:4| 98.0|
|152867782:1|153171917:1| 83.1|
| 15111981:2|153137370:1|100.0|
|152794195:0| 15280704:2| 96.9|
| 62963742:1| 63783418:5| 98.4|
|152170568:3|153062631:2| 97.6|
+-----------+-----------+-----+
only showing top 10 rows



# Partionnement des arêtes en plusieurs sous ensembles

In [5]:

# Nombre de partitions
num_partitions = 20

# Ajout d'une nouvelle colonne 'partition' basée sur le modulo de seqID1
#F.hash(col('seqID1') Calcul le hash d'une colonne pour partitionner les données.
df_partitioned = df.withColumn('partition', (F.hash(col('seqID1')) % num_partitions + num_partitions) % num_partitions)

# Afficher le DataFrame partitionné
df_partitioned.show()


+-----------+-----------+-----+---------+
|     seqID1|     seqID2|  sim|partition|
+-----------+-----------+-----+---------+
|117761605:5|152890023:5| 97.8|        7|
|152904885:3|155591878:2| 88.5|        0|
|152887848:4|153682181:0|100.0|        0|
|152937692:5| 80009514:2| 82.3|        0|
|152990923:2|154549183:4| 98.0|        0|
|152867782:1|153171917:1| 83.1|       15|
| 15111981:2|153137370:1|100.0|        6|
|152794195:0| 15280704:2| 96.9|        0|
| 62963742:1| 63783418:5| 98.4|       17|
|152170568:3|153062631:2| 97.6|        5|
|152904832:4|154500443:5| 98.6|        0|
|152745429:3| 22988511:2|100.0|        0|
|153161980:3|154256473:0|100.0|        0|
|152903373:4|154426964:5| 80.4|        0|
|146010871:1|158434400:4| 98.7|        2|
|153040013:0|156563807:1| 96.5|        0|
|152933621:4|157634534:2| 96.8|        0|
|153141720:0|155282497:0| 93.2|        0|
|153082115:5|153887336:3| 90.0|        0|
|153025447:3|155660759:1| 97.4|        0|
+-----------+-----------+-----+---

In [6]:
#La taille(nombre d'arêtes) de chaque partition
partition_freq = df_partitioned.groupBy('partition').count()
partition_freq.show()

+---------+-----+
|partition|count|
+---------+-----+
|       14|  883|
|       12|  821|
|       18|  752|
|       13|  835|
|       15|  859|
|        6|  862|
|       17|  819|
|        9|  907|
|       19|  810|
|       16|  888|
|        5|  770|
|        2| 1170|
|       10|  878|
|        4|  741|
|        7|  883|
|        0| 4906|
|        1|  771|
|        8|  763|
|        3|  772|
|       11|  752|
+---------+-----+



In [7]:
# Créer un DataFrame pour chaque partition
dfs_partitioned = []
for partition_id in range(num_partitions):
    df_partition = df_partitioned.filter(col('partition') == partition_id)
    dfs_partitioned.append(df_partition)

# Afficher les DataFrames de chaque partition
for i, df_partition in enumerate(dfs_partitioned):
    print(f"Partition {i}:")
    df_partition.show()

Partition 0:
+-----------+-----------+-----+---------+
|     seqID1|     seqID2|  sim|partition|
+-----------+-----------+-----+---------+
|152904885:3|155591878:2| 88.5|        0|
|152887848:4|153682181:0|100.0|        0|
|152937692:5| 80009514:2| 82.3|        0|
|152990923:2|154549183:4| 98.0|        0|
|152794195:0| 15280704:2| 96.9|        0|
|152904832:4|154500443:5| 98.6|        0|
|152745429:3| 22988511:2|100.0|        0|
|153161980:3|154256473:0|100.0|        0|
|152903373:4|154426964:5| 80.4|        0|
|153040013:0|156563807:1| 96.5|        0|
|152933621:4|157634534:2| 96.8|        0|
|153141720:0|155282497:0| 93.2|        0|
|153082115:5|153887336:3| 90.0|        0|
|153025447:3|155660759:1| 97.4|        0|
|152921945:1|157742939:2| 92.6|        0|
|152901882:2| 79639334:0| 81.7|        0|
|152772567:4|153848527:5| 90.3|        0|
|152897014:5|155771540:2| 84.1|        0|
|152778738:2|153768354:1| 98.2|        0|
|152985004:4|156710084:4| 85.7|        0|
+-----------+--------

# Calcul des composantes connexes de chaque partition

In [8]:
from igraph import Graph

def calcul_composantes(df):
    edges = df.select('seqID1', 'seqID2').collect()
    # Créer un graphe à partir des arêtes
    g = Graph.TupleList(edges, directed=False)
    # Trouver les composantes connexes
    connected_components = g.connected_components()
    # print("Les composantes : ", connected_components)
    # print(g.vs[connected_components [1]])
    node_ids=[]
    for component in connected_components:
        node_ids.append([g.vs[node_index]['name'] for node_index in component])

    return node_ids



In [9]:
dfs_partitioned[0].show()

+-----------+-----------+-----+---------+
|     seqID1|     seqID2|  sim|partition|
+-----------+-----------+-----+---------+
|152904885:3|155591878:2| 88.5|        0|
|152887848:4|153682181:0|100.0|        0|
|152937692:5| 80009514:2| 82.3|        0|
|152990923:2|154549183:4| 98.0|        0|
|152794195:0| 15280704:2| 96.9|        0|
|152904832:4|154500443:5| 98.6|        0|
|152745429:3| 22988511:2|100.0|        0|
|153161980:3|154256473:0|100.0|        0|
|152903373:4|154426964:5| 80.4|        0|
|153040013:0|156563807:1| 96.5|        0|
|152933621:4|157634534:2| 96.8|        0|
|153141720:0|155282497:0| 93.2|        0|
|153082115:5|153887336:3| 90.0|        0|
|153025447:3|155660759:1| 97.4|        0|
|152921945:1|157742939:2| 92.6|        0|
|152901882:2| 79639334:0| 81.7|        0|
|152772567:4|153848527:5| 90.3|        0|
|152897014:5|155771540:2| 84.1|        0|
|152778738:2|153768354:1| 98.2|        0|
|152985004:4|156710084:4| 85.7|        0|
+-----------+-----------+-----+---

In [10]:
components = calcul_composantes(dfs_partitioned[0])
print(len(components))

2961


In [11]:
# Nombre de lignes
nb_rows = dfs_partitioned[0].count()

# Liste des noms des colonnes
columns_list = dfs_partitioned[0].columns

# Nombre de colonnes
nb_columns = len(columns_list)

# Affichage des informations
print(f"Nombre de lignes : {nb_rows}")
print(f"Nombre de colonnes : {nb_columns}")
print(f"Noms des colonnes : {columns_list}")

Nombre de lignes : 4906
Nombre de colonnes : 4
Noms des colonnes : ['seqID1', 'seqID2', 'sim', 'partition']


In [12]:
# components = []
# for df_part in dfs_partitioned : 

#     components.append(calcul_composantes(df_part))
composantes = calcul_composantes(dfs_partitioned[0])
composantes_triees = sorted(composantes, key=len, reverse=True)
print("Liste des compsantes : ",composantes)
print("Composnates triées : ", composantes_triees)
print("Nous avons obtenu :", len(composantes_triees),"composantes")


Liste des compsantes :  [['152904885:3', '155591878:2'], ['152887848:4', '153682181:0'], ['152937692:5', '80009514:2'], ['152990923:2', '154549183:4'], ['152794195:0', '15280704:2'], ['152904832:4', '154500443:5'], ['152745429:3', '22988511:2'], ['153161980:3', '154256473:0'], ['152903373:4', '154426964:5'], ['153040013:0', '156563807:1'], ['152933621:4', '157634534:2'], ['153141720:0', '155282497:0'], ['153082115:5', '153887336:3'], ['153025447:3', '155660759:1'], ['152921945:1', '157742939:2'], ['152901882:2', '79639334:0'], ['152772567:4', '153848527:5'], ['152897014:5', '155771540:2'], ['152778738:2', '153768354:1'], ['152985004:4', '156710084:4'], ['152782228:1', '152810615:4'], ['153099598:1', '155793668:2'], ['152775953:1', '156128239:0'], ['153037119:3', '153710066:2'], ['152771539:3', '153588430:0'], ['153180239:2', '62246699:2'], ['152782911:2', '154263685:0'], ['152768681:0', '27955181:2'], ['152778422:3', '153519684:5'], ['152759894:5', '158177234:1'], ['152852603:0', '1577

In [13]:
# Créer une liste de lignes pour chaque composante
rows = []
for i, component in enumerate(composantes_triees):
    for seqid in component:
        rows.append(Row(nom_composante=i, SEQid=seqid))

# Créer un DataFrame Spark à partir de la liste de lignes
result_df = spark.createDataFrame(rows)

# Afficher le DataFrame résultant
result_df.show()

+--------------+-----------+
|nom_composante|      SEQid|
+--------------+-----------+
|             0|148798868:4|
|             0| 18808706:1|
|             0|  1909920:3|
|             0| 29456800:5|
|             0|105906721:3|
|             0| 42532231:2|
|             0|  3704555:2|
|             0| 39381824:2|
|             0| 51953996:2|
|             0|149109793:4|
|             0| 47384083:1|
|             0| 68833237:2|
|             0| 19191844:2|
|             0|106040287:0|
|             1|142280049:5|
|             1| 63225406:3|
|             1|142280051:5|
|             1| 63225406:4|
|             1|142280050:5|
|             1| 63225406:5|
+--------------+-----------+
only showing top 20 rows



In [14]:
from pyspark.sql.functions import lit

# Définir la fonction UDF
def calcul_composantes(df):
    edges = df.select('seqID1', 'seqID2').collect()
    # Créer un graphe à partir des arêtes
    g = Graph.TupleList(edges, directed=False)
    # Trouver les composantes connexes
    connected_components = g.components().membership
    return [int(i) for i in connected_components]  # Convertir les valeurs en entiers

# Appliquer la fonction UDF
inx_component = calcul_composantes(dfs_partitioned[1])
data = dfs_partitioned[1]
df_with_components = data.withColumn("connected_components", lit(inx_component))

# Afficher le DataFrame résultant
df_with_components.show()

+-----------+-----------+-----+---------+--------------------+
|     seqID1|     seqID2|  sim|partition|connected_components|
+-----------+-----------+-----+---------+--------------------+
|152797616:4|153077244:3| 80.9|        1|[0, 0, 1, 1, 2, 2...|
|152195812:3|152910392:0| 97.2|        1|[0, 0, 1, 1, 2, 2...|
|152233364:3|152860059:4|100.0|        1|[0, 0, 1, 1, 2, 2...|
|152228143:5|152971380:4| 98.3|        1|[0, 0, 1, 1, 2, 2...|
|152716349:0|153040585:3| 97.4|        1|[0, 0, 1, 1, 2, 2...|
|126635248:5|131579132:2| 80.4|        1|[0, 0, 1, 1, 2, 2...|
|154165205:2|156325288:2|100.0|        1|[0, 0, 1, 1, 2, 2...|
| 58291451:3| 64776984:4| 89.1|        1|[0, 0, 1, 1, 2, 2...|
|126635248:5|126803785:3| 84.0|        1|[0, 0, 1, 1, 2, 2...|
|152641991:5|153416927:2| 98.6|        1|[0, 0, 1, 1, 2, 2...|
|146952620:5| 49941002:1| 90.1|        1|[0, 0, 1, 1, 2, 2...|
|116187856:1|153331872:1|100.0|        1|[0, 0, 1, 1, 2, 2...|
|153348971:3|153348973:4| 98.3|        1|[0, 0, 1, 1, 2

### Calcul du data frame des composantes connexes : 

In [120]:
def calcul_composantes(df):
    edges = df.select('seqID1', 'seqID2').collect()
    # Créer un graphe à partir des arêtes
    g = Graph.TupleList(edges, directed=False)
    # Trouver les composantes connexes
    connected_components = g.connected_components()
    node_ids=[]
    print(len(connected_components))
    for component in connected_components:
        node_ids.append([g.vs[node_index]['name'] for node_index in component])
    composantes_triees = sorted(node_ids, key=len, reverse=True)
    # Créer une liste de lignes pour chaque composante
    rows = []
    for i, component in enumerate(composantes_triees):
        for seqid in component:
            rows.append(Row(nom_composante=i, SEQid=seqid))
    
    # Créer un DataFrame Spark à partir de la liste de lignes
    result_df = spark.createDataFrame(rows)
    return result_df


In [121]:
#Liste des composante de la première partition des données de taille 10k
calcul_composantes(dfs_partitioned[0]).show()

2961
+--------------+-----------+
|nom_composante|      SEQid|
+--------------+-----------+
|             0|148798868:4|
|             0| 18808706:1|
|             0|  1909920:3|
|             0| 29456800:5|
|             0|105906721:3|
|             0| 42532231:2|
|             0|  3704555:2|
|             0| 39381824:2|
|             0| 51953996:2|
|             0|149109793:4|
|             0| 47384083:1|
|             0| 68833237:2|
|             0| 19191844:2|
|             0|106040287:0|
|             1|142280049:5|
|             1| 63225406:3|
|             1|142280051:5|
|             1| 63225406:4|
|             1|142280050:5|
|             1| 63225406:5|
+--------------+-----------+
only showing top 20 rows



### Calcul des composantes connexes partielles de toutes les partitons de tout le dossier 10 K :

In [24]:
#Liste des composantes de toutes les données de taille 10k dans plusieurs dataframes
dfs_composantes_connexes_partielles = []
for partition in dfs_partitioned :
    dfs_composantes_connexes_partielles.append(calcul_composantes(partition))


### Affichage des composantes partielles de chaque partition :

In [25]:
for partition in dfs_composantes_connexes_partielles :
    partition.show()


+--------------+-----------+
|nom_composante|      SEQid|
+--------------+-----------+
|             0|148798868:4|
|             0| 18808706:1|
|             0|  1909920:3|
|             0| 29456800:5|
|             0|105906721:3|
|             0| 42532231:2|
|             0|  3704555:2|
|             0| 39381824:2|
|             0| 51953996:2|
|             0|149109793:4|
|             0| 47384083:1|
|             0| 68833237:2|
|             0| 19191844:2|
|             0|106040287:0|
|             1|142280049:5|
|             1| 63225406:3|
|             1|142280051:5|
|             1| 63225406:4|
|             1|142280050:5|
|             1| 63225406:5|
+--------------+-----------+
only showing top 20 rows

+--------------+--------------------+
|nom_composante|               SEQid|
+--------------+--------------------+
|             0|          50649042:1|
|             0|          53289965:4|
|             0|          32073629:3|
|             0|          52731508:5|
|           

### Reconstruction des composantes à partir des composantes partielles : 
-   Afin d'éviter la fusion de deux composnates ayant le même numéro de composante dans 2 dataframe différents nous ajoutant une colonne pour chaque dataframe pour specifié le numéro du dataframe à partir duquel la composante a été crée .

In [28]:
# Créer une colonne supplémentaire pour chaque dataframe afin d'indiquer son origine
dfs_composantes_connexes_partielles_with_indice_partition = [df.withColumn("source_df", lit(i)) for i, df in enumerate(dfs_composantes_connexes_partielles)]


In [126]:
dfs_composantes_connexes_partielles_with_indice_partition[5].show()


+--------------+----------+---------+
|nom_composante|     SEQid|source_df|
+--------------+----------+---------+
|             0|41164712:3|        5|
|             0|41218553:0|        5|
|             0|41185803:1|        5|
|             0|41205992:1|        5|
|             0|41179534:1|        5|
|             0|41203560:1|        5|
|             0|41268248:0|        5|
|             0|91521358:0|        5|
|             0|41193498:4|        5|
|             0|41230700:2|        5|
|             0|41260617:1|        5|
|             1|78512526:4|        5|
|             1|93105297:0|        5|
|             1|81406437:2|        5|
|             1|92408891:0|        5|
|             1|90939574:1|        5|
|             1|85601631:2|        5|
|             1|91122085:2|        5|
|             1|81490522:2|        5|
|             1|92029238:5|        5|
+--------------+----------+---------+
only showing top 20 rows



Fusion de toutes les composantes partielles dans un même dataframe

In [42]:

from functools import reduce
merged_df_composantes_partielles = reduce(lambda x, y: x.union(y), dfs_composantes_connexes_partielles_with_indice_partition)
merged_df_composantes_partielles[merged_df_composantes_partielles['nom_composante']==0].show()

+--------------+-----------+---------+
|nom_composante|      SEQid|source_df|
+--------------+-----------+---------+
|             0|148798868:4|        0|
|             0| 18808706:1|        0|
|             0|  1909920:3|        0|
|             0| 29456800:5|        0|
|             0|105906721:3|        0|
|             0| 42532231:2|        0|
|             0|  3704555:2|        0|
|             0| 39381824:2|        0|
|             0| 51953996:2|        0|
|             0|149109793:4|        0|
|             0| 47384083:1|        0|
|             0| 68833237:2|        0|
|             0| 19191844:2|        0|
|             0|106040287:0|        0|
|             0| 50649042:1|        1|
|             0| 53289965:4|        1|
|             0| 32073629:3|        1|
|             0| 52731508:5|        1|
|             0| 49292152:2|        1|
|             0|  8541846:1|        1|
+--------------+-----------+---------+
only showing top 20 rows



In [142]:
# Pour chaque noeud, obtenir la composante ayant la plus petite valeur et son dataframe d'origine
from pyspark.sql.window import Window


# Créer une fenêtre pour appliquer dense_rank() sur l'ensemble du DataFrame
windowSpec = Window.orderBy("nom_composante", "source_df")

# Ajouter une colonne d'identifiant unique basée sur le couple (nom_composante, source_df)
merged_df_composantes_partielles_unique = merged_df_composantes_partielles.withColumn("composantes_id", dense_rank().over(windowSpec))
merged_df_composantes_partielles_unique_nettoye = merged_df_composantes_partielles_unique.select("SEQid","composantes_id").dropDuplicates()

merged_df_composantes_partielles_unique_nettoye.show()


+-----------+--------------+
|      SEQid|composantes_id|
+-----------+--------------+
|148798868:4|             1|
| 18808706:1|             1|
|  1909920:3|             1|
| 29456800:5|             1|
|105906721:3|             1|
| 42532231:2|             1|
|  3704555:2|             1|
| 39381824:2|             1|
| 51953996:2|             1|
|149109793:4|             1|
| 47384083:1|             1|
| 68833237:2|             1|
| 19191844:2|             1|
|106040287:0|             1|
| 50649042:1|             2|
| 53289965:4|             2|
| 32073629:3|             2|
| 52731508:5|             2|
| 49292152:2|             2|
|  8541846:1|             2|
+-----------+--------------+
only showing top 20 rows



#### Exemple de noeud présent dans différentes partitions :

In [140]:
result = merged_df_composantes_partielles_unique_nettoye.filter(merged_df_composantes_partielles_unique_nettoye['SEQid'] == '8541846:1')
result.show()


+---------+--------------+
|    SEQid|composantes_id|
+---------+--------------+
|8541846:1|             2|
|8541846:1|            99|
|8541846:1|           453|
|8541846:1|           704|
|8541846:1|           888|
|8541846:1|          3617|
+---------+--------------+



31348

### Regrouper les composantes partielles :
Ici pour chaque noeud appartenant à deux composantes différentes on choisi l'indice des composante le plus petit :

In [145]:
from pyspark.sql.functions import collect_set, min as min_, expr,array_min,max

# 1. Identifier toutes les composantes auxquelles chaque nœud appartient
components_per_node = merged_df_composantes_partielles_unique_nettoye.groupby("SEQid").agg(collect_set("composantes_id").alias("composantes"))

# 2. Trouver la composante la plus petite parmi celles auxquelles chaque nœud appartient
components_per_node = components_per_node.withColumn("min_composante", array_min("composantes"))

# # # 3. Mettre à jour la colonne de composante pour chaque nœud en utilisant la plus petite composante trouvée
components_per_node = merged_df_composantes_partielles_unique_nettoye.join(components_per_node, on="SEQid", how="left_outer")
components_per_node.show(100)

# # components_per_node.count()
# print(components_per_node.count())

# # components_per_node_2 = components_per_node.withColumn("composantes_id_new", expr("CASE WHEN array_contains(composantes, min_composante) THEN min_composante ELSE composantes_id END"))

# # 4. Mettre également à jour les autres nœuds qui appartiennent aux mêmes composantes que ces nœuds mis à jour avec la plus petite valeur de composante
# merged_df_composantes_partielles_unique_composantes_connexes = components_per_node.withColumn("composantes_globale_id", min_("min_composante").over(Window.partitionBy("composantes_id")))

# # Afficher le résultat final
# merged_df_composantes_partielles_unique_composantes_connexes.show(200)


+--------------------+--------------+--------------------+--------------+
|               SEQid|composantes_id|         composantes|min_composante|
+--------------------+--------------+--------------------+--------------+
|         105906721:3|             1|                 [1]|             1|
|         106040287:0|             1|            [1, 243]|             1|
|         107642134:1|             5|                 [5]|             5|
|         109694874:5|             8|                 [8]|             8|
|         115038930:4|             8|           [1152, 8]|             8|
|         115038930:5|             8|            [241, 8]|             8|
|         120092910:2|             5|            [5, 214]|             5|
|         120164806:4|             5|      [5, 214, 1611]|             5|
|         127463467:4|             7|                 [7]|             7|
|         127570029:0|             5| [5, 214, 1611, 675]|             5|
|         127824766:5|             7| 

In [114]:
merged_df_composantes_partielles_unique_composantes_connexes_nettoye = merged_df_composantes_partielles_unique_composantes_connexes.select("SEQid","composantes_globale_id").dropDuplicates()
merged_df_composantes_partielles_unique_composantes_connexes_nettoye.show(500)

+--------------------+----------------------+
|               SEQid|composantes_globale_id|
+--------------------+----------------------+
|         105906721:3|                     1|
|         106040287:0|                     1|
|         148798868:4|                     1|
|         149109793:4|                     1|
|          18808706:1|                     1|
|           1909920:3|                     1|
|          19191844:2|                     1|
|          29456800:5|                     1|
|           3704555:2|                     1|
|          39381824:2|                     1|
|          42532231:2|                     1|
|          47384083:1|                     1|
|          51953996:2|                     1|
|          68833237:2|                     1|
|          32073629:3|                     2|
|          38147670:5|                     2|
|          43393003:0|                     2|
|          49292152:2|                     2|
|          50649042:1|            

In [115]:
max_value = merged_df_composantes_partielles_unique_composantes_connexes_nettoye.agg(max("composantes_globale_id")).collect()[0][0]
max_value

10695

In [116]:
merged_df_composantes_partielles_unique_composantes_connexes_nettoye.count()

19454

In [117]:
distinct_count = merged_df_composantes_partielles_unique_composantes_connexes_nettoye.select("composantes_globale_id").distinct().count()

print("Nombre de valeurs distinctes dans la colonne ma_colonne :", distinct_count)

Nombre de valeurs distinctes dans la colonne ma_colonne : 4909
