# Importation des bibliothéques

In [9]:
import pandas as pd
import glob

# Lecture des données

In [10]:
parquet_folder = "/data/bd/dataset/proteine/80_80/G99/graph/"

# Lire tous les fichiers Parquet dans une liste de DataFrames
dfs = [pd.read_parquet(fichier) for fichier in glob.glob(parquet_folder + '*.parquet')]

# Concaténer les DataFrames en un seul DataFrame
df = pd.concat(dfs, ignore_index=True)

# Afficher les premières lignes du DataFrame
df.head()

Unnamed: 0,query_id,query_length,target_id,target_length,match_length,percent_identity,e_value,relative_sim
0,40910035:3,86,30648403:2,1486,86,100.0,1.2999999999999998e-38,1.0
1,40910035:3,86,40725366:3,484,86,100.0,1.2999999999999998e-38,1.0
2,40910035:3,86,40725367:5,323,86,100.0,1.2999999999999998e-38,1.0
3,40910035:3,86,40787900:2,368,86,100.0,1.2999999999999998e-38,1.0
4,40910042:0,149,103459895:5,69,69,100.0,1.5e-28,1.0


In [11]:
df = df[['target_id', 'query_id']]
df.shape

(359798439, 2)

# Partionnement des arêtes en plusieurs sous ensembles

In [12]:
# Nombre de partitions
num_partitions = 200

# Ajout d'une nouvelle colonne 'partition' basée sur le modulo de seqID1
df['partition'] = df['target_id'].apply(lambda x: hash(x) % num_partitions)

# Afficher le DataFrame partitionné
df

Unnamed: 0,target_id,query_id,partition
0,30648403:2,40910035:3,4
1,40725366:3,40910035:3,180
2,40725367:5,40910035:3,55
3,40787900:2,40910035:3,28
4,103459895:5,40910042:0,67
...,...,...,...
359798434,TARA_AON_82_MAG_00305_000000004762.14.1,56790170:2,40
359798435,TARA_MED_95_MAG_00461_000000006687.16.1,56790170:2,160
359798436,METdb_00073-1-Transcript-7395.p2,56790187:3,177
359798437,METdb_00339-1-Transcript-12170.p2,56790187:3,35


In [13]:
partition_freq = df.groupby('partition').size().reset_index(name='count')
partitions = partition_freq['partition'].unique()
for partition in partitions:
    df_partition = df[df['partition'] == partition]
    file_path = f"/data/bd/dataset/proteine/80_80/G99/graph_nodeID_only_partitionned_200/partition_{partition}.parquet"
    df_partition.to_parquet(file_path, index=False, engine='pyarrow')

In [14]:
partition_freq

Unnamed: 0,partition,count
0,0,1725909
1,1,1862217
2,2,1774525
3,3,1864658
4,4,1743204
...,...,...
195,195,1791122
196,196,1738911
197,197,1755436
198,198,1726996
