In [7]:
import networkx as nx
import random

def load_graph_from_txt(file_path):
    G = nx.DiGraph() 
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 3:
                head, relation, tail = parts
                G.add_edge(head, tail, relation=relation)
    return G


def directed_random_walk_sampling(graph, seed_nodes, walk_length=8, sample_size=20000, p_restart=0.7):

    """
    在有向图上执行改进的随机游走采样
    - 避免走到无出度节点时卡住
    - 允许一定概率随机跳跃到新的种子节点
    """
    sampled_nodes = set(seed_nodes)

    for node in seed_nodes:
        current_node = node
        for _ in range(walk_length):
            if random.random() < p_restart:
                current_node = random.choice(seed_nodes)  # 以一定概率重置到起始节点
            else:
                neighbors = list(graph.successors(current_node))  # 只看出边
                if not neighbors:
                    break  # 没有出边就终止当前游走
                current_node = random.choice(neighbors)

            sampled_nodes.add(current_node)
            if len(sampled_nodes) >= sample_size:
                break
        if len(sampled_nodes) >= sample_size:
            break

    return graph.subgraph(sampled_nodes)

G=load_graph_from_txt("KG_train.txt")
num_hubs = 300  #inclure ici les n premiers grand noeud qui porte beaucoup darc sortant.
high_outdegree_nodes = sorted(G.out_degree, key=lambda x: x[1], reverse=True)[:num_hubs]
hub_seeds = [node for node, _ in high_outdegree_nodes]
print(hub_seeds)
valid_seeds = [node for node in G.nodes() if len(list(G.successors(node))) > 0]
seed_nodes1 = random.sample(valid_seeds, 500)  # ici on choisit les nombres voulus des autres point de part.
seed_nodes = hub_seeds + seed_nodes1# la somme des des deux noeuds initiaux forme la taille de graphe. vous pouvez changer comme vous voulez



G_sampled = directed_random_walk_sampling(G, seed_nodes, walk_length=5, sample_size=20000, p_restart=0.7)# ici vous choisiez les parametres de echantilloner
#walk length plus grand vous aurez le graph plus global
#p restard pour resortir quand on rencontre un chemin termine ou un entite avec beaucoup darc sortant.



print(f"Graph sampled has {G_sampled.number_of_nodes()} nodes and {G_sampled.number_of_edges()} edges")
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")



def save_triplets_to_txt(edges, file_path="triplets.txt"):
    with open(file_path, "w", encoding="utf-8") as f:
        for head, relation, tail in edges:
            f.write(f"{head}\t{relation}\t{tail}\n")

edges = [(u, d["relation"], v, ) for u, v, d in G_sampled.edges(data=True)]

save_triplets_to_txt(edges)

['/m/08mbj5d', '/m/04ztj', '/m/08mbj32', '/m/02hrh1q', '/m/09c7w0', '/m/07s9rl0', '/m/01d_h8', '/m/0dxtg', '/m/0ch6mp2', '/m/02vxfw_', '/m/09zzb8', '/m/02sdk9v', '/m/02nzb8', '/m/02_j1w', '/m/02r96rf', '/m/0342h', '/m/09vw2b7', '/m/09l65', '/m/0dgrmp', '/m/02jknp', '/m/060c4', '/m/05p553', '/m/07ssc', '/m/09jwl', '/m/03gjzk', '/m/014mlp', '/m/01jfsb', '/m/05r5c', '/m/0gq9h', '/m/02_286', '/m/09sb52', '/m/02hcv8', '/m/05148p4', '/m/0147fv', '/m/02kdv5l', '/m/0cbd2', '/m/02l7c8', '/m/06by7', '/m/0gs9p', '/m/04n6k', '/m/02vx4', '/m/0gqy2', '/m/018vs', '/m/03k9fj', '/m/0dxtw', '/m/0nbcg', '/m/02h4rq6', '/m/0f4x7', '/m/0ckd1', '/m/0gr4k', '/m/086k8', '/m/019v9k', '/m/0np9r', '/m/064t9', '/m/01vx2h', '/m/019f4v', '/m/041rx', '/m/0gqyl', '/m/01pvkk', '/m/01g63y', '/m/0gq_v', '/m/02hnl', '/m/01hmnh', '/m/04xvlr', '/m/040njc', '/m/0gqwc', '/m/0l14md', '/m/01z4y', '/m/0gr51', '/m/03bx0bm', '/m/0kyk', '/m/0dz3r', '/m/0k611', '/m/016z4k', '/m/0gr0m', '/m/0lsxr', '/m/018gz8', '/m/02_xgp2', '/m/0l8z

In [7]:
import pickle

def load_pkl_preview(file_path, num_elements=10):
    with open(file_path, "rb") as f:
        data = pickle.load(f)

    print(f"Type des données : {type(data)}")

    if isinstance(data, list) or isinstance(data, tuple):
        print(f"Nombre total d'éléments : {len(data)}")
        print(f"Aperçu ({num_elements} premiers éléments) : {data[:num_elements]}")

    elif isinstance(data, dict):
        keys = list(data.keys())[:num_elements]
        print(f"Clés principales : {keys}")
        print(f"Aperçu des valeurs : {[data[k] for k in keys]}")

    else:
        print("Type de données non géré, affichage brut :")
        print(str(data)[:1000]) 

file_path = "LMET_train.pkl" 
load_pkl_preview(file_path)


Type des données : <class 'list'>
Nombre total d'éléments : 14808
Aperçu (10 premiers éléments) : [([[4606, 1663, 16272], [4606, 1663, 18198], [4606, 1669, 18401], [4606, 1368, 15578], [4606, 1469, 17326], [4606, 1401, 15034], [4606, 1663, 16936], [4606, 1358, 15023], [4606, 1358, 18087], [4606, 1469, 15916], [4606, 1669, 18300]], [[4606, 1180, 10783], [4606, 2716, 10783], [4606, 337, 8135], [4606, 3618, 8135], [4606, 899, 8135], [4606, 537, 5864], [4606, 2716, 9766], [4606, 1180, 9766], [4606, 3702, 7240], [4606, 1177, 7240], [4606, 549, 5806], [4606, 2953, 5806], [4606, 3672, 11999], [4606, 546, 11999], [4606, 1205, 2444], [4606, 2586, 2444], [4606, 195, 7434], [4606, 1318, 7434], [4606, 84, 7434], [4606, 1168, 7434], [4606, 537, 11342], [4606, 3702, 10091], [4606, 1177, 10091], [4606, 899, 14810], [4606, 3618, 14810], [4606, 1180, 10465], [4606, 2716, 10465], [4606, 2586, 5310], [4606, 1205, 5310], [4606, 899, 8688], [4606, 3618, 8688], [4606, 2516, 10908], [4606, 101, 10908], [4606

In [9]:
import numpy as np

def load_npy_preview(file_path, num_elements=5):
    data = np.load(file_path, allow_pickle=True)  # Charge le fichier

    print(f"Type des données : {type(data)}")
    print(f"Shape : {data.shape}, Dtype : {data.dtype}")

    # Afficher un extrait en fonction de la dimension
    if data.ndim == 1:  # Tableau 1D
        print(f"Aperçu ({num_elements} premiers éléments) :\n{data[:num_elements]}")
    elif data.ndim == 2:  # Matrice 2D
        print(f"Aperçu ({num_elements} premières lignes) :\n{data[:num_elements, :]}")
    elif data.ndim == 3:  # Tensor 3D (ex. images)
        print(f"Aperçu de la première tranche :\n{data[0]}")
    else:
        print("Les données ont plus de 3 dimensions, affichage limité.")
        print(f"Aperçu : {data.flat[:num_elements]}")  # Affichage partiel

# Exemple d'utilisation
file_path = "ent2pair.npy"  # Remplace par ton fichier
load_npy_preview(file_path)


Type des données : <class 'numpy.ndarray'>
Shape : (14951,), Dtype : object
Aperçu (5 premiers éléments) :
[list([[463, 3340], [77, 980], [822, 2380], [131, 2145], [131, 1218], [318, 1395], [463, 1022], [8, 2525], [131, 2582], [13, 1304]])
 list([[23, 3070], [23, 1839], [1074, 3554], [23, 1159], [150, 2132], [23, 2055], [23, 3125], [23, 627], [58, 3386]])
 list([[172, 837], [24, 606]])
 list([[182, 2350], [8, 1847], [56, 83], [78, 363], [24, 2336], [182, 577], [24, 34], [44, 3345], [8, 68], [476, 1064], [31, 1333], [567, 2097]])
 list([[44, 2873], [44, 3481], [134, 2572]])]


In [None]:
def filter_et_train(sampled_triplets_file, et_train_file, output_file):
    entities_in_sampled = set()
    with open(sampled_triplets_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 3:
                entities_in_sampled.add(parts[0])  

    with open(et_train_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            parts = line.strip().split("\t")
            if len(parts) == 2 and parts[0] in entities_in_sampled:  
                fout.write(line)

filter_et_train("triplets.txt", "ET_train.txt", "et_train_sampled.txt")


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
No module named 'tensorflow'