In [6]:
%pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Using cached transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB ? eta 0:00:00
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Using cached huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.20.0->sentence_transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence_transf


[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd
import os
from pathlib import Path
import json
from module.utils import *
from sampling.mhsk_utils import *
import math
from tqdm import tqdm

# Load the SBERT model exactly once. The device is decided first so the weights are not
# loaded a second time when CUDA is available; device=None leaves the device choice to
# sentence-transformers, which is what the plain constructor call did.
cuda_available = torch.cuda.is_available()
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda" if cuda_available else None)

print(cuda_available)

# Locations for loading and saving data, all relative to the current working directory
project_folder = os.getcwd()
data_path = os.path.join(project_folder, "data", "FB15kET_sample")
result_folder = os.path.join(project_folder, "data_entity_metrics")


def _sample_file(filename):
    """Return the path of `filename` inside the sampled dataset folder (`data_path`)."""
    return os.path.join(data_path, filename)


# Ids for entities, relations, types and clusters
e2id = read_id(_sample_file('entities.tsv'))
r2id = read_id(_sample_file('relations.tsv'))
t2id = read_id(_sample_file('types.tsv'))
c2id = read_id(_sample_file('clusters.tsv'))

# Textual descriptions for entities, relations and types
e2desc, e2text = read_entity_wiki(_sample_file('entity_wiki.json'), e2id, 'hybrid')
r2text = read_rel_context(_sample_file('relation2text.txt'), r2id)
t2desc = read_type_context(_sample_file('hier_type_desc.txt'), t2id)

# Load KG data. Both files are tab-separated without a header row, so pandas labels
# the columns with integers (0, 1, 2, ...).
df_triples = pd.read_csv(os.path.join(data_path, "KG_train.txt"), sep='\t', header=None)
df_train = pd.read_csv(os.path.join(data_path, "ET_train.txt"), sep='\t', header=None)

# Decode the JSON explicitly as UTF-8: without `encoding`, open() falls back to the
# platform default, which is not UTF-8 on Windows and can fail or garble non-ASCII text.
with open(os.path.join(data_path, 'entity_wiki.json'), "r", encoding="utf-8") as f:
    entity_labels = json.load(f)

# Coherence metrics for entities are recomputed only when the metrics file is missing
# (presumably recompute_similarity writes it into result_folder -- verify against the helper)
entity_metrics_file = os.path.join(result_folder, "entity_metrics_sample.csv")
metrics_missing = not os.path.exists(entity_metrics_file)
if metrics_missing:
    recompute_similarity(df_triples, df_train, r2text, r2id, e2desc, e2id, t2desc, t2id, result_folder)

# Read the entity coherence metrics from disk
e_coherence = pd.read_csv(entity_metrics_file)

# Lower bounds: the 0.1 quantile of each degree column
lowerb_quantiles = e_coherence[['kg_degree', 'et_degree']].quantile(0.1)
low_kg_degree = lowerb_quantiles.loc['kg_degree']
low_et_degree = lowerb_quantiles.loc['et_degree']

# Keep only the entities that sit at or below both lower bounds
has_low_kg_degree = e_coherence["kg_degree"] <= low_kg_degree
has_low_et_degree = e_coherence["et_degree"] <= low_et_degree
entity_low_degree_df = e_coherence[has_low_kg_degree & has_low_et_degree]
entity_low_degree = entity_low_degree_df['entity'].to_list()

# Triples and (entity, type) pairs to add for low-degree entities, accumulated over all entities
entity_kg_2hop = []
entity_et_2hop = []
for entity in tqdm(entity_low_degree, total=len(entity_low_degree), desc="Processing entities", unit="entity"):

    # Current sentences
    kg_entity_text, _ = kg_sentences(df_triples, entity, r2text, r2id, e2desc, e2id)
    et_train_sentences, _ = et_sentences(df_train, entity, t2desc, t2id)
    entity_sentences = kg_entity_text + et_train_sentences

    # Number of 2-hop relations and types to add. The budget shrinks as the entity already
    # has more sentences; the trailing + 2 is added on top of the rounded value.
    n_rel = int(round(low_kg_degree * (1 - len(kg_entity_text) / (low_kg_degree+1))) + 2)
    n_type = int(round(low_et_degree*2* (1 - len(et_train_sentences) / (low_et_degree+1))) + 2)

    # 2-hop neighbor sentences
    kg_2hop, kg_2hop_sentences = two_hop_neighbors(df_triples, entity, r2text, r2id, e2desc, e2id)
    types_2hop, et_txt_2hop = two_hop_types(df_triples, df_train, entity, t2desc, t2id)

    # 2-hop kg neighbor and type with highest average similarity score
    kg_top_2hop = max_sim_2hop(entity_sentences, kg_2hop_sentences, kg_2hop, n_rel)
    et_top_2hop = max_sim_2hop(entity_sentences, et_txt_2hop, types_2hop, n_type, kg=False)

    # Store best results for additional information. '-' keeps the current entity in the
    # first position, 'inv' puts it last; any other direction value is ignored.
    for relation, entity2, direction in kg_top_2hop:
        if direction == '-':
            entity_kg_2hop.append((entity, relation, entity2))
        elif direction == 'inv':
            entity_kg_2hop.append((entity2, relation, entity))

    # A generator expression is used so that no loop variable named `type` is bound at
    # module level, which would shadow the builtin for the rest of the kernel session.
    entity_et_2hop.extend((entity, type_2hop) for type_2hop in et_top_2hop)

# Saved once after the loop: the list is cumulative, so saving inside the loop passed every
# earlier triple to the writer again on each iteration. The guard keeps the previous
# behaviour of not calling the writer at all when there were no entities to process.
if entity_low_degree:
    save_entity_kg_2hop(entity_kg_2hop, os.path.join(data_path, 'relation2hop.tsv'))


# Upper bounds: the 0.90 quantile of each degree column
upperb_quantiles = e_coherence[['kg_degree', 'et_degree']].quantile(0.90)
high_kg_degree = upperb_quantiles.loc['kg_degree']
high_et_degree = upperb_quantiles.loc['et_degree']

# Keep only the entities that are strictly above both upper bounds
has_high_kg_degree = e_coherence["kg_degree"] > high_kg_degree
has_high_et_degree = e_coherence["et_degree"] > high_et_degree
entity_high_degree_df = e_coherence[has_high_kg_degree & has_high_et_degree]
entity_high_degree = entity_high_degree_df['entity'].to_list()

# Per-entity removed rows are collected in lists and concatenated once after the loop;
# concatenating inside the loop re-copies the whole accumulated frame on every iteration.
kg_removed_frames = []
et_removed_frames = []

for entity in tqdm(entity_high_degree, total=len(entity_high_degree), desc="Processing entities", unit="entity"):

    # Current sentences
    kg_entity_text, neighbors = kg_sentences(df_triples, entity, r2text, r2id, e2desc, e2id, filter=False)
    et_train_sentences, et_train = et_sentences(df_train, entity, t2desc, t2id)
    entity_sentences = kg_entity_text + et_train_sentences

    # Number of relationships and types to remove: 10% of the current sentences, rounded up
    n_kg_remove = int(math.ceil(len(kg_entity_text)*0.1))
    n_et_remove = int(math.ceil(len(et_train_sentences)*0.1))

    # Remove noisy neighbors through similarity score
    kg_train_removed, et_train_removed = remove_noisy_neighbors(kg_entity_text, neighbors, et_train_sentences, et_train, n_kg_remove, n_et_remove)

    # Store removed results
    kg_removed_frames.append(kg_train_removed)
    et_removed_frames.append(et_train_removed)

# removed results dataframe; pd.concat rejects an empty list, so fall back to the same
# empty frame the accumulators used to start from when no entity was processed
kg_train_removed_df = pd.concat(kg_removed_frames, axis=0, ignore_index=True) if kg_removed_frames else pd.DataFrame()
et_train_removed_df = pd.concat(et_removed_frames, axis=0, ignore_index=True) if et_removed_frames else pd.DataFrame()

# Update KG_train and ET_train without noise relations.
# Anti-join: after a left merge with indicator=True, rows marked 'left_only' have no match
# among the removed rows and are the ones to keep.
# An empty removed frame may have no columns at all, in which case the merge has no key
# columns to work with; nothing was removed then, so the training frame is kept as is.
if kg_train_removed_df.empty:
    kg_train_new = df_triples.copy()
else:
    kg_train_new = df_triples.merge(kg_train_removed_df, on=[0, 1, 2], how='left', indicator=True)
    kg_train_new = kg_train_new[kg_train_new['_merge'] == 'left_only'].drop(columns=['_merge'])

if et_train_removed_df.empty:
    et_train_new = df_train.copy()
else:
    # No `on=` here: the merge uses every column the two frames have in common
    et_train_new = df_train.merge(et_train_removed_df, how='left', indicator=True)
    et_train_new = et_train_new[et_train_new['_merge'] == 'left_only'].drop(columns=['_merge'])

# Turn the collected 2-hop additions into dataframes with integer column labels,
# the same labels the header-less train files get when they are read
kg_train_2hop = pd.DataFrame(entity_kg_2hop, columns=[0, 1, 2])
et_train_2hop = pd.DataFrame(entity_et_2hop, columns=[0, 1])

# Final processed train files: the filtered train rows followed by the 2-hop additions,
# renumbered from zero
kg_train_processed = pd.concat([kg_train_new, kg_train_2hop], axis=0, ignore_index=True)
et_train_processed = pd.concat([et_train_new, et_train_2hop], axis=0, ignore_index=True)

# Write both processed train files, tab-separated with neither header nor index,
# into the 2-hop sample folder (created when it does not exist yet)
data_sample_dir_2hop = os.path.join(project_folder, "data", "FB15kET_sample_2hop")
os.makedirs(data_sample_dir_2hop, exist_ok=True)
for processed_frame, target_name in (
    (kg_train_processed, "KG_train.txt"),
    (et_train_processed, "ET_train.txt"),
):
    processed_frame.to_csv(os.path.join(data_sample_dir_2hop, target_name), sep='\t', header=False, index=False)


NameError: name 'init_empty_weights' is not defined

In [12]:
%pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
