## Takes a folder of text files as input and writes, for each one, a text file containing the extracted triplets (serialized as JSON)


test enrichissement


In [2]:
from transformers import pipeline
import os
import re
import json

# Build the REBEL text2text-generation pipeline once at module level;
# process_text reuses this object (and its tokenizer) for every chunk.
triplet_extractor = pipeline(
    task='text2text-generation',
    model='Babelscape/rebel-large',
    tokenizer='Babelscape/rebel-large',
)

# Normalized relation label -> extracted relation labels that
# remap_relations() rewrites to that normalized label.
relation_mapping = {
    "participated in": [
        "competed in",
        "participant in",
        "sports discipline competed in",
    ],
    "sport": [
        "sports discipline",
        "field of work",
    ],
    "located in": [
        "located in or next to body of water",
        "located in the administrative territorial entity",
    ],
    "point in time": [
        "follows",
        "followed by",
        "point in time",
    ],
    "instance of": [
        "is a list of",
        "instance of",
    ],
}

# Function to chunk text into smaller parts
def chunk_text_by_sentences(text, sentences_per_chunk=3):
    """Split *text* into chunks of at most *sentences_per_chunk* sentences.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'. Consecutive sentences are re-joined with a single space.

    Args:
        text: The text to split. Leading/trailing whitespace is ignored.
        sentences_per_chunk: Maximum number of sentences per chunk (>= 1).

    Returns:
        A list of chunk strings, in document order. Empty when *text*
        contains no non-whitespace characters, so callers never receive
        an empty chunk.

    Raises:
        ValueError: If *sentences_per_chunk* is smaller than 1.
    """
    # A non-positive step would either crash inside range() (0) or silently
    # drop the whole text (negative), so reject it explicitly.
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be a positive integer")

    stripped = text.strip()
    if not stripped:
        # re.split('') would yield [''] and produce one empty chunk.
        return []

    sentences = re.split(r'(?<=[.!?])\s+', stripped)
    return [
        ' '.join(sentences[start:start + sentences_per_chunk])
        for start in range(0, len(sentences), sentences_per_chunk)
    ]

# Function to extract triplets from the output text
def extract_triplets(text):
    """Parse decoded model output into a list of triplet dicts.

    The occurrences of "<s>", "<pad>" and "</s>" are removed, then the text
    is scanned word by word. The markers "<triplet>", "<subj>" and "<obj>"
    switch which field the following words are appended to:
    words after "<triplet>" go to 'head', words after "<subj>" go to 'tail',
    and words after "<obj>" go to 'type'. Words seen before any marker are
    ignored.

    Returns:
        A list of dicts with the keys 'head', 'type' and 'tail' (in that
        order), each value being the space-joined words of that field.
    """
    cleaned = text
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    # Words collected so far for each field of the triplet being built.
    fields = {'head': [], 'type': [], 'tail': []}
    active = None  # field currently receiving words; None before any marker
    found = []

    def snapshot():
        # Freeze the current field contents into an output record.
        return {name: ' '.join(words) for name, words in fields.items()}

    for word in cleaned.split():
        if word == "<triplet>":
            active = 'head'
            # A pending relation means the previous triplet is complete.
            if fields['type']:
                found.append(snapshot())
                fields['type'] = []
            fields['head'] = []
        elif word == "<subj>":
            active = 'tail'
            # Same head, new tail: emit the previous one first. The relation
            # is deliberately left untouched here; "<obj>" resets it.
            if fields['type']:
                found.append(snapshot())
            fields['tail'] = []
        elif word == "<obj>":
            active = 'type'
            fields['type'] = []
        elif active is not None:
            fields[active].append(word)

    # Emit the trailing triplet only when all three fields were filled.
    if all(fields.values()):
        found.append(snapshot())
    return found

# Function to remap relations
def remap_relations(triplets, mapping):
    """Rewrite each triplet's 'type' to its normalized relation label.

    Args:
        triplets: List of dicts with a 'type' key; modified in place.
        mapping: Dict of normalized label -> collection of labels that
            should be rewritten to it. When several entries match, the
            first one in the dict's iteration order is used.

    Returns:
        The same *triplets* list that was passed in. Triplets whose 'type'
        matches no entry are left unchanged.
    """
    for record in triplets:
        canonical = next(
            (label for label, aliases in mapping.items() if record['type'] in aliases),
            None,
        )
        if canonical is not None:
            record['type'] = canonical
    return triplets

# Function to process text and extract triplets
def process_text(text, sentences_per_chunk=1, num_beams=10, max_length=256):
    """Extract relation triplets from *text*, one chunk at a time.

    The text is split with chunk_text_by_sentences(); each chunk is passed
    to the module-level triplet_extractor pipeline, the generated token ids
    are decoded with the pipeline's tokenizer, and the decoded string is
    parsed by extract_triplets(). The combined result is normalized with
    remap_relations() using relation_mapping.

    Args:
        text: Input text.
        sentences_per_chunk: Number of sentences sent to the model per call.
        num_beams: Forwarded to the pipeline call.
        max_length: Forwarded to the pipeline call.

    Returns:
        A list of triplet dicts ('head', 'type', 'tail'); duplicates are
        not removed here.
    """
    collected = []
    for piece in chunk_text_by_sentences(text, sentences_per_chunk=sentences_per_chunk):
        # Request raw token ids rather than text so the decoding below is
        # done explicitly with the tokenizer.
        generation = triplet_extractor(
            piece,
            return_tensors=True,
            return_text=False,
            num_beams=num_beams,
            early_stopping=False,
            max_length=max_length,
        )
        token_ids = generation[0]["generated_token_ids"]
        decoded = triplet_extractor.tokenizer.batch_decode([token_ids])
        collected += extract_triplets(decoded[0])
    return remap_relations(collected, relation_mapping)

# Remove duplicate triplets
def remove_duplicate_triplets(triplets):
    """Return *triplets* with duplicates removed.

    Two triplets are duplicates when they have the same set of key/value
    pairs. The result is ordered by first occurrence; for each group of
    duplicates, the dict object that is kept is the last one encountered.
    """
    by_signature = {}
    for entry in triplets:
        # frozenset of items gives a hashable, order-independent signature.
        by_signature[frozenset(entry.items())] = entry
    return [*by_signature.values()]

if __name__ == "__main__":
    # Both folders are resolved relative to the current working directory.
    working_dir = os.getcwd()
    input_folder = os.path.join(working_dir, "modified_text")
    output_folder = os.path.join(working_dir, "output_triplets")
    os.makedirs(output_folder, exist_ok=True)

    if os.path.exists(input_folder):
        for file_name in os.listdir(input_folder):
            # Only plain-text inputs are processed.
            if not file_name.endswith(".txt"):
                continue

            input_path = os.path.join(input_folder, file_name)
            output_path = os.path.join(output_folder, "triplets_" + file_name)

            with open(input_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()

            unique_triplets = remove_duplicate_triplets(process_text(text))

            # The result is written as JSON even though the output file
            # keeps the input's .txt name.
            with open(output_path, 'w', encoding='utf-8') as output_file:
                json.dump(unique_triplets, output_file, ensure_ascii=False, indent=4)

            print(f"Processed {file_name} and saved results to {output_path}")
    else:
        print(f"Input folder not found: {input_folder}")


Device set to use cuda:0


Processed page_18.txt and saved results to /content/output_triplets/triplets_page_18.txt


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed page_8.txt and saved results to /content/output_triplets/triplets_page_8.txt
Processed page_16.txt and saved results to /content/output_triplets/triplets_page_16.txt
Processed page_22.txt and saved results to /content/output_triplets/triplets_page_22.txt
Processed page_15.txt and saved results to /content/output_triplets/triplets_page_15.txt
Processed page_11.txt and saved results to /content/output_triplets/triplets_page_11.txt
Processed page_13.txt and saved results to /content/output_triplets/triplets_page_13.txt
Processed page_20.txt and saved results to /content/output_triplets/triplets_page_20.txt
Processed page_10.txt and saved results to /content/output_triplets/triplets_page_10.txt
