In [1]:
import spacy
import spacy_component

nlp = spacy.load("en_core_web_sm")

nlp.add_pipe("rebel", after="senter", config={
    'device':0, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )


<spacy_component.RebelComponent at 0x7f7d633b8490>

In [19]:
import pickle

def save_kb(kb, filename):
    with open(filename, "wb") as f:
        pickle.dump(kb, f)

def load_kb(filename):
    res = None
    with open(filename, "rb") as f:
        res = pickle.load(f)
    return res

In [2]:
def divide_chunks(l, n):
     
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [3]:
def get_length(fn):
    ln = 0
    with open(fn, "r") as rf:
        r_text = rf.readlines()
        ln = len(r_text)
    return ln

In [4]:
# knowledge base class
class KB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head_span", "relation", "tail_span"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [5]:
! ls ../../text_processing/data/wiki_texts/ped_pt | wc -l

13615


In [6]:
data_path = "/home/xtest/projects/text_processing/data/"

In [7]:
import glob

In [8]:
file_names_wi_p = [f for f in glob.glob(f"{data_path}/wiki_texts/ped_pt/*.txt")]
print(len(file_names_wi_p))

13615


In [9]:
new_br = [f for f in glob.glob(f"{data_path}/britanica_texts/ped_pt/*.txt") if get_length(f) < 30000]
len(new_br)

992

In [10]:
x = list(divide_chunks(file_names_wi_p, 4600))
print(len(x))
print(len(x[0]))

3
4600


In [11]:
extracted_files = []

In [12]:
search_files = list(set(x[0]) - set(extracted_files))
len(search_files)

4600

In [15]:
from tqdm.notebook import tqdm

In [16]:
kb = KB()

In [17]:
count = 0
for fn in tqdm(search_files):
    count += 1
#     print(f"Processing file {count} of {len(search_files)}")
    with open(fn, 'r') as f:
        lines = [line for line in f.readlines() if line.strip()]
    for l in lines:
        doc = nlp(l)
        doc_list = nlp.pipe([l])
        for value, rel_dict in doc._.rel.items():
            # print(f"{value}: {rel_dict}")
            kb.add_relation(rel_dict)
    extracted_files.append(fn)

  0%|          | 0/4600 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
count

4600

In [21]:
! pwd

/home/xtest/projects/KnowledgeGraph/rebel


In [28]:
kb.relations[0]['relation']

'located in the administrative territorial entity'

In [35]:
new_rel_list = [
    {
        'relation': str(item['relation']),  # Keep the 'relation' unchanged
        'head': str(item['head_span']),     # Change 'head_span' key to 'head'
        'tail': str(item['tail_span'])      # Change 'tail_span' key to 'tail'
    }
    for item in kb.relations
]

In [36]:
new_rel_list

[{'relation': 'located in the administrative territorial entity',
  'head': 'Essex',
  'tail': 'East of England'},
 {'relation': 'contains administrative territorial entity',
  'head': 'East of England',
  'tail': 'Essex'},
 {'relation': 'shares border with',
  'head': 'Suffolk',
  'tail': 'Cambridgeshire'},
 {'relation': 'shares border with',
  'head': 'Suffolk',
  'tail': 'Greater London'},
 {'relation': 'shares border with',
  'head': 'Cambridgeshire',
  'tail': 'Suffolk'},
 {'relation': 'shares border with',
  'head': 'Cambridgeshire',
  'tail': 'Hertfordshire'},
 {'relation': 'shares border with',
  'head': 'Hertfordshire',
  'tail': 'Cambridgeshire'},
 {'relation': 'shares border with', 'head': 'Hertfordshire', 'tail': 'Kent'},
 {'relation': 'shares border with', 'head': 'Kent', 'tail': 'Hertfordshire'},
 {'relation': 'located in or next to body of water',
  'head': 'Kent',
  'tail': 'River Thames'},
 {'relation': 'shares border with', 'head': 'Kent', 'tail': 'Greater London'},
 

In [37]:
len(new_rel_list)

563653

In [38]:
import json
with open("kb_ped_wiki_0.json", "w") as json_file:
    json.dump(new_rel_list, json_file)

In [39]:
# Read the JSON data from the file
with open("kb_ped_wiki_0.json", "r") as json_file:
    loaded_data_0 = json.load(json_file)

In [40]:
with open("kb_ped_wiki_1.json", "r") as json_file:
    loaded_data_1 = json.load(json_file)

In [41]:
with open("kb_ped_wiki_2.json", "r") as json_file:
    loaded_data_2 = json.load(json_file)

In [50]:
import copy
all_relations = copy.deepcopy(loaded_data_0)
# all_relations

In [51]:
def dict_to_tuple(d):
    return tuple(sorted(d.items()))

In [52]:
# Combine the two lists
combined_list = loaded_data_0 + loaded_data_1 + loaded_data_2

# Convert list of dictionaries to a set of tuples
unique_tuples = set(map(dict_to_tuple, combined_list))

# Convert set of tuples back to a list of dictionaries
unique_combined_list = [dict(t) for t in unique_tuples]

In [53]:
len(unique_combined_list)

1404867

In [44]:
# def are_relations_equal(r1, r2):
#         return all(r1[attr] == r2[attr] for attr in ["head", "relation", "tail"])

# def exists_relation(r1):
#     return any(are_relations_equal(r1, r2) for r2 in all_relations)

# def add_relation(r):
#     if not exists_relation(r):
#         all_relations.append(r)

In [54]:
unique_combined_list

[{'head': 'Medical Payments', 'relation': 'subclass of', 'tail': 'insurance'},
 {'head': 'Georges Couthon', 'relation': 'family', 'tail': 'Couthon'},
 {'head': 'Hong Kong', 'relation': 'language used', 'tail': 'Chinese'},
 {'head': 'intro', 'relation': 'has part', 'tail': 'citation-needed tag'},
 {'head': 'Mark Taper Forum',
  'relation': 'located in the administrative territorial entity',
  'tail': 'Los Angeles'},
 {'head': 'Mekong', 'relation': 'basin country', 'tail': 'Cambodia'},
 {'head': 'Jack Rabbit Racer',
  'relation': 'end time',
  'tail': '1930.[citation'},
 {'head': 'Persistent vandalism',
  'relation': 'subclass of',
  'tail': 'disruptive edits'},
 {'head': 'Rhineland',
  'relation': 'dissolved, abolished or demolished date',
  'tail': '4 December 1918'},
 {'head': 'Filming',
  'relation': 'start time',
  'tail': 'September 23, 2006.[42'},
 {'head': 'Chogha Zanbil', 'relation': 'family', 'tail': 'Zanbil'},
 {'head': 'Annunciation', 'relation': 'has part', 'tail': "Mary's C

In [55]:
with open("kb_ped_wiki_all.json", "w") as json_file:
    json.dump(unique_combined_list, json_file)