# Process GPAHE data

In [1]:
import os
import pandas as pd
import re
import spacy
from IPython.display import Image
from wikidata.client import Client
from IPython.display import clear_output

In [2]:
# Load the scraped symbol dictionary and transpose it after reading.
# Later cells iterate over the rows of `data_df` and read the columns
# "Title", "Description", "Ideology" and "Location".
DATA_FILE = "../../data/scraped_symbol_dict.json"
data_df = pd.read_json(DATA_FILE).T

In [3]:
def squeal(text=None):
    """Replace the current cell output with an optional status message.

    The existing output is cleared with ``clear_output(wait=True)``; when
    `text` is given (anything other than ``None``) it is printed afterwards.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## 1. Cluster hate symbols

Use the scikit-learn `TfidfVectorizer` documentation as an example: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
import matplotlib.pyplot as plt
import regex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
def get_text_data(data_df):
    """Build one tokenized text string per row of `data_df`.

    For every row the "Description" is followed by the "Ideology" and
    "Location" values, each introduced by ". ideology is " / ". location is ".
    Every comma inside those two fields is replaced by " . ideology is" /
    " . location is". The combined string is tokenized with `word_tokenize`
    and the tokens are re-joined with single spaces.

    Returns:
        list of str, one entry per row, in row order.
    """
    texts = []
    for _, record in data_df.iterrows():
        description = record["Description"]
        ideology_part = regex.sub(",", " . ideology is", record["Ideology"])
        location_part = regex.sub(",", " . location is", record["Location"])
        combined = " ".join([
            description,
            ". ideology is " + ideology_part,
            ". location is " + location_part,
        ])
        texts.append(" ".join(word_tokenize(combined)))
    return texts

Interesting words for clustering:
* chapter
* club/klub
* group
* organization
* party
* proud (boys)

In [None]:
def vectorize_text_data(texts, nbr_of_dimensions=5):
    """Fit a truncated SVD on the transposed TF-IDF matrix of `texts`.

    Args:
        texts: iterable of strings passed to `TfidfVectorizer.fit_transform`.
        nbr_of_dimensions: number of SVD components to keep.

    Returns:
        The fitted `TruncatedSVD` object (not transformed data); callers read
        its `components_` attribute.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    svd = TruncatedSVD(n_components=nbr_of_dimensions, n_iter=5, random_state=42)
    # The matrix is transposed before fitting, so the fitted components are
    # indexed by input text rather than by vocabulary term.
    svd.fit(tfidf_matrix.T)
    return svd

In [None]:
def visualize(texts, vectorized_data, labels, dimension_1=0, dimension_2=1):
    """Scatter-plot two SVD components, colored by text content.

    Args:
        texts: the texts that were vectorized; used to pick a color per point.
        vectorized_data: fitted object exposing `components_`.
        labels: dict mapping color name to legend label.
        dimension_1: index of the component plotted on the x axis.
        dimension_2: index of the component plotted on the y axis.

    Raises:
        ValueError: if either dimension is not a valid index into
            `vectorized_data.components_`.

    Side effects:
        Saves the figure as "gpahe_process.png" and shows it.
    """
    try:
        x = vectorized_data.components_[dimension_1]
        y = vectorized_data.components_[dimension_2]
    except (IndexError, TypeError) as error:
        # Only indexing problems are translated; anything else (for example
        # an interrupt or a missing attribute) is left to propagate.
        raise ValueError(f"invalid pair of dimensions ({dimension_1}, {dimension_2})") from error
    split_data, labels = split_data_by_content(x, y, texts, labels)
    plt.figure(figsize=(24, 12))
    # Colors are drawn in reverse alphabetical order; the legend labels below
    # are listed in the same order so that they line up with the scatter sets.
    for color in sorted(split_data.keys(), reverse=True):
        plt.scatter(split_data[color][0], split_data[color][1], c=color, label=color, alpha=0.5)
    # Annotate every point with its position in the data.
    for i, point in enumerate(zip(x, y)):
        plt.annotate(str(i), point)
    plt.legend(labels=[label for color, label in sorted(labels.items(), reverse=True)])
    plt.title(f"{len(x)} hate symbols clustered by description, ideology and location")
    plt.savefig("gpahe_process.png")
    plt.show()

In [None]:
# Search patterns that decide the color of a data point. They are tried in
# the order red, orange, yellow, green; points matching none get `other_color`.
red_token = "proud boys"
orange_token = "chapter"
yellow_token = "club"
green_token = "group"
other_color = "blue"
labels = { "yellow": yellow_token, "red": red_token, "orange": orange_token, "green": green_token, other_color: "other" }


def split_data_by_content(x, y, texts, labels):
    """Group points by the first color token found in their text.

    Args:
        x: sequence of x coordinates, one per text.
        y: sequence of y coordinates, one per text.
        texts: sequence of strings searched (case-insensitively) for the tokens.
        labels: dict mapping color name to legend label. It is not modified.

    Returns:
        A tuple ``(split_data, counted_labels)``: `split_data` maps each color
        to ``[x_values, y_values]``; `counted_labels` is a new dict with the
        same keys as `labels` whose values have " (<number of points>)"
        appended.
    """
    # Priority order matters: a text containing several tokens gets the color
    # of the first token in this sequence.
    token_by_color = (
        ("red", red_token),
        ("orange", orange_token),
        ("yellow", yellow_token),
        ("green", green_token),
    )
    split_data = { other_color: [[], []], "green": [[], []], "orange": [[], []], "red": [[], []], "yellow": [[], []] }
    for x_value, y_value, text in zip(x, y, texts):
        color = other_color
        for candidate_color, token in token_by_color:
            if regex.search(token, text, regex.IGNORECASE):
                color = candidate_color
                break
        split_data[color][0].append(x_value)
        split_data[color][1].append(y_value)
    # Build a fresh dict so repeated calls do not keep appending counts to the
    # caller's label texts.
    counted_labels = {
        color: f"{label} ({len(split_data[color][0])})"
        for color, label in labels.items()
    }
    return split_data, counted_labels

In [None]:
# Build the texts, fit the SVD and plot components 1 and 3.
# A copy of `labels` is passed so the module-level label texts are never
# altered by the plotting code.
texts = get_text_data(data_df)
vectorized_data = vectorize_text_data(texts)
visualize(texts, vectorized_data, labels.copy(), dimension_1=1, dimension_2=3)

In [None]:
# data_df.iloc[422]["Description"]

## 2. Make a knowledge base

* https://medium.com/nlplanet/building-a-knowledge-base-from-texts-a-full-practical-example-8dbbffb912fa
* https://neo4j.com/blog/text-to-knowledge-graph-information-extraction-pipeline/
* https://neo4j.com/developer-blog/construct-knowledge-graphs-unstructured-text/
* https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a

## 3. Extract entities with Spacy

In [None]:
# Entity label names that are counted one by one further below.
SPACY_FIELD_NAMES = "CARDINAL DATE EVENT FAC GPE LANGUAGE LAW LOC MONEY NOMINAL NORP ORDINAL ORG PERCENT PERSON PRODUCT QUANTITY TIME WORK_OF_ART".split()

# spaCy pipeline used for the entity extraction in this section.
nlp = spacy.load('en_core_web_sm') 

In [None]:
# Groups of entity labels that count_entity_groups() reports together,
# keyed by the group name passed to that function.
field_values = { "organizations": ["NORP", "ORG" ],
                 "locations": [ "FAC", "GPE", "LOC", ], 
                 "events": [ "EVENT", ],
                 "dates": [ "DATE", "TIME", ], }

In [None]:
def get_entities(nlp_analysis):
    """Collect the entity texts of an analysed document, grouped by label.

    Args:
        nlp_analysis: object whose `ents` items expose `label_` and `text`.

    Returns:
        dict mapping each label to the list of entity texts carrying it, in
        order of appearance (repeated texts are kept).
    """
    grouped = {}
    for span in nlp_analysis.ents:
        grouped.setdefault(span.label_, []).append(span.text)
    return grouped

In [None]:
def nlp_analysis(texts, nlp):
    """Run `nlp` on every text and return the grouped entities per text.

    Returns:
        list with one `get_entities` result per input text, in input order.
    """
    return [get_entities(nlp(text)) for text in texts]

In [None]:
def count_entity_groups(entity_data, entity_group_name, n=10):
    """Print the `n` most frequent entity texts of one label group.

    Args:
        entity_data: list of dicts mapping entity label to a list of texts.
        entity_group_name: key of `field_values` naming the labels to include.
        n: number of most frequent texts to show.

    The printed line holds the group name, the total number of counted
    mentions, and ``[text, count]`` pairs sorted by descending count.
    """
    values = {}
    for entities in entity_data:
        for entity_name, mentions in entities.items():
            if entity_name not in field_values[entity_group_name]:
                continue
            for entity in mentions:
                values[entity] = values.get(entity, 0) + 1
    ranked = sorted(values.items(), key=lambda item: item[1], reverse=True)
    print(f"{entity_group_name} ({sum(values.values())}):", [[key, value] for key, value in ranked][:n])

In [None]:
def count_entities(entity_data, target_entity_name, n=10):
    """Print the `n` most frequent entity texts carrying one label.

    Args:
        entity_data: list of dicts mapping entity label to a list of texts.
        target_entity_name: the label whose texts are counted.
        n: number of most frequent texts to show.

    The printed line holds the label, the total number of counted mentions,
    and ``[text, count]`` pairs sorted by descending count.
    """
    values = {}
    for entities in entity_data:
        for entity in entities.get(target_entity_name, []):
            values[entity] = values.get(entity, 0) + 1
    ranked = sorted(values.items(), key=lambda item: item[1], reverse=True)
    print(f"{target_entity_name} ({sum(values.values())}):", [[key, value] for key, value in ranked][:n])

In [None]:
# Extract the entities of every text built in section 1.
entity_data = nlp_analysis(texts, nlp)

In [None]:
# Print the most frequent entity texts for every label, one line per label.
for field_name in SPACY_FIELD_NAMES:
    count_entities(entity_data, field_name)

In [None]:
# Most frequent texts labeled NORP or ORG.
count_entity_groups(entity_data, "organizations")

In [None]:
# Most frequent texts labeled FAC, GPE or LOC.
count_entity_groups(entity_data, "locations")

In [None]:
# Most frequent texts labeled EVENT.
count_entity_groups(entity_data, "events")

In [None]:
# Most frequent texts labeled DATE or TIME.
count_entity_groups(entity_data, "dates")

## 4. Extract entities with REBEL

Based on blog by Fabio Chiusano: https://medium.com/nlplanet/building-a-knowledge-base-from-texts-a-full-practical-example-8dbbffb912fa

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network

In [None]:
# Load the tokenizer and seq2seq model "Babelscape/rebel-large"; both are used
# as module-level globals by from_small_text_to_kb().
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [None]:
def extract_relations_from_model_output(text):
    """Parse a decoded model output string into relation dicts.

    The markers "<s>", "<pad>" and "</s>" are removed first. The remaining
    whitespace-separated tokens are read as a sequence of
    ``<triplet> head <subj> tail <obj> type`` groups; a head may be followed
    by several ``<subj> ... <obj> ...`` pairs. Tokens before the first marker
    are ignored.

    Returns:
        list of dicts with the keys 'head', 'type' and 'tail'.
    """
    def as_relation(head_tokens, type_tokens, tail_tokens):
        return {
            'head': ' '.join(head_tokens),
            'type': ' '.join(type_tokens),
            'tail': ' '.join(tail_tokens)
        }

    relations = []
    subject_tokens, relation_tokens, object_tokens = [], [], []
    target = None  # list that currently receives plain tokens
    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: flush the relation collected so far.
            if relation_tokens:
                relations.append(as_relation(subject_tokens, relation_tokens, object_tokens))
                relation_tokens = []
            subject_tokens = []
            target = subject_tokens
        elif token == "<subj>":
            # Another tail for the same head: flush the previous pair, but
            # keep the relation text until the next "<obj>" resets it.
            if relation_tokens:
                relations.append(as_relation(subject_tokens, relation_tokens, object_tokens))
            object_tokens = []
            target = object_tokens
        elif token == "<obj>":
            relation_tokens = []
            target = relation_tokens
        elif target is not None:
            target.append(token)
    # The last relation is only kept when all three parts are present.
    if subject_tokens and relation_tokens and object_tokens:
        relations.append(as_relation(subject_tokens, relation_tokens, object_tokens))
    return relations

# source: https://gist.githubusercontent.com/fabiochiusano/934ad5ff318626befbdd20c72e074186/raw/e3e44110a0db5408d17fba52be559ecaf676b6d2/kb_4.py

In [None]:
class KB():
    """Minimal knowledge base: a list of unique relation dicts.

    Two relations are the same when their "head", "type" and "tail" values
    are equal; any other keys are ignored for the comparison.
    """

    def __init__(self):
        # Relations in insertion order, without duplicates.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when `r1` and `r2` agree on head, type and tail."""
        for attr in ["head", "type", "tail"]:
            if not r1[attr] == r2[attr]:
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to `r1` is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def add_relation(self, r):
        """Store `r` unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print all stored relations, one per line."""
        print("Relations:")
        for stored in self.relations:
            print(f"  {stored}")

# source: https://gist.githubusercontent.com/fabiochiusano/e64d5250371e18f7a6cc02ac0cdc64c5/raw/24af0f7f23b313591fe91fc9f8826cf216ca4568/kb_5.py

In [None]:
def from_small_text_to_kb(text, verbose=False):
    """Extract the relations of one short text into a new `KB`.

    The text is tokenized with the module-level `tokenizer` (truncated to 512
    tokens), the module-level `model` generates three beams, and the relations
    parsed from every decoded beam are added to the knowledge base, which
    drops duplicates.

    Args:
        text: the text to analyse.
        verbose: when true, print the number of input tokens.

    Returns:
        KB holding the unique relations found in the three generated outputs.
    """
    model_inputs = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    if verbose:
        print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

    generated_tokens = model.generate(
        **model_inputs,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )
    # Special tokens are kept because the relation parser relies on the
    # <triplet>/<subj>/<obj> markers.
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    kb = KB()
    for sentence_pred in decoded_preds:
        for found in extract_relations_from_model_output(sentence_pred):
            kb.add_relation(found)
    return kb

# source: https://gist.githubusercontent.com/fabiochiusano/ceec4d9ff1ce2ad25c40fbd8412aa9e4/raw/796771f88776fca9d7c4c84bd1b3a52d9ef5b5c1/kb_6.py

In [None]:
def extract_relations_per_text(texts):
    """Extract relations sentence by sentence for every text.

    Each text is split with `sent_tokenize`; every sentence is analysed on its
    own, and the relations of all sentences of a text are concatenated.
    After each text the cell output is replaced by a progress line showing
    the number of processed texts and the average number of relations per
    text so far.

    Returns:
        list with one list of relation dicts per input text.
    """
    relations_per_text = []
    total_relations = 0
    for text in texts:
        found = []
        for sentence in sent_tokenize(text):
            found.extend(from_small_text_to_kb(sentence, verbose=True).relations)
        relations_per_text.append(found)
        total_relations += len(found)
        squeal(f"{len(relations_per_text)}: {total_relations/len(relations_per_text)}")
    return relations_per_text

# source for line 6: https://gist.githubusercontent.com/fabiochiusano/a720da218ee8d19de3130fa36c23a69b/raw/a9b94a3ddbad61cfb3713234476423fffbfdca41/kb_7.py

In [None]:
def count_relations(relations_per_text):
    """Count how often each head/type/tail triple occurs over all texts.

    Returns:
        dict mapping "head#type#tail" to its number of occurrences.
    """
    relations_count = {}
    for relations in relations_per_text:
        for relation in relations:
            key = "#".join(relation[field] for field in ("head", "type", "tail"))
            relations_count[key] = relations_count.get(key, 0) + 1
    return relations_count

In [None]:
def convert_relations_to_df(relations_per_text):
    """Flatten the per-text relation lists into one DataFrame.

    Args:
        relations_per_text: list of lists of relation dicts; the position of
            each inner list is used as the text identifier.

    Returns:
        DataFrame with one row per relation and an added "text_id" column
        holding the index of the text the relation came from.
    """
    # Copy every relation while adding "text_id" so the caller's dicts are
    # left exactly as they were.
    relations_list = [
        {**relation, "text_id": text_id}
        for text_id, relations in enumerate(relations_per_text)
        for relation in relations
    ]
    return pd.DataFrame(relations_list)

In [None]:
def sort_dict_of_lists(dict_of_lists):
    """Return ``(length, key)`` pairs ordered by descending list length.

    Keys whose lists have the same length keep their dict order.
    """
    by_size = sorted(dict_of_lists.items(), key=lambda x: len(x[1]), reverse=True)
    return [(len(members), key) for key, members in by_size]

In [None]:
def count_duplicates(relations_df):
    """Count rows that repeat an earlier head/type/tail triple.

    Args:
        relations_df: DataFrame with the columns "head", "type" and "tail".

    Returns:
        int: number of rows whose triple was already seen in an earlier row.
    """
    nbr_of_duplicates = 0
    seen = set()
    for _, relation in relations_df.iterrows():
        # The tail is part of the identity of a relation: rows that differ
        # only in their tail are not duplicates.
        key = (relation["head"], relation["type"], relation["tail"])
        if key in seen:
            nbr_of_duplicates += 1
        else:
            seen.add(key)
    return nbr_of_duplicates

In [None]:
def count_relation_fields(relations_df):
    """Index the relations by head, by tail, and by type combined with each.

    Args:
        relations_df: DataFrame with the columns "head", "type" and "tail".

    Returns:
        Four dicts of lists without repeated members:
        head -> tails, tail -> heads, "type(head,_)" -> tails and
        "type(_,tail)" -> heads.
    """
    head_terms, tail_terms = {}, {}
    type_head_terms, type_tail_terms = {}, {}

    def remember(index, key, member):
        # Append `member` to the list stored under `key`, once.
        bucket = index.setdefault(key, [])
        if member not in bucket:
            bucket.append(member)

    for _, relation in relations_df.iterrows():
        head = relation["head"]
        tail = relation["tail"]
        relation_type = relation["type"]
        remember(head_terms, head, tail)
        remember(tail_terms, tail, head)
        remember(type_head_terms, f"{relation_type}({head},_)", tail)
        remember(type_tail_terms, f"{relation_type}(_,{tail})", head)
    return head_terms, tail_terms, type_head_terms, type_tail_terms

In [None]:
# Run the relation extraction on every text; the model is called once per sentence.
relations_per_text = extract_relations_per_text(texts)

In [None]:
# Total number of extracted relations versus the number of distinct
# head/type/tail triples.
relations_count = count_relations(relations_per_text)
print(f"number of relations: {sum(relations_count.values())}; unique: {len(relations_count)}")

In [None]:
# Ten most frequent relation triples as (key, count) pairs. The values of
# `relations_count` are integers, so they are sorted directly instead of
# going through a helper that is not defined in this notebook.
sorted(relations_count.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# One row per extracted relation, with the index of its source text.
relations_df = convert_relations_to_df(relations_per_text)

In [None]:
# Save the relation table in the current working directory.
relations_df.to_csv("gpahe_process.csv")

In [None]:
# Build the four lookup tables that the following cells rank by size.
head_terms, tail_terms, type_head_terms, type_tail_terms = count_relation_fields(relations_df)

In [None]:
# Heads linked to the largest number of distinct tails.
sort_dict_of_lists(head_terms)[:10]

In [None]:
# Tails linked to the largest number of distinct heads.
sort_dict_of_lists(tail_terms)[:10]

In [None]:
# Type/head combinations with the largest number of distinct tails.
sort_dict_of_lists(type_head_terms)[:10]

In [None]:
# Type/tail combinations with the largest number of distinct heads.
sort_dict_of_lists(type_tail_terms)[:10]

In [None]:
relations_df

## 5. Visualize knowledge triples

To do: add labels to edges (note: an edge can have several labels)

In [None]:
import networkx as nx
from pyvis.network import Network

In [None]:
# Build a graph with one edge per head/tail pair; the relation type is not
# attached to the edges.
G = nx.from_pandas_edgelist(relations_df, source="head", target="tail")

In [None]:
# Render the graph with pyvis and write it to "gpahe_process.html".
net = Network(notebook=True)
net.from_nx(G)
net.show("gpahe_process.html")

## 6. Link to wikidata (fails)

In [None]:
# Fetch a single Wikidata entity by its identifier as a connectivity test.
client = Client()  # doctest: +SKIP
entity = client.get('Q20145', load=True)

In [None]:
entity

In [None]:
import pywikibot
from pywikibot import pagegenerators, WikidataBot

In [None]:
# Look up the items whose English label is exactly 'Google' via SPARQL and
# materialize the page generator into a list.
sparql = "SELECT ?item WHERE { ?item rdfs:label 'Google'@en }"
entities = pagegenerators.WikidataSPARQLPageGenerator(sparql)
entities = list(entities)

## 7. Download logos manually (a lot of work)

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import webbrowser

In [None]:
# Prefix of a symbol's detail page; the record identifier is appended to it.
base_url = "https://symbols.globalextremism.org/details?recordId="

In [None]:
# Records up to and including this position are skipped by the loop below;
# the next cell checks that the files for those positions were downloaded.
COUNTER_MINIMUM = 80
counter = 0
# Open the detail page of every remaining record in a new browser tab and
# wait for Enter before moving on (see the manual steps listed below).
for counter, symbol_id in enumerate(data_df.index, start=1):
    if counter > COUNTER_MINIMUM:
        webbrowser.open(base_url + symbol_id, new=2)
        print(counter)
        input()

1. open the web page in a new tab of a browser
2. right click on the logo/image
3. open the image in a new tab of the browser
4. right click on the image
5. save the image with name number.extension
6. close the two added tabs
7. push the enter/return button on the notebook page
8. repeat for the next logo/image

In [None]:
# Verify that an image file named "<position>.<extension>" exists in the
# user's Downloads folder for each of the first COUNTER_MINIMUM records.
# expanduser("~") is used so the lookup also works where HOME is not set.
download_dir = os.path.join(os.path.expanduser("~"), "Downloads")
download_files = os.listdir(download_dir)
extensions = [ "jpg", "png", "svg", "webp", "JPG", "PNG", "SVG", "WEBP" ]
counter = 0
for counter, _ in enumerate(data_df.index, start=1):
    if counter > COUNTER_MINIMUM:
        break
    file_found = any(
        os.path.isfile(os.path.join(download_dir, f"{counter}.{extension}"))
        for extension in extensions
    )
    if not file_found:
        print(f"cannot find file number {counter}!")

## 8. Link spaCy entities from ChatGPT output to GPAHE metadata

From the Spacy analysis we select all:
1. noun phrases
2. entities
3. tokens with pos tag PROPN

In [4]:
import regex

In [5]:
# Translation table replacing the markdown characters "*" and "#" by spaces.
remove_chars = str.maketrans("*#", "  ")


def get_phrases(text, spacy_model):
    """Collect candidate phrases from `text`.

    The text is analysed with `spacy_model` after "*" and "#" have been
    replaced by spaces. The result contains, in this order:

    1. the noun chunks, with a leading "a"/"an"-like word and then a leading
       "the"/"this"-like word removed;
    2. the entity texts;
    3. the texts of all tokens tagged PROPN.

    Returns:
        list of str; repeated phrases are kept.
    """
    nlp_analysis = spacy_model(text.translate(remove_chars))
    chunk_texts = []
    for chunk in nlp_analysis.noun_chunks:
        # The flag must be passed by keyword: the fourth positional argument
        # of sub() is the replacement count, not the flags.
        without_article = regex.sub("^[Aa][Nn]* ", "", chunk.text,
                                    flags=regex.IGNORECASE)
        chunk_texts.append(regex.sub("^[Tt][Hh][EeIi][Ss]* ", "", without_article,
                                     flags=regex.IGNORECASE))
    chunk_texts.extend([entity.text for entity in nlp_analysis.ents])
    chunk_texts.extend([token.text for token in nlp_analysis if token.pos_ == "PROPN"])
    return chunk_texts

In [6]:
def get_term_list(data_df):
    """Return the sorted unique terms found in the symbol metadata.

    The terms are every "Title" value (unchanged) plus every comma-separated
    part of the "Ideology" and "Location" values with surrounding whitespace
    removed.
    """
    terms = set()
    for _, record in data_df.iterrows():
        terms.add(record["Title"])
        terms.update(part.strip() for part in record["Ideology"].split(","))
        terms.update(part.strip() for part in record["Location"].split(","))
    return sorted(terms)

In [7]:
# Directory holding the saved ChatGPT answers.
chatgpt_dir = "chatgpt"


def read_chatgpt_texts(chatgpt_dir):
    """Read every file in `chatgpt_dir` whose name ends with "b.txt".

    Args:
        chatgpt_dir: path of the directory to scan.

    Returns:
        dict mapping "<chatgpt_dir>_<file name>" to the file content, with
        the lines (line ends included) joined by single spaces. Files are
        read in sorted name order.
    """
    chatgpt_texts = {}
    for file_name in sorted(os.listdir(chatgpt_dir)):
        # A suffix test replaces the former unanchored pattern, which also
        # accepted names such as "1b_txt.bak" or "1b.txt.orig".
        if not file_name.endswith("b.txt"):
            continue
        with open(os.path.join(chatgpt_dir, file_name), "r") as file_handle:
            lines = file_handle.readlines()
        chatgpt_texts["_".join([chatgpt_dir, file_name])] = " ".join(lines)
    return chatgpt_texts

In [9]:
def get_meme_text_from_chatgpt_text(text):
    """Return the quoted text of the first line of `text` containing a quote.

    In that line every '" and "' is replaced by a newline, everything up to
    and including the first double quote is dropped, and everything from the
    next double quote onwards is dropped.

    Returns:
        The extracted text, or "" when no line contains a double quote.
    """
    for candidate in text.split("\n"):
        if not regex.search("\"", candidate):
            continue
        merged = regex.sub('" and "', r"\n", candidate)
        without_lead = regex.sub('^[^"]*"', "", merged)
        return regex.sub("\".*$", "", without_lead)
    return ""

In [8]:
# Load the spaCy pipeline, the sorted GPAHE term list and the ChatGPT texts
# used by the remaining cells.
spacy_model = spacy.load('en_core_web_sm')
term_list = get_term_list(data_df)
chatgpt_texts = read_chatgpt_texts(chatgpt_dir)

In [10]:
# For every ChatGPT text: print the extracted phrases that also occur
# (case-insensitively) in the GPAHE term list, and store the quoted meme text.
term_list_lower = [ term.lower() for term in term_list ]
known_terms = set(term_list_lower)  # set for constant-time membership tests
meme_texts = {}
for file_name in sorted(chatgpt_texts.keys()):
    phrases = list(set(get_phrases(chatgpt_texts[file_name], spacy_model)))
    phrases_in_term_list = [ phrase for phrase in phrases if phrase.lower() in known_terms ]
    # The keys of `chatgpt_texts` already start with the directory name, so
    # they are reused unchanged instead of being prefixed a second time.
    meme_texts[file_name] = get_meme_text_from_chatgpt_text(chatgpt_texts[file_name])
    print(phrases_in_term_list)

['4/20']
['neo-Nazi']
['Pepe the Frog']
['Nazi', 'Germany']


In [11]:
chatgpt_texts["chatgpt_1b.txt"]

'### 1. Interpretation of the Image\n The image shows a dog with a happy, slightly mischievous expression. The background is decorated with colorful, psychedelic patterns that resemble marijuana leaves, often associated with a state of altered consciousness or celebration.\n \n ### 2. Interpretation of the Text\n The text reads: "ITS GONNA BE 4/20 FOR A WHOLE MONTH." This is a play on the date April 20th (4/20), which is widely recognized in cannabis culture as a day for celebrating and consuming marijuana.\n \n ### 3. Interpretation of the Combination\n The combination of the happy, relaxed dog and the text implies a humorous and exaggerated scenario where the state of celebration and relaxation associated with 4/20 lasts for an entire month. The dog\'s expression, along with the colorful background, reinforces the playful and light-hearted tone of the message, suggesting an extended period of enjoyment and leisure.\n \n In essence, the meme is using humor to exaggerate the idea of an

## 9. Get WordNet synsets from ChatGPT texts

Uses ``get_meme_text_from_chatgpt_text`` from code block 8

In [12]:
import json
import requests
from rdflib import Graph
from nltk import word_tokenize, pos_tag, WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [13]:
# Parse the knowledge graph from its Turtle file.
# NOTE(review): other cells read from "../../data" while this one uses
# "../data" — confirm the relative path.
g = Graph()
g.parse("../data/ontox_kg.ttl")

<Graph identifier=N29f7774bf4b64c3eb105e674e1485531 (<class 'rdflib.graph.Graph'>)>

In [14]:
# Names of the three sections of a ChatGPT answer, in order of appearance.
modes = [ "visual", "textual", "combined" ]


def split_text(chatgpt_text):
    """Split a ChatGPT answer into its visual, textual and combined parts.

    Lines are stripped and appended, each preceded by a newline, to the
    section started by the most recent "### N. Interpretation of ..." header.
    Header lines themselves are dropped; lines before the first header go to
    the first section.

    Returns:
        dict mapping every name in `modes` to the text of that section.
    """
    section_of_header = {
        "### 1. Interpretation of the Image": 0,
        "### 2. Interpretation of the Text": 1,
        "### 3. Interpretation of the Combination": 2,
    }
    chatgpt_texts_split = {name: "" for name in modes}
    current = 0
    for raw_line in chatgpt_text.split("\n"):
        stripped = raw_line.strip()
        if stripped in section_of_header:
            current = section_of_header[stripped]
            continue
        chatgpt_texts_split[modes[current]] += "\n" + stripped
    return chatgpt_texts_split

In [15]:
def get_wikidata_info(entity_label):
    """
    Search Wikidata for entities matching a label.

    Sends a `wbsearchentities` request (English, limit 100) to the Wikidata
    API and returns the search hits unchanged.

    Args:
    - `entity_label` (str): Text to search for.

    Returns:
    - `list`: The value of the "search" field of the JSON response, i.e. one
      dict per hit (with keys such as "id" and "label"); an empty list when
      the response has no "search" field.

    Dependencies:
    - `requests`: For querying the Wikidata API.

    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "limit": 100,
        "uselang": "en",
        "search": entity_label
    }
    response = requests.get(url, params=params)
    data = response.json()
    # Responses without a "search" field are treated as "no hits".
    if 'search' in data:
        return data["search"]
    return []

In [16]:
get_wikidata_info("Racial Holy War")

[{'id': 'Q15994152',
  'title': 'Q15994152',
  'pageid': 17611956,
  'concepturi': 'http://www.wikidata.org/entity/Q15994152',
  'repository': 'wikidata',
  'url': '//www.wikidata.org/wiki/Q15994152',
  'display': {'label': {'value': 'Racial Holy War', 'language': 'en'},
   'description': {'value': 'White Supremacist concept', 'language': 'en'}},
  'label': 'Racial Holy War',
  'description': 'White Supremacist concept',
  'match': {'type': 'label', 'language': 'en', 'text': 'Racial Holy War'}},
 {'id': 'Q77977913',
  'title': 'Q77977913',
  'pageid': 77436094,
  'concepturi': 'http://www.wikidata.org/entity/Q77977913',
  'repository': 'wikidata',
  'url': '//www.wikidata.org/wiki/Q77977913',
  'display': {'label': {'value': 'Racial Holy War: The Cold War',
    'language': 'en'},
   'description': {'value': '2016 first-person shooter video game',
    'language': 'en'}},
  'label': 'Racial Holy War: The Cold War',
  'description': '2016 first-person shooter video game',
  'match': {'typ

In [None]:
def get_named_entities(paragraph, spacy_model):
    """Look up every distinct entity text of `paragraph` on Wikidata.

    For each search hit the pair ``(id, label)`` is collected; when the hit
    has an "aliases" field, the label and aliases are printed and a pair
    ``(id, alias)`` is collected for every alias.

    Returns:
        list of unique ``(id, name)`` tuples sorted by id.
    """
    doc = spacy_model(paragraph)
    entity_info = []
    for entity_text in set([entity.text for entity in doc.ents]):
        for hit in get_wikidata_info(entity_text):
            qid = hit["id"]
            entity_info.append((qid, hit["label"]))
            if "aliases" not in hit:
                continue
            print(hit["label"], hit["aliases"])
            entity_info.extend((qid, alias) for alias in hit["aliases"])
    return sorted(set(entity_info), key=lambda x: x[0])

In [None]:
def extract_synsets(paragraph):
    """Return the first WordNet synset of every content word in `paragraph`.

    Tokens whose POS tag starts with NN, VB, JJ or RB are lemmatized (the
    lemmatizer is called without a part of speech) and looked up in WordNet;
    only the first synset of each lemma is kept.

    Returns:
        list of unique ``(synset name, definition)`` tuples sorted by name.
    """
    content_tag_prefixes = ('NN', 'VB', 'JJ', 'RB')
    tagged_tokens = pos_tag(word_tokenize(paragraph))
    lemmatizer = WordNetLemmatizer()
    relevant_synsets = set()
    for token, tag in tagged_tokens:
        if not tag.startswith(content_tag_prefixes):
            continue
        candidates = wn.synsets(lemmatizer.lemmatize(token))
        if not candidates:
            continue
        first = candidates[0]
        relevant_synsets.add((first.name(), first.definition()))
    return sorted(relevant_synsets, key=lambda x: x[0])

In [None]:
# Load the OnToxKG dictionary; the context manager closes the file handle.
with open('../../data/ontox_dict.json', 'r') as ontox_file:
    ontox_dict = json.load(ontox_file)

In [None]:
# For every ChatGPT answer: split it into its three sections, extract the
# quoted meme text from the textual section, and attach the WordNet synsets
# and Wikidata matches found in each section.
linked_ontox_dict = {}
for file_name, chatgpt_text in chatgpt_texts.items():
    chatgpt_text_split = split_text(chatgpt_text)
    meme_text = get_meme_text_from_chatgpt_text(chatgpt_text_split["textual"])
    entry = {
        "Image_URL": "unknown",
        "Meme_text": meme_text,
        "Visual_description": chatgpt_text_split["visual"],
        "Textual_description": chatgpt_text_split["textual"],
        "Combined_description": chatgpt_text_split["combined"],
        "extracted_synsets": {},
        "extracted_ne_qids": {}
    }
    linked_ontox_dict[file_name] = entry
    for mode in modes:
        section_text = chatgpt_text_split[mode]
        synsets = extract_synsets(section_text)
        named_entities = get_named_entities(section_text, spacy_model)
        entry["extracted_synsets"][mode] = [
            {"name": name, "definition": definition} for name, definition in synsets
        ]
        entry["extracted_ne_qids"][mode] = [
            {"qid": qid, "name": name} for qid, name in named_entities
        ]

In [None]:
# Write the linked data as JSON; the context manager closes the file even
# when serialization fails.
with open("gpahe_process.json", "w") as outfile_handle:
    json.dump(linked_ontox_dict, outfile_handle)

In [None]:
linked_ontox_dict["chatgpt_2b.txt"]