In [None]:
%reload_ext autoreload
%autoreload 2

## HyperParameters

### Paths


In [None]:
# Filesystem locations: replace every placeholder below before running the notebook.

# GATE installation directory; exported further down as the GATE_HOME environment variable.
GATE_FOLDER = "Path_to_gate_folder"
# Term bank with TF-IDF scores generated from GATE; it is loaded with pd.read_csv.
TF_IDF_FOLDER = "Path_to_termbank_tfidf_generated_from_gate"
# Folder holding the texts the ontology is learned from (handed to the embedding helper).
FOLDER_WITH_TEXTS = "Path_to_folder_with_texts_to_learn_ontology"
# Destination of the CSV written with the extracted non-hierarchical relations.
CSV_NON_HIERARCHICAL_RELATIONS = "Path_to_save_non_hierarchical_relations"
# Destination of the image file the dendrogram figure is saved to.
HIERARCHY_IMAGE_NAME = "Path_to_image_to_save_dendrogram"

### CONSTANTS

In [None]:
# Only terms whose tfIdf is strictly greater than this value are kept.
TF_IDF_THRESHOLD = 0
# Upper bound on the number of terms retained after thresholding.
TOP_N_TERMS = 810
# Name of the pre-trained BERT checkpoint used for both the tokenizer and the model.
PRE_TRAINED_BERT_MODEL = "bert-base-uncased"
# Linkage method and distance metric shared by scipy's linkage and AgglomerativeClustering.
CLUSTERING_METHOD = "complete"
CLUSTERING_METRIC = "cosine"

## Initialization

### Libs

In [None]:
# Default
import json
import unicodedata

# AI and Math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import glob

# NLP
from gatenlp import Document
from gatenlp.processing.tokenizer import NLTKTokenizer
from nltk.tokenize import TreebankWordTokenizer
from gatenlp.gateworker import GateWorker
from gatenlp.processing.executor import SerialCorpusExecutor
from gatenlp.gateworker import GateWorkerAnnotator

import codecs
import csv
from functools import partial
import gatenlp
import sys
from tqdm import tqdm

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

In [None]:
# Check Versions
print("NB last updated with gatenlp version", gatenlp.__version__)

# Extra site-packages directory searched for the imports below.
# NOTE(review): hard-coded, machine-specific absolute path; adjust or remove it
# for your own environment.
import os
EXTRA_SITE_PACKAGES = "/home/alencga1/anaconda3/lib/python3.9/site-packages/"
# Register the directory only when it exists, and only once, so that re-running
# this cell does not keep appending the same entry to sys.path.
if os.path.isdir(EXTRA_SITE_PACKAGES) and EXTRA_SITE_PACKAGES not in sys.path:
    sys.path.append(EXTRA_SITE_PACKAGES)

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [None]:
import nltk
from nltk import tokenize

# Download the NLTK resources this notebook requests: 'punkt' and 'wordnet'.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.corpus import wordnet

In [None]:
# Setting up home variables
import os
# Publish the configured GATE folder through the GATE_HOME environment variable.
# NOTE(review): presumably this is how the GATE tooling imported above locates
# the installation; confirm against the gatenlp documentation.
os.environ["GATE_HOME"] = GATE_FOLDER

### Custom Libs

In [None]:
from lib.ontologyextraction.schema import Term, Concept, Concept_Taxonomy
from lib.ontologyextraction.termenrichment import termEnrichment
from lib.ontologyextraction.helpers import find_sub_list, getTokenVecs, getWordEmbeddingFromPhrase, count_clusters, create_tree

## Term Extraction


### Data Pre-processing


In [None]:
# Load the term bank. QUOTE_NONE makes the parser treat quote characters as
# ordinary text instead of field delimiters.
df = pd.read_csv(
    TF_IDF_FOLDER,
    sep=',',
    quoting=csv.QUOTE_NONE,
    encoding='utf8',
)
# Collapse to one row per distinct term, retaining the first occurrence in file order.
df2 = df.drop_duplicates(subset=["Term"], keep='first')

In [None]:
threshold = TF_IDF_THRESHOLD
first_n = TOP_N_TERMS
# Terms scoring strictly above the threshold, in the row order of the de-duplicated term bank.
list_terms_string = list(df2.loc[df2['tfIdf'] > threshold, 'Term'].values)
# Keep at most `first_n` of them; slicing already clamps to the list length.
list_terms_string = list_terms_string[:first_n]

In [None]:
# Index the scores by term once instead of scanning the whole frame for every
# term. df2 holds exactly one row per term (duplicates were dropped above), so
# each lookup yields the same single tfIdf value the per-term filter returned.
tfidf_by_term = df2.set_index('Term')['tfIdf']
list_of_terms = [Term(term_string, tfidf_by_term.loc[term_string]) for term_string in list_terms_string]

## Concept

### Concept Embedding

In [None]:
# Load pre-trained model tokenizer (vocabulary)
# The checkpoint name comes from the PRE_TRAINED_BERT_MODEL constant so that the
# tokenizer and the model loaded afterwards always refer to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_BERT_MODEL)

In [None]:
# Load pre-trained model (weights)
# Uses the same checkpoint name as the tokenizer (PRE_TRAINED_BERT_MODEL).
model = BertModel.from_pretrained(PRE_TRAINED_BERT_MODEL)

# Put the model in "evaluation" mode, meaning feed-forward operation.
# As the last expression of the cell, the returned model is also displayed.
model.eval()

In [None]:
# Pre-bind the fixed leading arguments of getWordEmbeddingFromPhrase so that the
# resulting callable only needs whatever arguments remain (supplied by
# Term._Apply_Embedding). Note that the full term bank `df` is passed here, not
# the de-duplicated `df2`.
# NOTE(review): the meaning of the trailing `True` and `5` is defined by the
# helper's signature in lib.ontologyextraction.helpers; confirm there.
BERT_EMBEDDING_APP = partial(getWordEmbeddingFromPhrase, df, FOLDER_WITH_TEXTS, tokenizer,model, True, 5)

In [None]:
# Compute an embedding for every selected term, printing each term name as a
# simple progress trace. Later cells read the result from term.term_embedding
# and treat a value of None as "no embedding available".
for term in list_of_terms:
    print(term.term_name)
    term._Apply_Embedding(BERT_EMBEDDING_APP)

In [None]:
# Find only words with actual vectors.
# X, labellist and filtered_list_of_terms are built together and stay
# index-aligned: row i of X is the embedding of the term named labellist[i],
# which is the object filtered_list_of_terms[i].
labellist = []
filtered_list_of_terms = []
embedding_rows = []
for term in list_of_terms:
    if term.term_embedding is None:
        # Report every term dropped for lack of an embedding. The first term is
        # checked like all the others instead of being assumed to have one.
        print(term.term_name)
        continue
    embedding_rows.append(np.array(term.term_embedding.tolist()))
    labellist.append(term.term_name)
    filtered_list_of_terms.append(term)

if not embedding_rows:
    raise ValueError("None of the selected terms has an embedding; cannot build the feature matrix X.")

# Stack once at the end; re-stacking inside the loop copies the whole matrix on
# every iteration.
X = np.vstack(embedding_rows)

## Concept Hierarchy

### Hierarchical Clustering

In [None]:
# Hierarchical clustering of the term embeddings; Z is the linkage matrix.
Z = sch.linkage(X, method=CLUSTERING_METHOD, metric=CLUSTERING_METRIC)

# Draw the dendrogram on a very large canvas, persist it to disk, then show it inline.
fig = plt.figure(figsize=(100, 120))
dendrogram = sch.dendrogram(Z, labels=labellist)
fig.savefig(HIERARCHY_IMAGE_NAME)
plt.show()

In [None]:
# Display the cluster count derived from the dendrogram; the same value is used
# below as n_clusters for the agglomerative clustering.
# NOTE(review): how count_clusters derives the number is defined in
# lib.ontologyextraction.helpers and is not visible here.
count_clusters(dendrogram)

### Agglomerative Clustering Example

In [None]:
# Flat clustering of the same embeddings, using the same linkage method and
# metric as the dendrogram and as many clusters as count_clusters reports for it.
cluster = AgglomerativeClustering(n_clusters=count_clusters(dendrogram), metric=CLUSTERING_METRIC,linkage=CLUSTERING_METHOD)
# Fitting populates cluster.labels_ with one cluster id per row of X.
cluster.fit(X)

In [None]:
# Attach each cluster id to its term. filtered_list_of_terms was filled in
# lockstep with labellist and the rows of X, so position i in all three refers
# to the same term; pairing them directly avoids re-scanning list_of_terms by
# name for every label. The objects are the same ones held in list_of_terms.
for term, cluster_label in zip(filtered_list_of_terms, cluster.labels_):
    term.cluster = cluster_label

In [None]:
# List the member terms of every cluster, one cluster at a time.
n_clusters = count_clusters(dendrogram)
for j in range(n_clusters):
    print('Cluster : ' + str(j))
    for term in list_of_terms:
        if term.cluster == j:
            print('Term : ' + term.term_name)

In [None]:
# Print every position at which the label 'address' occurs in labellist.
for k, label_name in enumerate(labellist):
    if label_name == 'address':
        print(k)

To assert parenthood automatically, rules must be defined to organize the extraction of the "is_a" relation:
 1. All terms are concepts
 2. Concepts are grouped by agglomerative clustering
 3. A cluster is a concept
 4. The cluster concept formed from concepts $x$ and $y$ takes its term from one of the following:
    1. Either find a common synonym
    2. Or check whether they are hyponyms or hypernyms of each other
    3. Or check for a similar head
    4. Or leave it unnamed and keep the group, hoping that a new concept will be found in a higher cluster
 5. As a stopping condition, stop either when the distance between clusters exceeds a threshold or when the top of the hierarchy is reached

### Automatic Parenthood Assertion

#### Rules

#### Implementation

In [None]:
from lib.ontologyextraction.schema import Term, Concept, Concept_Taxonomy

In [None]:
# Run termEnrichment on every term that has an embedding.
# NOTE(review): the result is only rebound to the loop variable and is never
# written back into filtered_list_of_terms, so this loop has an effect only if
# termEnrichment modifies the term it receives in place. Confirm in
# lib.ontologyextraction.termenrichment.
for term in filtered_list_of_terms:
    term = termEnrichment(term)

In [None]:
# Concept Creation: every embedded term becomes its own concept, named after the
# term, holding that term as its only member and as its descriptive term.
list_of_concepts = [
    Concept(term.term_name, [term], descriptive_term=term)
    for term in filtered_list_of_terms
]

In [None]:
# Build the taxonomy from the single-term concepts.
conceptTaxonomy = Concept_Taxonomy(list_of_concepts)
# Z is the linkage matrix returned by sch.linkage in the clustering section.
# list_of_concepts is in the same order as the rows of X that produced Z.
conceptTaxonomy.createTaxonomyFromDistanceMatrix(Z)

In [None]:
# Collect the named concepts of the taxonomy. When a concept has the same name
# as one of its direct children, the child is dropped in favour of the parent.
final_concepts = []
for k in set(conceptTaxonomy.concept_dict.values()):
    # Concepts without a name are skipped.
    if len(k.concept_name) == 0:
        continue
    if k not in final_concepts:
        # Rebuild the list instead of calling remove() while iterating over it:
        # removing during iteration shifts the remaining items and silently
        # skips the element that follows each removed one.
        final_concepts = [
            x for x in final_concepts
            if not (x.concept_name == k.concept_name and x in k.children_concept)
        ]
        final_concepts.append(k)
print('The number of relevant deducted concepts are : ', len(final_concepts))

In [None]:
# Largest level found among the final concepts, never lower than 0.
highest_level = max([0] + [k.level for k in final_concepts])
print(highest_level)

In [None]:
# Print the concept names grouped by level. The range includes highest_level
# itself: range(highest_level) stopped one short, so the concepts at the top
# level were never listed.
for lv in range(highest_level + 1):
    print('===========================================')
    print('===========================================')
    print('Level:' + str(lv))
    print('-------------------------------------------')
    for k in final_concepts:
        if k.level == lv:
            print(k.concept_name)

In [None]:
# Summarise every final concept: name, level, number of direct children and the
# distinct non-empty names of those children.
for concept in final_concepts:
    print('===========================================')
    print('===========================================')
    print('concept name :' + concept.concept_name)
    print('concept lv :' + str(concept.level))
    print('concept nb children :' + str(len(concept.children_concept)))
    print('-------------------------------------------')
    list_of_concept_names = list({child.concept_name for child in concept.children_concept})
    for child_name in list_of_concept_names:
        if child_name:
            print(child_name)

In [None]:
# Final concepts whose level is at most 3 (levels below 3 are included too).
all_lv3_concepts = [k for k in final_concepts if k.level <= 3]

In [None]:
# Summarise the low-level concepts: name, level, number of direct children and
# the distinct non-empty names of those children.
for concept in all_lv3_concepts:
    print('===========================================')
    print('===========================================')
    print('concept name :' + concept.concept_name)
    print('concept lv :' + str(concept.level))
    print('-------------------------------------------')
    list_of_concept_names = list({child.concept_name for child in concept.children_concept})
    print(len(concept.children_concept))
    for child_name in list_of_concept_names:
        if child_name:
            print(child_name)

### Final Hierarchy

In [None]:
def checkkid(concept1, concept2):
    """Tell whether ``concept1`` is a descendant of ``concept2``.

    The search follows ``children_concept`` recursively and covers every
    branch below ``concept2``.

    Args:
        concept1: The concept being looked for.
        concept2: The concept whose subtree is searched.

    Returns:
        True if ``concept1`` is a direct child of ``concept2`` or appears
        anywhere deeper in its subtree; False otherwise, including when
        ``concept2`` has no children (``None`` or empty).
    """
    children = concept2.children_concept
    if children is None or len(children) == 0:
        return False
    if concept1 in children:
        return True
    # Every child subtree has to be examined. Returning the result of the first
    # child alone would miss descendants located under any later sibling.
    return any(checkkid(concept1, child) for child in children)

def append_children(concept):
    """Flatten the subtree rooted at ``concept`` into a list.

    Descendants come first, in the order of ``children_concept``, and each
    concept is listed after everything below it, so ``concept`` itself is the
    last element. A concept whose ``children_concept`` is ``None`` or empty
    yields a one-element list.
    """
    children = concept.children_concept
    if children is None or len(children) == 0:
        return [concept]

    collected = []
    for child in children:
        collected.extend(append_children(child))
    collected.append(concept)
    return collected

In [None]:
# Select the top-level concepts among the deducted concepts: a concept is kept
# unless checkkid reports it as lying below a concept that is already kept.
# Kept concepts that turn out to lie below a later one are removed at the end.
concept_hierachy = []
# Concepts to discard once the scan is finished; removal is deferred so that
# concept_hierachy is not modified while it is being iterated.
list_remove_later = []
# Iterating over a set: the visiting order is not guaranteed.
for k in set(conceptTaxonomy.deducted_concepts):
    # Concepts without a name are skipped.
    if(len(k.concept_name) == 0):
        continue
    already_in = False
    for x in concept_hierachy:
        if checkkid(x,k):
            # x is reported as lying below k: k supersedes it.
            list_remove_later.append(x)
        elif checkkid(k,x):
            # k is reported as lying below the kept concept x: do not add k.
            already_in=True
    if not already_in and k not in concept_hierachy:
        concept_hierachy.append(k)

# Apply the deferred removals; the membership test guards against a concept
# that was queued more than once.
for k in list_remove_later:
    if k in concept_hierachy:
        concept_hierachy.remove(k)
print('The number high level concepts are : ', len(concept_hierachy))

In [None]:
def print_hierarchy(concept):
    """Print the hierarchy rooted at ``concept`` to standard output.

    A header block with the name and level of ``concept`` is printed first.
    Each descendant is then printed depth-first as a ``-lv: <level>, name :
    <name>`` line, followed by its own descendants and a closing dashed line.

    Args:
        concept: Root of the hierarchy; it needs ``concept_name``, ``level``
            and ``children_concept``. A ``children_concept`` of ``None`` is
            treated as "no children".

    Returns:
        None. The function only prints.
    """
    def print_sub_hierarchy(sub_concept):
        # One line for the concept, then its subtree, then a closing separator.
        print('-lv: '+ str(sub_concept.level) + ', name : ' + sub_concept.concept_name)
        for child in (sub_concept.children_concept or ()):
            print_sub_hierarchy(child)
        print('------------------------------------------')

    header = '\n'.join([
        '===========================================',
        '===========================================',
        'concept name :' + concept.concept_name,
        'concept lv :' + str(concept.level),
        '===========================================',
    ])
    print(header)
    for child in (concept.children_concept or ()):
        print_sub_hierarchy(child)

In [None]:
strf = ""
# Running total of direct children over all top-level concepts.
count = 0
for concept in concept_hierachy:
    count += len(concept.children_concept)
    # Only named concepts are printed.
    if concept.concept_name:
        print_hierarchy(concept)

## Non-taxonomic Relation Extraction

In [None]:
# Distinct document URLs of the term bank (missing values dropped).
corpus = list(set(df['documentURL'].dropna().tolist()))
# Row lookup: document URL -> row of X_freq.
doc_dict = {doc_url: row for row, doc_url in enumerate(corpus)}
# Column lookup: term label -> column of X_freq.
word_list_dict = {label_name: col for col, label_name in enumerate(labellist)}
# Document x term matrix, all zeros until filled in by the next cell.
X_freq = np.zeros((len(corpus), len(labellist)))

In [None]:
# Mark with 1 every (document, term) pair for which the term bank has a row
# linking that term to that document.
for label in labellist:
    term_column = word_list_dict[label]
    rows_for_label = df['Term'] == label
    list_of_docs = df.loc[rows_for_label, 'documentURL'].dropna().tolist()
    for doc in list_of_docs:
        X_freq[doc_dict[doc], term_column] = 1

In [None]:
# Wrap the document x term matrix in a DataFrame with one column per term label.
# np.clip bounds every entry to the [0, 1] interval; the fill loop above only
# ever writes 1 into the zero matrix, so the values are already 0 or 1.
df_corpus = pd.DataFrame(np.clip(X_freq,0,1), columns=labellist)

In [None]:
# Display the document x term occurrence table.
df_corpus

In [None]:
from mlxtend.frequent_patterns import apriori
# Mine frequent itemsets of at most two terms (max_len=2) with a minimum support
# of 0.5; use_colnames=True reports itemsets by term label instead of column index.
frequent_itemsets = apriori(df_corpus, min_support = 0.5, max_len=2, use_colnames = True)

In [None]:
# Display the mined frequent itemsets.
frequent_itemsets

In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules with confidence of at least 0.9 from the frequent
# itemsets, then keep only the two sides of each rule.
all_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.9,
)
relations = all_rules[['antecedents', 'consequents']]
display(relations)

In [None]:
# Persist the antecedent/consequent pairs to the configured CSV path.
relations.to_csv(CSV_NON_HIERARCHICAL_RELATIONS)  