In [66]:
import os
import pandas as pd
#from embedding import embeddings

# Run configuration: which publisher's data to process, which embedding
# method(s) to run, and which class file to use.
main_publisher = 'OYC'
methods = ['embedd-er']
class_type = 'dct' # 'dct' or 'rdfs'
class_file = class_type + ".csv"

# realpath() is given the literal string '__file__' (not the __file__
# variable), so script_dir resolves relative to the current working directory.
script_dir = os.path.dirname(os.path.realpath('__file__'))
data_path = os.path.join(script_dir, '../Data/' + main_publisher + '/data/')
embeddings_path = os.path.join(script_dir, '../Data/' + main_publisher + '/embeddings/')

# Load the chapters table, force the free-text columns to str and replace the
# 'nan' strings that astype(str) produces for missing cells with one space.
df_chapters = pd.read_csv(data_path + 'chapters.csv', sep = '|')
for text_column in ('Title', 'Text'):
    as_text = df_chapters[text_column].astype(str)
    df_chapters[text_column] = as_text.apply(lambda cell: ' ' if cell == 'nan' else cell)

# Python type name of every Text cell (diagnostic column).
df_chapters['Type'] = df_chapters['Text'].apply(lambda cell: str(type(cell)))
#df_chapters[df_chapters['Text'] == ' '].head()


In [3]:
from embedding import load_model, wikipedia2vec_embedding

# Load the pretrained vectors once with the `load_model` imported above and
# keep them in `wikipedia2vec` for the cells below (300 is passed as the
# second argument; the file name suggests 300-dimensional vectors).
wikipedia2vec = load_model("../Models/enwiki_20180420_300d.txt", 300)

  from .autonotebook import tqdm as notebook_tqdm


Loading 300 model ...
Loaded 300 model ...


In [47]:
import pandas as pd
from simpletransformers.language_representation import RepresentationModel
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.test.utils import common_texts
import numpy as np
import rdflib
import os

def symbols_filter(word):
    """Replace a fixed set of lowercase percent-escapes in ``word``.

    The en dash (``%e2%80%93``) and em dash (``%e2%80%94``) escapes both become
    a plain hyphen; ``%27``, ``%2f``, ``%2e`` and ``%26`` become ``'``, ``/``,
    ``.`` and ``&``. Matching is case-sensitive, so uppercase escapes such as
    ``%2F`` are left untouched. Returns the rewritten string.
    """
    # Applied one after another, in this order.
    substitutions = (
        ("%e2%80%93", "-"),
        ("%27", "'"),
        ("%2f", "/"),
        ("%e2%80%94", "-"),
        ("%2e", "."),
        ("%26", '&'),
    )
    for escaped, plain in substitutions:
        word = word.replace(escaped, plain)
    return word


def load_model(path, name = 'wikipedia2vec'):
    """Load text-format word2vec vectors from ``path`` and return them.

    ``name`` is only interpolated into the two progress messages printed
    before and after the load.
    """
    print(f"Loading {name} model ...")
    vectors = KeyedVectors.load_word2vec_format(path, binary = False)
    print(f"Loaded {name} model ...")
    return vectors


def wikipedia2vec_embedding(model, concept, d):
    """Return the vector stored under ``'ENTITY/' + concept`` in ``model``.

    When the lookup raises ``KeyError`` a zero vector of length ``d`` is
    returned instead.
    """
    entity_key = 'ENTITY/' + concept
    try:
        vector = model[entity_key]
    except KeyError:
        vector = np.zeros((d))
    return vector
        
        
def node_embeddings(model, g, d, method = 'wikipedia2vec'):
    """Collect an embedding and a pageRank weight for subjects found in ``g``.

    ``g`` is iterated as ``(s, p, o)`` triples. For triples whose predicate is
    ``https://univ-nantes.fr/ontology/pageRank`` the concept name is the last
    ``/``-separated segment of ``s`` (after ``symbols_filter``), its vector
    comes from ``wikipedia2vec_embedding`` and ``o`` is converted to float.

    Returns a dict with:
      * ``'embeddings'``: concept -> ``{'list': vector, 'pageRank': float}``
      * ``'concepts'``: concept names in encounter order
      * ``'missing_concepts'``: concepts whose vector came back empty

    Any ``method`` other than ``'wikipedia2vec'`` yields empty containers.

    NOTE(review): the ``wikipedia2vec_embedding`` defined in this file returns
    a zero vector of length ``d`` for unknown concepts, so the empty-vector
    check below looks like it can only fire when ``d == 0`` - confirm whether
    ``missing_concepts`` is expected to stay empty.
    """
    embeddings_concepts = {}
    concepts = []
    missing_concepts = []
    result = {
        'embeddings' : embeddings_concepts,
        'concepts' : concepts,
        'missing_concepts' : missing_concepts
    }

    if method != 'wikipedia2vec':
        return result

    for s, p, o in g:
        concept = symbols_filter(s.split('/')[-1])
        if str(p) != 'https://univ-nantes.fr/ontology/pageRank':
            continue
        embedding = wikipedia2vec_embedding(model, concept, d)
        if len(embedding) == 0:
            missing_concepts.append(concept)
            continue
        embeddings_concepts[concept] = {'list': embedding, 'pageRank': float(o)}
        concepts.append(concept)

    return result
    

def embedder_embeddings(resources, publisher, model = None, d = 300):
    """Embed each resource as the pageRank-weighted mean of its concept vectors.

    For every id ``c`` in ``resources`` the Turtle file
    ``../Output/Graphs/v01/<publisher>/<c>.ttl`` (resolved from the current
    working directory) is parsed, ``node_embeddings`` extracts the concept
    vectors and pageRank weights, and the resource vector is
    ``sum(vector * pageRank) / sum(pageRank)``.

    Parameters
    ----------
    resources : iterable
        Resource ids; ``str(c)`` is used to build the graph file name.
    publisher : str
        Sub-directory of ``../Output/Graphs/v01/`` holding the graphs.
    model : optional
        Vector lookup passed to ``node_embeddings``. When ``None`` the model
        ``../Models/enwiki_20180420_<d>d.txt`` is loaded via ``load_model``.
    d : int
        Length of the vectors; also the length of the zero fallback.

    Returns
    -------
    list
        One 1-D array of length ``d`` per resource, in input order. A zero
        vector is used when the graph file is missing, when the graph cannot
        be processed, or when the pageRank weights sum to zero (previously
        that last case produced NaN from a 0/0 division).

    Only ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` stop the loop instead of being reported as "not found".
    """
    if model is None:
        path = "../Models/enwiki_20180420_"+str(d)+"d.txt"
        # d is forwarded as load_model's second argument, as before.
        model = load_model(path, d)
    # realpath() receives the literal '__file__', so this is the working directory.
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    sentences_embedder = []
    for c in resources:
        g_path = os.path.join(script_dir, '../Output/Graphs/v01/' + publisher + '/' + str(c) + '.ttl')

        # A missing graph file is the expected, benign failure: keep the
        # original message and fall back to a zero vector.
        if not os.path.isfile(g_path):
            print(c, "not found")
            sentences_embedder.append(np.zeros(d))
            continue

        try:
            g = rdflib.Graph()
            g.parse(g_path, format='turtle')
            embeddings_concepts = node_embeddings(model, g, d, method='wikipedia2vec')['embeddings']
            entries = list(embeddings_concepts.values())
            pageRankSum = np.sum([entry['pageRank'] for entry in entries], 0)
            embeddingsSum = np.sum([np.dot(entry['list'], entry['pageRank']) for entry in entries], 0)
        except Exception as exc:
            # Still best-effort, but say what actually went wrong instead of
            # claiming the file was not found.
            print(c, "could not be embedded:", repr(exc))
            sentences_embedder.append(np.zeros(d))
            continue

        if pageRankSum == 0:
            # No weighted concept in the graph: avoid 0/0.
            sentences_embedder.append(np.zeros(d))
        else:
            sentences_embedder.append(embeddingsSum / pageRankSum)

    return sentences_embedder

def bert_embeddings(sentences):
    """Encode ``sentences`` with ``bert-base-uncased`` on CPU.

    Sentences are encoded with ``combine_strategy="mean"`` and the result is
    returned as a list with one entry per sentence.
    """
    encoder = RepresentationModel(model_type = "bert", model_name = "bert-base-uncased", use_cuda = False)
    return list(encoder.encode_sentences(sentences, combine_strategy = "mean"))

def fasttext_embeddings(sentences, n_gram, window, size, epochs):
    """Return one FastText vector per entry of ``sentences``.

    Parameters
    ----------
    sentences : sequence of str
        Each entry is looked up as a single key in the trained model.
    n_gram : int
        Passed as ``min_n`` (smallest character n-gram length).
    window : int
        Context window size.
    size : int
        Vector dimensionality (``vector_size``).
    epochs : int
        Number of training epochs. This argument was previously ignored in
        favour of a hard-coded 10; the caller in this file passes 10, so its
        results are unchanged.

    NOTE(review): the model is trained on gensim's bundled ``common_texts``
    sample rather than on ``sentences`` - confirm this is intended.
    """
    fasttext = FastText(vector_size = size, window = window, min_count = 1, min_n = n_gram, sentences = common_texts, epochs = epochs)
    sentences_vector_fasttext = list(fasttext.wv[sentences])
    return sentences_vector_fasttext

def embeddings(sentences, resources, methods, publisher, model = None, save = False, path = ''):
    """Build a DataFrame with one embedding column per requested method.

    ``methods`` is checked for ``'BERT'``, ``'embedd-er'`` and ``'FastText'``;
    the matching columns are named ``'BERT'``, ``'EMBEDD-ER'`` and
    ``'FastText'`` and are added in that order. ``sentences`` feeds the BERT
    and FastText encoders, while ``resources``, ``publisher`` and ``model`` feed
    EMBEDD-ER. With ``save`` set, the frame is also written to
    ``path + "embeddings.csv"`` using ``'|'`` as separator.
    """
    # (key looked up in `methods`, output column, deferred computation)
    producers = (
        ('BERT', 'BERT', lambda: bert_embeddings(sentences)),
        ('embedd-er', 'EMBEDD-ER', lambda: embedder_embeddings(resources, publisher, model, d = 300)),
        ('FastText', 'FastText', lambda: fasttext_embeddings(sentences, n_gram = 3, window = 5, size = 300, epochs = 10)),
    )

    embeddings_df = pd.DataFrame()
    for method_key, column, produce in producers:
        if method_key in methods:
            embeddings_df[column] = produce()

    if save:
        embeddings_df.to_csv(path+"embeddings.csv", sep = '|')

    return embeddings_df


In [17]:
# Size check of the loaded chapters table: (rows, columns).
df_chapters.shape

(1039, 9)

In [59]:
# For both BERT & EMBEDD-ER
# Embed every chapter: the Text column is passed as the sentences, the Cid
# column as the resource ids, and the preloaded `wikipedia2vec` as the model.
# NOTE(review): `embeddings` returns a DataFrame with one column per selected
# method; assigning it to a single column presumably requires `methods` to
# select exactly one method - confirm before running with several.
df_chapters_embeddings = pd.DataFrame()
df_chapters_embeddings["Chapters Embeddings"] = embeddings(df_chapters.Text.values, df_chapters.Cid.values, methods, main_publisher, wikipedia2vec, save = False)

In [61]:
# Write the chapter embeddings to "chapters_<first method>.csv" ('|'-separated)
# in the publisher's embeddings directory.
df_chapters_embeddings.to_csv(embeddings_path + "chapters_" + methods[0] + ".csv", sep = '|')

In [62]:
# Concept embeddings for EMBEDD-ER: each concept is looked up by the last
# '/'-separated segment of its 'Concept' value, with the vector length fixed
# at 300, and the result is written to "concepts_<first method>.csv".
# NOTE(review): unlike node_embeddings, the segment is not passed through
# symbols_filter here - confirm whether percent-escaped names can occur.
if methods[0] == 'embedd-er':
    print('Using EMBEDD-ER to embed concepts...')
    df_concepts = pd.read_csv(data_path + 'concepts.csv', sep = '|')
    df_concepts_embeddings = pd.DataFrame()
    df_concepts_embeddings['Concepts Embeddings'] = df_concepts['Concept'].apply(lambda x : wikipedia2vec_embedding(wikipedia2vec, x.split('/')[-1], 300))
    #df_concepts['Concepts Embedding'] = df_concepts_embeddings
    df_concepts_embeddings.to_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|')

# Runs regardless of the branch above, so df_concepts_embeddings must already
# exist in the kernel when the first method is not 'embedd-er'.
df_concepts_embeddings.head()

Using EMBEDD-ER to embed concepts...


Unnamed: 0,Concepts Embeddings
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.0537, -0.4817, -0.0068, -0.0026, 0.1848, 0...."
2,"[-0.8953, -0.0139, -0.1495, 0.4391, 0.0186, 0...."
3,"[-0.2859, -0.7587, -1.1472, 1.0824, 0.1735, -0..."
4,"[-0.0052, -0.1101, 0.3223, 0.8715, 0.5889, -0...."


In [39]:
# Concept embeddings for BERT: the 'Concept' column is overwritten in place
# with the last '/'-separated segment, underscores replaced by spaces, and
# that text is passed to `embeddings` as both sentences and resources.
# No model is passed, so `embeddings` receives model=None.
if methods[0] == 'BERT':
    print('Using BERT to embed concepts...')
    df_concepts = pd.read_csv(data_path + 'concepts.csv', sep = '|')
    df_concepts['Concept'] = df_concepts['Concept'].apply(lambda x : x.split('/')[-1].replace('_', ' '))
    df_concepts_embeddings = pd.DataFrame()
    df_concepts_embeddings['Concepts Embeddings'] = embeddings(df_concepts.Concept.values, df_concepts.Concept.values, methods, main_publisher, save = False)
    df_concepts_embeddings.to_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|')
    df_concepts_embeddings.head()


Using BERT to embed concepts...


In [67]:
# Reading the concepts and classes files
# Both are '|'-separated; the classes file is chosen by `class_file` and its
# first column is used as the index.
df_concepts = pd.read_csv(data_path + 'concepts.csv', sep = '|')
df_concepts_classes = pd.read_csv(data_path + 'classes/' + class_file, sep = '|', index_col=0)
df_concepts_classes.head()

Unnamed: 0,Concept,Class
0,Race_and_ethnicity_in_the_United_States_census,http://dbpedia.org/resource/Category:Race_in_t...
1,Race_and_ethnicity_in_the_United_States_census,http://dbpedia.org/resource/Category:United_St...
2,Race_and_ethnicity_in_the_United_States_census,http://dbpedia.org/resource/Category:Demograph...
3,Race_and_ethnicity_in_the_United_States_census,http://dbpedia.org/resource/Category:Ethnic_gr...
4,African_Americans,http://dbpedia.org/resource/Category:History_o...


In [64]:
import re
def camel_to_snake(word):
    """Turn a CamelCase identifier into underscore-separated words.

    The result is lowercased except for its first character, which is
    capitalized, e.g. ``'HTTPResponse'`` -> ``'Http_response'``.
    """
    # Break before every capitalized word that follows some other character.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', word)
    # Break the remaining lowercase/digit -> uppercase transitions.
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower().capitalize()

def split_camel_case(s):
    """Insert spaces at CamelCase word boundaries, keeping acronyms together.

    Example: ``'XMLHttpRequest'`` -> ``'XML Http Request'``.
    """
    # lowercase followed by uppercase: "camelCase" -> "camel Case"
    spaced = re.sub('([a-z])([A-Z])', r'\1 \2', s)
    # run of capitals followed by a capitalized word: "HTTPResponse" -> "HTTP Response"
    return re.sub('([A-Z]+)([A-Z][a-z])', r'\1 \2', spaced)

def split_snake_case(s):
    """Return ``s`` with every underscore turned into a space."""
    return ' '.join(s.split('_'))

In [68]:
# Class embeddings for EMBEDD-ER. A 'Class Name' column is derived from the
# class URI and each name is looked up with wikipedia2vec_embedding (length 300):
#   * rdfs: last segment after '/' and '#', converted with camel_to_snake
#   * dct:  last segment after '/', keeping the text after the last ':'
# NOTE(review): when class_type is neither 'rdfs' nor 'dct',
# df_classes_embeddings is not assigned in this cell, so the to_csv below uses
# whatever is left in the kernel (or raises NameError on a fresh one).
if methods[0] == 'embedd-er':
    if class_type == 'rdfs':
        print('Using EMBEDD-ER to embed RDFS classes...')
        df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(lambda x : camel_to_snake(x.split('/')[-1].split('#')[-1]))
        df_classes_embeddings = pd.DataFrame()
        df_classes_embeddings['Classes Embeddings'] = df_concepts_classes['Class Name'].apply(lambda x : wikipedia2vec_embedding(wikipedia2vec, x, 300))
    elif class_type == 'dct':
        print('Using EMBEDD-ER to embed DCT classes...')
        df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(lambda x : x.split('/')[-1].split(':')[-1])
        df_classes_embeddings = pd.DataFrame()
        df_classes_embeddings['Classes Embeddings'] = df_concepts_classes['Class Name'].apply(lambda x : wikipedia2vec_embedding(wikipedia2vec, x, 300))
    # Saved as "classes/<class_type>_<first method>.csv", '|'-separated.
    df_classes_embeddings.to_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|')
    df_classes_embeddings.head()

Using EMBEDD-ER to embed DCT classes...


In [45]:
# Class embeddings for BERT. The class name is turned into plain words
# (split_camel_case for rdfs, split_snake_case for dct) and passed to
# `embeddings` as both sentences and resources; no model is passed.
# NOTE(review): when class_type is neither 'rdfs' nor 'dct',
# df_classes_embeddings is not assigned in this cell before it is saved.
if methods[0] == 'BERT':
    if class_type == 'rdfs':
        print('Using BERT to embed RDFS classes...')
        df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(lambda x : split_camel_case(x.split('/')[-1].split('#')[-1]))
        df_classes_embeddings = pd.DataFrame()
        df_classes_embeddings['Classes Embeddings'] = embeddings(df_concepts_classes['Class Name'].values, df_concepts_classes['Class Name'].values, methods, main_publisher, save = False)
    elif class_type == 'dct':
        print('Using BERT to embed DCT classes...')
        df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(lambda x : split_snake_case(x.split('/')[-1].split(':')[-1]))
        df_classes_embeddings = pd.DataFrame()
        df_classes_embeddings['Classes Embeddings'] = embeddings(df_concepts_classes['Class Name'].values, df_concepts_classes['Class Name'].values, methods, main_publisher, save = False)
    # Saved as "classes/<class_type>_<first method>.csv", '|'-separated.
    df_classes_embeddings.to_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|')
    df_classes_embeddings.head()

Using BERT to embed RDFS classes...


In [6]:
# Preview up to the first 20 concept/class rows.
df_concepts_classes.head(20)

Unnamed: 0,Concept,Class
0,Antibiotic,http://dbpedia.org/resource/Category:Anti-infe...
1,Antibiotic,http://dbpedia.org/resource/Category:Antibiotics
2,Antibiotic,http://dbpedia.org/resource/Category:Bactericides
3,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Veterinar...
4,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Global_is...
5,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Health_di...
6,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Pharmaceu...
7,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Evolution...
8,Antimicrobial_resistance,http://dbpedia.org/resource/Category:Antimicro...
9,Ape,http://dbpedia.org/resource/Category:Apes


In [175]:
# Switch the run configuration to another publisher and rebuild the
# embeddings directory path. `script_dir` comes from the configuration cell;
# `data_path` is not recomputed here.
main_publisher = "Stanford"
methods = ['embedd-er']

embeddings_path = os.path.join(script_dir, '../Data/' + main_publisher + '/embeddings/')

In [None]:
# Reload the saved chapter embeddings for the current publisher and method,
# using the first CSV column as the index.
df_chapters_embeddings = pd.read_csv(embeddings_path + "chapters_" + methods[0] + ".csv", sep = '|', index_col=0)
df_chapters_embeddings.head()

In [None]:
# Rename the 'EMBEDD-ER' column (the name `embeddings` gives it) to
# 'Chapters Embeddings'; rename() leaves the frame unchanged if it is absent.
df_chapters_embeddings = df_chapters_embeddings.rename(columns={'EMBEDD-ER' : 'Chapters Embeddings'})
df_chapters_embeddings.head()

In [105]:
# Overwrite the chapter embeddings file with the renamed column.
df_chapters_embeddings.to_csv(embeddings_path + "chapters_" + methods[0] + ".csv", sep = '|')

In [None]:
# Reload the saved concept embeddings for the current publisher and method,
# using the first CSV column as the index.
df_concepts_embeddings = pd.read_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|', index_col=0)
df_concepts_embeddings.head()

In [None]:
# Rename the singular 'Concepts Embedding' column to 'Concepts Embeddings',
# the name used when concept embeddings are created in this notebook.
df_concepts_embeddings = df_concepts_embeddings.rename(columns={'Concepts Embedding' : 'Concepts Embeddings'})
df_concepts_embeddings.head()

In [132]:
# Overwrite the concept embeddings file with the renamed column.
df_concepts_embeddings.to_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|')

In [179]:
# Select the class file and reload the matching saved class embeddings
# ("classes/<class_type>_<first method>.csv") for the current publisher.
class_type = 'rdfs' #dct or rdfs
class_file = class_type + ".csv"
df_classes_embeddings = pd.read_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|', index_col=0)
df_classes_embeddings.head()

Unnamed: 0,Embedding
0,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
1,[-0.0893 -0.2376 -0.5768 -0.2991 -0.4358 -0.88...
2,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
4,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....


In [180]:
# Rename the 'Embedding' column to 'Classes Embeddings', the name used when
# class embeddings are created in this notebook.
df_classes_embeddings = df_classes_embeddings.rename(columns={'Embedding' : 'Classes Embeddings'})
df_classes_embeddings.head()

Unnamed: 0,Classes Embeddings
0,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
1,[-0.0893 -0.2376 -0.5768 -0.2991 -0.4358 -0.88...
2,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
3,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....
4,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....


In [181]:
# Overwrite the class embeddings file with the renamed column.
df_classes_embeddings.to_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|')
