In [12]:
import os
import pandas as pd
from embedding import embeddings

# Run configuration: publisher whose data is processed, embedding methods to
# run (only methods[0] is consulted by the guards further down), and which
# class vocabulary file to use.
main_publisher = 'OYC'
methods = ['BERT']
class_type = 'rdfs'  # 'dct' or 'rdfs'
class_file = f"{class_type}.csv"

# '__file__' is a quoted string literal here (not the module attribute), so
# realpath() resolves it against the current working directory and dirname()
# yields that directory.
script_dir = os.path.dirname(os.path.realpath('__file__'))
publisher_root = '../Data/' + main_publisher
data_path = os.path.join(script_dir, publisher_root + '/data/')
embeddings_path = os.path.join(script_dir, publisher_root + '/embeddings/')

# chapters.csv is pipe-separated.
df_chapters = pd.read_csv(data_path + 'chapters.csv', sep='|')

# Cast both text columns to str; missing values come out of astype(str) as the
# literal 'nan', which is swapped for a single space.
for text_column in ('Title', 'Text'):
    df_chapters[text_column] = df_chapters[text_column].astype(str)
    df_chapters[text_column] = df_chapters[text_column].apply(
        lambda cell: cell if cell != 'nan' else ' '
    )

# Record the Python type of every 'Text' cell as a string (sanity-check column).
df_chapters['Type'] = [str(type(cell)) for cell in df_chapters['Text']]
#df_chapters[df_chapters['Text'] == ' '].head()


In [13]:
# Used for both BERT and EMBEDD-ER: embed the chapter texts, passing the
# chapter ids (Cid) alongside them.
# NOTE(review): save=False presumably stops embeddings() from writing files
# itself -- confirm in embedding.py.
chapter_texts = df_chapters['Text'].values
chapter_ids = df_chapters['Cid'].values
df_chapters_embeddings = embeddings(
    chapter_texts,
    chapter_ids,
    methods,
    main_publisher,
    save=False,
)
df_chapters_embeddings.head()

Unnamed: 0,BERT
0,"[0.052457463, 0.47599125, 0.05521268, -0.28553..."
1,"[-0.012200231, 0.24475642, 0.11507651, -0.2445..."
2,"[0.0950356, 0.22532578, 0.17159453, -0.0779241..."
3,"[0.03151487, 0.44307682, 0.12105083, -0.171348..."
4,"[-0.03276513, 0.1495071, 0.1283632, -0.0709260..."


In [14]:
# Write the chapter embeddings to <embeddings_path>chapters_<method>.csv,
# pipe-separated, where <method> is the first configured method.
chapters_csv = f"{embeddings_path}chapters_{methods[0]}.csv"
df_chapters_embeddings.to_csv(chapters_csv, sep='|')

In [1]:
from embedding import load_model, wikipedia2vec_embedding

# Load the pretrained vectors used by the EMBEDD-ER cells.
# NOTE(review): the second argument (300) matches the "300d" in the file name
# and is presumably the vector dimensionality -- confirm in embedding.load_model.
wikipedia2vec_path = "../Models/enwiki_20180420_300d.txt"
wikipedia2vec = load_model(wikipedia2vec_path, 300)

  from .autonotebook import tqdm as notebook_tqdm


Loading 300 model ...
Loaded 300 model ...


In [88]:
# Embed every concept with EMBEDD-ER and save the result as a pipe-separated
# CSV. Runs only when the first configured method is 'embedd-er'.
if methods[0] == 'embedd-er':
    print('Using EMBEDD-ER to embed concepts...')
    df_concepts = pd.read_csv(data_path + 'concepts.csv', sep = '|')
    df_concepts_embeddings = pd.DataFrame()
    # Only the last '/'-separated segment of each concept identifier is passed
    # to the lookup; 300 is the same size argument given to load_model().
    df_concepts_embeddings['Concepts Embedding'] = df_concepts['Concept'].apply(
        lambda x : wikipedia2vec_embedding(wikipedia2vec, x.split('/')[-1], 300)
    )
    df_concepts_embeddings.to_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|')
    # The preview lives inside the guard: left at top level it raised NameError
    # on a fresh kernel whenever another method was selected (or showed a stale
    # frame produced by a different cell). display() is needed because a bare
    # expression nested in an `if` is not rendered by Jupyter; the kernel
    # provides display() without an import.
    display(df_concepts_embeddings.head())

Using EMBEDD-ER to embed concepts...


Unnamed: 0,Concepts Embedding
0,"[0.0484, 0.3576, -0.9482, -0.7079, -0.1311, 0...."
1,"[-0.3457, -0.0463, -0.5313, -0.0684, -0.4088, ..."
2,"[-0.1308, 0.36, -0.9123, -0.1759, -1.0825, -0...."
3,"[-0.4643, -0.0708, 0.1259, 0.3538, -0.9151, -0..."
4,"[-0.6656, -0.5116, -0.6724, 0.3429, -0.2628, 0..."


In [95]:
# Embed every concept label with BERT and save the result as a pipe-separated
# CSV. Runs only when the first configured method is 'BERT'.
if methods[0] == 'BERT':
    print('Using BERT to embed concepts...')
    df_concepts = pd.read_csv(data_path + 'concepts.csv', sep = '|')
    # Keep the last '/'-separated segment of each concept identifier and turn
    # underscores into spaces so the model receives plain words.
    df_concepts['Concept'] = df_concepts['Concept'].apply(lambda x : x.split('/')[-1].replace('_', ' '))
    df_concepts_embeddings = pd.DataFrame()
    # The cleaned labels are passed both as the texts and as their ids.
    df_concepts_embeddings['Concepts Embedding'] = embeddings(df_concepts.Concept.values, df_concepts.Concept.values, methods, main_publisher, save = False)
    df_concepts_embeddings.to_csv(embeddings_path + "concepts_" + methods[0] + ".csv", sep = '|')
    # A bare `.head()` nested inside the `if` is evaluated and thrown away --
    # Jupyter only renders the last top-level expression of a cell -- so the
    # preview is shown explicitly. The kernel provides display() without an import.
    display(df_concepts_embeddings.head())


Using BERT to embed concepts...


In [99]:
# Load the concept list and the concept -> class table for the configured
# class vocabulary; both files are pipe-separated. The class table's first
# column is used as its index.
concepts_csv = f"{data_path}concepts.csv"
classes_csv = f"{data_path}classes/{class_file}"
df_concepts = pd.read_csv(concepts_csv, sep='|')
df_concepts_classes = pd.read_csv(classes_csv, sep='|', index_col=0)
df_concepts_classes.head()

Unnamed: 0,Concept,Class
0,Breast_cancer,http://www.w3.org/2002/07/owl#Thing
1,Breast_cancer,http://dbpedia.org/ontology/Disease
2,Gene,http://www.w3.org/2002/07/owl#Thing
3,Gene,http://dbpedia.org/ontology/Organisation
4,Gene_expression,http://www.w3.org/2002/07/owl#Thing


In [36]:
import re
def camel_to_snake(word):
    """Convert a CamelCase identifier to snake_case with a capitalised first letter.

    Example: 'AnatomicalStructure' -> 'Anatomical_structure'.

    Args:
        word: The CamelCase string to convert.

    Returns:
        The underscore-separated string, lower-cased except for its first
        character.
    """
    # Pass 1: put '_' between any character and a following capitalised word
    # (one uppercase letter followed by one or more lowercase letters).
    after_word_pass = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', word)
    # Pass 2: put '_' between a lowercase letter or digit and the uppercase
    # letter that follows it (catches the start of an acronym run).
    underscored = re.sub('([a-z0-9])([A-Z])', r'\1_\2', after_word_pass)
    return underscored.lower().capitalize()

def split_camel_case(s):
    """Break a CamelCase identifier into space-separated words.

    Example: 'AnatomicalStructure' -> 'Anatomical Structure',
    'HTTPServer' -> 'HTTP Server'.

    Args:
        s: The CamelCase string to split.

    Returns:
        The string with spaces inserted at word boundaries; letter case is
        left untouched.
    """
    # Boundary between a lowercase letter and the uppercase letter after it.
    lower_upper_split = re.sub('([a-z])([A-Z])', r'\1 \2', s)
    # Boundary between an acronym (run of capitals) and the capitalised word
    # that starts with its last capital.
    return re.sub('([A-Z]+)([A-Z][a-z])', r'\1 \2', lower_upper_split)

def split_snake_case(s):
    """Return ``s`` with every underscore turned into a single space.

    Args:
        s: The underscore-separated string.

    Returns:
        The same text with each '_' replaced by ' '.
    """
    pieces = s.split('_')
    return ' '.join(pieces)

In [93]:
# Embed the class of every concept/class row with EMBEDD-ER and save the result
# as a pipe-separated CSV. Runs only when the first configured method is
# 'embedd-er'.
if methods[0] == 'embedd-er':
    # Choose how a class identifier is reduced to the name that gets looked up.
    if class_type == 'rdfs':
        print('Using EMBEDD-ER to embed RDFS classes...')
        # Text after the last '/' and '#', converted from CamelCase to
        # Capitalised_snake_case.
        to_class_name = lambda x : camel_to_snake(x.split('/')[-1].split('#')[-1])
    elif class_type == 'dct':
        print('Using EMBEDD-ER to embed DCT classes...')
        # Text after the last '/' and ':', used as-is.
        to_class_name = lambda x : x.split('/')[-1].split(':')[-1]
    else:
        # Previously an unknown class_type fell through to the save step and
        # either raised NameError or silently re-saved a stale frame.
        raise ValueError("class_type must be 'rdfs' or 'dct', got " + repr(class_type))

    # Shared tail (was duplicated in both branches): derive names, embed, save.
    df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(to_class_name)
    df_classes_embeddings = pd.DataFrame()
    df_classes_embeddings['Embedding'] = df_concepts_classes['Class Name'].apply(lambda x : wikipedia2vec_embedding(wikipedia2vec, x, 300))
    df_classes_embeddings.to_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|', index = True)
    # A bare `.head()` nested inside the `if` is evaluated and thrown away --
    # Jupyter only renders the last top-level expression of a cell -- so the
    # preview is shown explicitly. The kernel provides display() without an import.
    display(df_classes_embeddings.head())

Using EMBEDD-ER to embed DCT classes...


In [100]:
# Embed the class of every concept/class row with BERT and save the result as
# a pipe-separated CSV. Runs only when the first configured method is 'BERT'.
if methods[0] == 'BERT':
    # Choose how a class identifier is reduced to the words that get embedded.
    if class_type == 'rdfs':
        print('Using BERT to embed RDFS classes...')
        # Text after the last '/' and '#', with CamelCase split into words.
        to_class_name = lambda x : split_camel_case(x.split('/')[-1].split('#')[-1])
    elif class_type == 'dct':
        print('Using BERT to embed DCT classes...')
        # Text after the last '/' and ':', with underscores turned into spaces.
        to_class_name = lambda x : split_snake_case(x.split('/')[-1].split(':')[-1])
    else:
        # Previously an unknown class_type fell through to the save step and
        # either raised NameError or silently re-saved a stale frame.
        raise ValueError("class_type must be 'rdfs' or 'dct', got " + repr(class_type))

    # Shared tail (was duplicated in both branches): derive names, embed, save.
    df_concepts_classes['Class Name'] = df_concepts_classes['Class'].apply(to_class_name)
    df_classes_embeddings = pd.DataFrame()
    # The class names are passed both as the texts and as their ids.
    df_classes_embeddings['Embedding'] = embeddings(df_concepts_classes['Class Name'].values, df_concepts_classes['Class Name'].values, methods, main_publisher, save = False)
    df_classes_embeddings.to_csv(embeddings_path + "classes/"+ class_type + '_' + methods[0] + ".csv", sep = '|', index = True)
    # A bare `.head()` nested inside the `if` is evaluated and thrown away --
    # Jupyter only renders the last top-level expression of a cell -- so the
    # preview is shown explicitly. The kernel provides display() without an import.
    display(df_classes_embeddings.head())

Using BERT to embed RDFS classes...
