# Import Modules

In [1]:
import pandas as pd, numpy as np
from datasets import load_dataset
import re
import spacy
import en_core_web_sm
import string
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import gensim.downloader as api
import nltk
from nltk.corpus import wordnet
# from nltk.corpus import wordnet as wn
import wordninja
from textblob import Word
from sent2vec.vectorizer import Vectorizer
# from keras.models import Sequential
# from keras.layers import Dense, Concatenate, Flatten
# from keras.utils import to_categorical



# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')



nlp = spacy.load('en_core_web_sm')
word2vec_model = api.load("word2vec-google-news-300")

# !python -m spacy download en_core_web_md


# Extract Entities

In [2]:
def extract_entities(sentence):
    # Entity Tag
    e1 = re.search(r'<e1>(.*?)</e1>', sentence).group(1).lower()
    e2 = re.search(r'<e2>(.*?)</e2>', sentence).group(1).lower()
    
    if e1 == 'devision':
        e1 = 'division'
    elif e1 == 'offfender':
        e1 = 'offender'
        
    if e2 == 'devision':
        e2 = 'division'
    elif e2 == 'offfender':
        e2 = 'offender'
        
    sentence = sentence.replace('>devision<', '>division<').replace('offfender','offender').lower()
        
    return pd.Series([e1, e2, sentence], index=['e1', 'e2', 'sentence'])

In [142]:
data = load_dataset("sem_eval_2010_task_8", split = "train")
train_df = pd.DataFrame(data)
train_df[['e1', 'e2','sentence']] = train_df['sentence'].apply(extract_entities)

for i in train_df.to_dict('records'):
    print (i)
    break

Found cached dataset sem_eval_2010_task_8 (/Users/rahulsen/.cache/huggingface/datasets/sem_eval_2010_task_8/default/1.0.0/8545d1995bbbade386acf5c4e2bef5589d8387ae0a93356407dfb54cdb234416)


{'sentence': 'the system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.', 'relation': 3, 'e1': 'configuration', 'e2': 'elements'}


In [156]:
from spellchecker import SpellChecker
spell_checker = SpellChecker(distance=1)
count = 0

def text_to_vectors(text,sentence):
    global count
    
    modified_punctuation_set = ''.join(char for char in string.punctuation)
    sentence = re.sub(r'\s([' + re.escape(modified_punctuation_set) + '])', r'\1', sentence)
    
    if text in word2vec_model:
        return pd.Series([word2vec_model[text],0, sentence, text], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
    
    elif text.replace('-','') in word2vec_model:
        return pd.Series([word2vec_model[text.replace('-','')],0, sentence.replace(text, text.replace('-','')), text.replace('-','')], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
    
    elif text.replace('-',' ').split()[-1] in word2vec_model:
        return pd.Series([word2vec_model[text.replace('-',' ').split()[-1]],0, sentence.replace(text, text.replace('-',' ')), text.replace('-',' ')], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
    
    elif text.replace('-',' ').split()[0] in word2vec_model:
        return pd.Series([word2vec_model[text.replace('-',' ').split()[0]],0, sentence.replace(text, text.replace('-',' ')), text.replace('-',' ')], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
    
    elif spell_checker.correction(text) in word2vec_model:
        return pd.Series([word2vec_model[spell_checker.correction(text)],0, sentence.replace(text, spell_checker.correction(text)), spell_checker.correction(text)], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
    
#     add another elif to break some words into 2 segments
    else:
        count = count+1
        return pd.Series([np.zeros(word2vec_model.vector_size),1, sentence, text], index=['entity_vectors', 'flag', 'corrected_sentence', 'text'])
        

train_df['corrected_sentence'] = train_df['sentence'].str.replace('<e1>',' ').str.replace('</e1>',' ').str.replace('<e2>',' ').str.replace('</e2>',' ').str.replace('   ', '  ').str.replace('  ', ' ').str.strip()
train_df[['entity1_vectors','flag','corrected_sentence','corrected_e1']] = train_df[['e1','corrected_sentence']].apply(lambda x: text_to_vectors(x['e1'],x['corrected_sentence']),axis=1)
print (count)
train_df[['entity2_vectors','flag','corrected_sentence','corrected_e2']] = train_df[['e2','corrected_sentence']].apply(lambda x: text_to_vectors(x['e2'],x['corrected_sentence']),axis=1)
train_df['corrected_sentence'] = train_df['corrected_sentence'].str.replace('-',' ').str.replace('   ', '  ').str.replace('  ', ' ').str.strip()

print (count)


78
123


In [157]:
train_df['entity_diff_vector'] = train_df['entity2_vectors']-train_df['entity1_vectors']

In [158]:
def get_synsets(word):
    words = word.replace('-',' ').split(' ')
    new_words = []
    for i in words:
        new_words = new_words + wordninja.split(nlp(i)[0].lemma_)
    synsets = []
    for item in new_words:
        item_synset = wordnet.synsets(item)
        final_item_synset = []
        for synset in item_synset:
            if synset.name().split('.')[1]=='n':
                final_item_synset.append(synset)
        if len(synsets)>0 and len(final_item_synset)==0:
            continue
        synsets.append(final_item_synset)
    return synsets[-1]

def get_word_pos(word):
    pos_tag = nltk.pos_tag([word])[0][1]
    if pos_tag.startswith('N'):  # Noun
        return 'n'
    elif pos_tag.startswith('V'):  # Verb
        return 'v'
    elif pos_tag.startswith('J'):  # Adjective
        return 'a'
    else:
        return None

def filter_words_by_pos(sentence):
    # Tokenize the sentence
    tokens = nltk.word_tokenize(sentence)
    # Filter tokens by POS (noun, verb, adjective)
    filtered_tokens = [token for token in tokens if get_word_pos(token) is not None]
    return filtered_tokens

def disambiguate_word_in_sentence(sentence, word):

    # Tokenize the sentence
    tokens = filter_words_by_pos(sentence.replace(word,''))
    
    # Get synsets for the word
    synsets = wordnet.synsets(word.replace(' ','_'))
    if len(synsets)==0:
        synsets = get_synsets(word)
        
    # Initialize a dictionary to store scores for each synset
    scores = {}
    if len(synsets)==0:
        print (word)
    
    # Calculate scores for each synset based on word embeddings similarity
    for synset in synsets:
        gloss = synset.definition()
        gloss_tokens = filter_words_by_pos(gloss)

        # Calculate average embedding for gloss tokens
        gloss_embeddings = [word2vec_model[token] for token in gloss_tokens if token in word2vec_model]
        avg_gloss_embedding = np.mean(gloss_embeddings, axis=0)
        # Calculate similarity score based on cosine similarity between avg_gloss_embedding and each token in sentence
        similarity_scores = [np.dot(avg_gloss_embedding, word2vec_model[token]) /
                             (np.linalg.norm(avg_gloss_embedding) * np.linalg.norm(word2vec_model[token]))
                             for token in tokens if token in word2vec_model]
        scores[synset] = np.mean(similarity_scores)
    
    # Get the synset with the highest score
    if len(synsets)>0:
        best_synset = max(scores, key=scores.get)
    else:
        best_synset = wordnet.synsets('unavailable')[0]
    
    return best_synset

def get_hypernym(synset1, synset2):
    common_hypernym = synset1.lowest_common_hypernyms(synset2)
    return common_hypernym[0].lemmas()[0].name() if common_hypernym else None

def increase_flag(e1_synset,e2_synset,flag):
    if e1_synset.name().split('.')[0]=='unavailable':
        flag = flag+1
    if e2_synset.name().split('.')[0]=='unavailable':
        flag = flag+1
    return flag

train_df['e1_synset'] = np.vectorize(disambiguate_word_in_sentence)(train_df['corrected_sentence'],train_df['corrected_e1'])
train_df['e2_synset'] = np.vectorize(disambiguate_word_in_sentence)(train_df['corrected_sentence'],train_df['corrected_e2'])
train_df['e1_supersense'] = train_df.apply(lambda x: x['e1_synset'].lexname().split('.')[-1],axis=1)
train_df['e2_supersense'] = train_df.apply(lambda x: x['e2_synset'].lexname().split('.')[-1],axis=1)
train_df['hypernym'] = np.vectorize(get_hypernym)(train_df['e1_synset'],train_df['e2_synset'])
train_df['e1_definition'] = train_df.apply(lambda x: x['e1_synset'].definition(),axis=1)
train_df['e2_definition'] = train_df.apply(lambda x: x['e2_synset'].definition(),axis=1)

train_df



councilor
indenter
nye
helicobacter
litho
spinoff


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


piezos
recognizer
chai
muscularis
floodwaters
mage
santur
gamers
natron
installer
audiophile
opioids
acetonitrile
azeotrope
app
eyrar
joey
toolkits
orienteering
biodiesel
humidifier
motherboard
login
shockwaves
entrepreneurship
supercell
bytecodes
slideshows
blogosphere
stovetop
toolkit
katanas
malware
townhouse
automotives
maireeners
papillomavirus
photomultiplier
cementation
apps
microcontroller
quilters
mesophyll
worldview
firefighting
folksingers
groundwater
dialysate
groundwater
geodesist
sparrowhawks
biochemicals
masterplan
iphone
ceasefire
politicization
anisotropic etching
skeletal remains
trapdoors
stylesheet


Unnamed: 0,sentence,relation,e1,e2,corrected_sentence,entity1_vectors,flag,corrected_e1,entity2_vectors,corrected_e2,...,e2_post_token,e2_start,e2_end,e2_position,num_verbs,num_nouns,num_prep,num_adj,num_words_btwn,bert_sentence
0,the system as described above has its greatest...,3,configuration,elements,the system as described above has its greatest...,"[0.13964844, 0.07373047, -0.037841797, 0.10302...",0,configuration,"[0.10888672, 0.1796875, 0.107910156, 0.0198974...",elements,...,.,98,106,16,0,2,1,0,3,the system as described above has its greatest...
1,the <e1>child</e1> was carefully wrapped and b...,18,child,cradle,the child was carefully wrapped and bound into...,"[0.16503906, -0.063964844, -0.0017852783, 0.18...",0,child,"[0.15917969, 0.076171875, 0.01953125, 0.21875,...",cradle,...,by,51,57,10,2,1,1,0,8,the child was carefully wrapped and bound into...
2,the <e1>author</e1> of a keygen uses a <e2>dis...,11,author,disassembler,the author of a keygen uses a disassembler to ...,"[0.12988281, -0.140625, 0.041748047, 0.0927734...",0,author,"[0.15332031, 0.095703125, 0.010925293, 0.05639...",disassembler,...,to,30,42,8,1,2,1,0,6,the author of a keygen uses a disassembler to ...
3,a misty <e1>ridge</e1> uprises from the <e2>su...,18,ridge,surge,a misty ridge uprises from the surge.,"[-0.09472656, -0.059814453, -0.203125, 0.06298...",0,ridge,"[0.104003906, 0.13574219, -0.10644531, -0.0260...",surge,...,.,31,36,7,0,2,1,0,4,a misty ridge uprises from the surge.
4,the <e1>student</e1> <e2>association</e2> is t...,12,student,association,the student association is the voice of the un...,"[0.036865234, 0.020141602, 0.22167969, 0.15527...",0,student,"[-0.20703125, -0.28710938, 0.03564453, -0.0859...",association,...,is,12,23,3,0,4,3,0,13,the student association is the voice of the un...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,when the <e1>notice</e1> is sent by <e2>fax</e...,18,notice,fax,"when the notice is sent by fax, the notice is ...","[-0.19824219, 0.08984375, 0.20898438, -0.16699...",0,notice,"[-0.26171875, -0.039794922, 0.12890625, -0.097...",fax,...,",",27,30,7,4,3,3,0,16,"when the notice is sent by fax, the notice is ..."
7996,the <e1>herbicide</e1> is derived from a natur...,8,herbicide,antibiotic,the herbicide is derived from a natural antibi...,"[0.20019531, 0.29492188, -0.115234375, 0.26953...",0,herbicide,"[-0.14550781, 0.20019531, 0.07861328, -0.03955...",antibiotic,...,",",40,50,8,1,1,1,1,6,the herbicide is derived from a natural antibi...
7997,"to test this, we placed a kitchen <e1>match</e...",6,match,jar,"to test this, we placed a kitchen match in the...","[-0.15527344, 0.025024414, 0.064941406, -0.124...",0,match,"[0.11621094, 0.11621094, 0.043945312, 0.267578...",jar,...,.,47,50,12,2,3,2,0,12,"to test this, we placed a kitchen match in the..."
7998,the farmers and city officials in the region h...,18,farmers,market,the farmers and city officials in the region h...,"[0.17578125, 0.07470703, -0.24707031, 0.233398...",0,farmers,"[-0.15625, -0.087890625, -0.22949219, -0.23144...",market,...,into,95,101,19,3,11,2,1,26,the farmers and city officials in the region h...


In [148]:
def split_word(word):
    for i in range(1, len(word)):
        prefix = word[:i]
        suffix = word[i:]
        if prefix in word2vec_model and suffix in word2vec_model:
            return prefix, suffix
    return None, None

word = "biochemicals"
prefix, suffix = split_word(word)
if prefix and suffix:
    print(f"Splitting '{word}' into '{prefix}' and '{suffix}'")
else:
    print(f"Cannot split '{word}' into two meaningful words.")

Splitting 'biochemicals' into 'bio' and 'chemicals'


In [159]:
train_df['flag'] = np.vectorize(increase_flag)(train_df['e1_synset'],train_df['e2_synset'],train_df['flag'])

In [160]:
def extract_features( e1, e2, old_sentence):

    old_sentence = old_sentence.replace('<e1>','').replace('</e1>','').replace('<e2>','').replace('</e2>','')
    old_doc = nlp(old_sentence)

    e1_pos, e2_pos = None, None
    e1_dep_token, e2_dep_token = 'NA.', 'NA.'
    e1_prev_token, e2_prev_token = 'NA.', 'NA.'
    e1_post_token, e2_post_token = 'NA.', 'NA.'
    
    e1_dep_noun, e1_dep_adj, e1_dep_verb, e1_dep_prep, e1_dep_subj, e1_dep_obj = 0, 0, 0, 0, 0, 0
    e2_dep_noun, e2_dep_adj, e2_dep_verb, e2_dep_prep, e2_dep_subj, e2_dep_obj = 0, 0, 0, 0, 0, 0
    num_verbs, num_nouns, num_prep, num_adj, num_words_btwn = 0, 0, 0, 0, 0 
    start_inbetween_count = False
    
    word_count = 0
    e1_position_set = e2_position_set = False
    e1_post_memory = e2_post_memory = False
    memory = 'NA.'
    e1_position = e2_position = -1
    for token in old_doc:
        word_count = word_count + 1
        
        if e1_post_memory == True:
            e1_post_token = str(token)
        e1_post_memory = False
        
        if e2_post_memory == True:
            e2_post_token = str(token)
        e2_post_memory = False
        
        if str(token) in e1.split():
            e1_pos = token.pos_
            e1_dep_noun = 1 if token.dep_[0]=='n' else 0
            e1_dep_adj = 1 if token.dep_[0]=='a' else 0
            e1_dep_verb = 1 if token.dep_[0]=='v' else 0
            e1_dep_prep = 1 if token.dep_[0]=='p' else 0
            e1_dep_subj = 1 if 'subj' in token.dep_ else 0
            e1_dep_obj = 1 if 'obj' in token.dep_ else 0
            e1_dep_token = str(token.head)
            
            if e1_prev_token == 'NA.':
                e1_prev_token = memory
                
            if e1_position_set == False:
                e1_position = word_count
                e1_position_set = True
                
            e1_post_memory = True
            
            
        if str(token) in e2.split(): 
            e2_pos = token.pos_
            e2_dep_noun = 1 if token.dep_[0]=='n' else 0
            e2_dep_adj = 1 if token.dep_[0]=='a' else 0
            e2_dep_verb = 1 if token.dep_[0]=='v' else 0
            e2_dep_prep = 1 if token.dep_[0]=='p' else 0
            e2_dep_subj = 1 if 'subj' in token.dep_ else 0
            e2_dep_obj = 1 if 'obj' in token.dep_ else 0
            e2_dep_token = str(token.head)
            
            if e2_prev_token == 'NA.':
                e2_prev_token = memory
                
            if e2_position_set == False:
                e2_position = word_count
                e2_position_set = True
                
            e2_post_memory = True
            
        if (str(token) in [e1.split()[-1],e2.split()[-1]] and start_inbetween_count==False) or (str(token) in [e1.split()[0],e2.split()[0]] and start_inbetween_count==True):
            start_inbetween_count = not(start_inbetween_count)
            
        if start_inbetween_count==True:
            num_verbs = num_verbs+1 if token.pos_=='VERB' else num_verbs
            num_nouns = num_nouns+1 if token.pos_=='NOUN' else num_nouns
            num_prep = num_prep+1 if token.pos_=='ADP' else num_prep
            num_adj = num_adj+1 if token.pos_=='ADJ' else num_adj
            num_words_btwn = num_words_btwn+1 
            
        memory = str(token)

    
    e1_start = old_sentence.find(e1)
    e1_end = e1_start + len(e1)
    e2_start = old_sentence.find(e2)
    e2_end = e2_start + len(e2)

    
    return pd.Series([e1_pos, e1_dep_noun, e1_dep_adj, e1_dep_verb, e1_dep_prep, e1_dep_subj, e1_dep_obj, e1_dep_token, e1_prev_token, e1_post_token, e1_start, e1_end, e1_position, e2_pos, e2_dep_noun, e2_dep_adj, e2_dep_verb, e2_dep_prep, e2_dep_subj, e2_dep_obj, e2_dep_token, e2_prev_token, e2_post_token, e2_start, e2_end, e2_position, num_verbs, num_nouns, num_prep, num_adj, num_words_btwn], index=['e1_pos', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_pos', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn'])



def get_entity_type(text):
    doc = nlp(text)
    entity_types = [ent.label_ for ent in doc.ents]
    return entity_types[0] if entity_types else None



train_df[['e1_pos', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_pos', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn']] = train_df.apply(lambda x: extract_features(x['corrected_e1'],x['corrected_e2'],x['corrected_sentence']), axis=1)
train_df

Unnamed: 0,sentence,relation,e1,e2,corrected_sentence,entity1_vectors,flag,corrected_e1,entity2_vectors,corrected_e2,...,e2_post_token,e2_start,e2_end,e2_position,num_verbs,num_nouns,num_prep,num_adj,num_words_btwn,bert_sentence
0,the system as described above has its greatest...,3,configuration,elements,the system as described above has its greatest...,"[0.13964844, 0.07373047, -0.037841797, 0.10302...",0,configuration,"[0.10888672, 0.1796875, 0.107910156, 0.0198974...",elements,...,.,98,106,16,0,2,1,0,3,the system as described above has its greatest...
1,the <e1>child</e1> was carefully wrapped and b...,18,child,cradle,the child was carefully wrapped and bound into...,"[0.16503906, -0.063964844, -0.0017852783, 0.18...",0,child,"[0.15917969, 0.076171875, 0.01953125, 0.21875,...",cradle,...,by,51,57,10,2,1,1,0,8,the child was carefully wrapped and bound into...
2,the <e1>author</e1> of a keygen uses a <e2>dis...,11,author,disassembler,the author of a keygen uses a disassembler to ...,"[0.12988281, -0.140625, 0.041748047, 0.0927734...",0,author,"[0.15332031, 0.095703125, 0.010925293, 0.05639...",disassembler,...,to,30,42,8,1,2,1,0,6,the author of a keygen uses a disassembler to ...
3,a misty <e1>ridge</e1> uprises from the <e2>su...,18,ridge,surge,a misty ridge uprises from the surge.,"[-0.09472656, -0.059814453, -0.203125, 0.06298...",0,ridge,"[0.104003906, 0.13574219, -0.10644531, -0.0260...",surge,...,.,31,36,7,0,2,1,0,4,a misty ridge uprises from the surge.
4,the <e1>student</e1> <e2>association</e2> is t...,12,student,association,the student association is the voice of the un...,"[0.036865234, 0.020141602, 0.22167969, 0.15527...",0,student,"[-0.20703125, -0.28710938, 0.03564453, -0.0859...",association,...,is,12,23,3,0,4,3,0,13,the student association is the voice of the un...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,when the <e1>notice</e1> is sent by <e2>fax</e...,18,notice,fax,"when the notice is sent by fax, the notice is ...","[-0.19824219, 0.08984375, 0.20898438, -0.16699...",0,notice,"[-0.26171875, -0.039794922, 0.12890625, -0.097...",fax,...,",",27,30,7,4,3,3,0,16,"when the notice is sent by fax, the notice is ..."
7996,the <e1>herbicide</e1> is derived from a natur...,8,herbicide,antibiotic,the herbicide is derived from a natural antibi...,"[0.20019531, 0.29492188, -0.115234375, 0.26953...",0,herbicide,"[-0.14550781, 0.20019531, 0.07861328, -0.03955...",antibiotic,...,",",40,50,8,1,1,1,1,6,the herbicide is derived from a natural antibi...
7997,"to test this, we placed a kitchen <e1>match</e...",6,match,jar,"to test this, we placed a kitchen match in the...","[-0.15527344, 0.025024414, 0.064941406, -0.124...",0,match,"[0.11621094, 0.11621094, 0.043945312, 0.267578...",jar,...,.,47,50,12,2,3,2,0,12,"to test this, we placed a kitchen match in the..."
7998,the farmers and city officials in the region h...,18,farmers,market,the farmers and city officials in the region h...,"[0.17578125, 0.07470703, -0.24707031, 0.233398...",0,farmers,"[-0.15625, -0.087890625, -0.22949219, -0.23144...",market,...,into,95,101,19,3,11,2,1,26,the farmers and city officials in the region h...


In [161]:
def bert_sentence(new_sentence, e1, e2, e1_def, e2_def):
#     new_sentence = new_sentence + ' what is the relationship between ' + e1 + ' and ' + e2 + ' where ' + e1 + ' is defined as ' + e1_def + ' and ' + e2 + ' is defined as ' + e2_def

    return new_sentence

train_df['bert_sentence'] = train_df.apply(lambda x: bert_sentence(x['corrected_sentence'],x['corrected_e1'],x['corrected_e2'],x['e1_definition'],x['e2_definition']),axis=1)
# train_df = train_df[['new_sentence','entity_diff_vector','e1_supersense','e2_supersense','hypernym','relation']]
train_df[['bert_sentence','entity_diff_vector','e1_supersense','e2_supersense','hypernym','relation']]

Unnamed: 0,bert_sentence,entity_diff_vector,e1_supersense,e2_supersense,hypernym,relation
0,the system as described above has its greatest...,"[-0.030761719, 0.10595703, 0.14575195, -0.0831...",cognition,artifact,entity,3
1,the child was carefully wrapped and bound into...,"[-0.005859375, 0.14013672, 0.021316528, 0.0341...",person,location,object,18
2,the author of a keygen uses a disassembler to ...,"[0.0234375, 0.23632812, -0.030822754, -0.03637...",person,quantity,entity,11
3,a misty ridge uprises from the surge.,"[0.19873047, 0.19555664, 0.09667969, -0.088989...",object,motion,,18
4,the student association is the voice of the un...,"[-0.24389648, -0.30725098, -0.18603516, -0.241...",person,group,entity,12
...,...,...,...,...,...,...
7995,"when the notice is sent by fax, the notice is ...","[-0.06347656, -0.12963867, -0.080078125, 0.069...",communication,communication,,18
7996,the herbicide is derived from a natural antibi...,"[-0.34570312, -0.09472656, 0.19384766, -0.3090...",substance,artifact,matter,8
7997,"to test this, we placed a kitchen match in the...","[0.27148438, 0.09118652, -0.020996094, 0.39208...",artifact,quantity,entity,6
7998,the farmers and city officials in the region h...,"[-0.33203125, -0.16259766, 0.017578125, -0.464...",person,artifact,whole,18


In [166]:
def text2vectors(text):

    try:
        if text in word2vec_model:
            return word2vec_model[text]

        elif text.replace('-','') in word2vec_model:
            return word2vec_model[text.replace('-','')]

        elif text.replace('-',' ').split()[-1] in word2vec_model:
            return word2vec_model[text.replace('-',' ').split()[-1]]

        elif text.replace('-',' ').split()[0] in word2vec_model:
            return word2vec_model[text.replace('-',' ').split()[0]]

        elif spell_checker.correction(text) in word2vec_model:
            return word2vec_model[spell_checker.correction(text)]

        else:
            return np.zeros(word2vec_model.vector_size)
    except:
        print (1, text, 2)

train_df['e1_supersense'] = train_df['e1_supersense'].apply(text2vectors)
train_df['e2_supersense'] = train_df['e2_supersense'].apply(text2vectors)
train_df['hypernym'] = train_df['hypernym'].apply(text2vectors)
train_df['e1_definition_embedding'] = train_df['e1_definition'].apply(text2vectors)
train_df['e2_definition_embedding'] = train_df['e2_definition'].apply(text2vectors)
train_df['e1_prev_token'] = train_df['e1_prev_token'].apply(text2vectors)
train_df['e2_prev_token'] = train_df['e2_prev_token'].apply(text2vectors)
train_df['e1_post_token'] = train_df['e1_post_token'].apply(text2vectors)
train_df['e2_post_token'] = train_df['e2_post_token'].apply(text2vectors)
train_df['e1_dep_token'] = train_df['e1_dep_token'].apply(text2vectors)
train_df['e2_dep_token'] = train_df['e2_dep_token'].apply(text2vectors)

In [178]:
train_df[['corrected_sentence','entity_diff_vector','entity1_vectors','e1_supersense','e2_supersense','hypernym','relation']]

Unnamed: 0,corrected_sentence,entity_diff_vector,entity1_vectors,e1_supersense,e2_supersense,hypernym,relation
0,the system as described above has its greatest...,"[-0.030761719, 0.10595703, 0.14575195, -0.0831...","[0.13964844, 0.07373047, -0.037841797, 0.10302...","[0.18066406, -0.0107421875, -0.044677734, 0.13...","[0.26953125, 0.12695312, -0.07470703, 0.051513...","[0.09326172, -0.43945312, 0.083984375, 0.04980...",3
1,the child was carefully wrapped and bound into...,"[-0.005859375, 0.14013672, 0.021316528, 0.0341...","[0.16503906, -0.063964844, -0.0017852783, 0.18...","[0.27539062, -0.24707031, 0.017211914, 0.16796...","[0.032714844, -0.096191406, 0.044189453, 0.173...","[0.29296875, -0.06298828, 0.083496094, 0.04345...",18
2,the author of a keygen uses a disassembler to ...,"[0.0234375, 0.23632812, -0.030822754, -0.03637...","[0.12988281, -0.140625, 0.041748047, 0.0927734...","[0.27539062, -0.24707031, 0.017211914, 0.16796...","[0.0146484375, 0.13378906, 0.2734375, -0.01025...","[0.09326172, -0.43945312, 0.083984375, 0.04980...",11
3,a misty ridge uprises from the surge.,"[0.19873047, 0.19555664, 0.09667969, -0.088989...","[-0.09472656, -0.059814453, -0.203125, 0.06298...","[0.29296875, -0.06298828, 0.083496094, 0.04345...","[0.068847656, 0.05078125, -0.19140625, 0.19628...","[-0.49414062, -0.12890625, 0.13476562, 0.02807...",18
4,the student association is the voice of the un...,"[-0.24389648, -0.30725098, -0.18603516, -0.241...","[0.036865234, 0.020141602, 0.22167969, 0.15527...","[0.27539062, -0.24707031, 0.017211914, 0.16796...","[-0.021972656, 0.015197754, -0.029907227, 0.00...","[0.09326172, -0.43945312, 0.083984375, 0.04980...",12
...,...,...,...,...,...,...,...
7995,"when the notice is sent by fax, the notice is ...","[-0.06347656, -0.12963867, -0.080078125, 0.069...","[-0.19824219, 0.08984375, 0.20898438, -0.16699...","[0.0026855469, -0.23339844, -0.080566406, -0.0...","[0.0026855469, -0.23339844, -0.080566406, -0.0...","[-0.49414062, -0.12890625, 0.13476562, 0.02807...",18
7996,the herbicide is derived from a natural antibi...,"[-0.34570312, -0.09472656, 0.19384766, -0.3090...","[0.20019531, 0.29492188, -0.115234375, 0.26953...","[0.111328125, -0.013000488, 0.32421875, -0.192...","[0.26953125, 0.12695312, -0.07470703, 0.051513...","[0.107910156, 0.016601562, 0.076171875, 0.0202...",8
7997,"to test this, we placed a kitchen match in the...","[0.27148438, 0.09118652, -0.020996094, 0.39208...","[-0.15527344, 0.025024414, 0.064941406, -0.124...","[0.26953125, 0.12695312, -0.07470703, 0.051513...","[0.0146484375, 0.13378906, 0.2734375, -0.01025...","[0.09326172, -0.43945312, 0.083984375, 0.04980...",6
7998,the farmers and city officials in the region h...,"[-0.33203125, -0.16259766, 0.017578125, -0.464...","[0.17578125, 0.07470703, -0.24707031, 0.233398...","[0.27539062, -0.24707031, 0.017211914, 0.16796...","[0.26953125, 0.12695312, -0.07470703, 0.051513...","[0.07519531, -0.018920898, -0.0053710938, 0.23...",18


In [168]:
len(train_df[train_df['flag']>0])
# len(train_df[train_df['flag']>1])

108

In [60]:
# train_df['new_sentence_embedding'] = train_df['new_sentence'].apply(text_to_vectors)
# train_df['e1_definition_embedding'] = train_df['e1_definition'].apply(text_to_vectors)
# train_df['e2_definition_embedding'] = train_df['e2_definition'].apply(text_to_vectors)
# X_entity1_positionstart_array = train_df['e1_start'].values.reshape(-1, 1)
# train_df['sentence_embedding'] = train_df['sentence'].apply(text_to_vectors)

In [171]:
print (train_df['e1_pos'].unique())
print (train_df['e2_pos'].unique())

train_df_cpy = train_df.copy()
train_df = train_df[(train_df['relation']!=7)&(train_df['flag']==0)]
train_df['relation'].value_counts()

['NOUN' 'PROPN' 'ADJ' 'VERB' None 'NUM' 'ADV' 'AUX']
['NOUN' None 'ADJ' 'VERB' 'PROPN' 'ADV' 'DET' 'AUX']


relation
18    1394
6      836
1      653
13     599
8      558
14     487
3      464
2      457
11     397
17     389
4      372
0      339
16     317
5      164
9      146
15     144
10      97
12      78
Name: count, dtype: int64

In [204]:
train_df[train_df['relation']==18]

Unnamed: 0,sentence,relation,e1,e2,corrected_sentence,entity1_vectors,flag,entity2_vectors,corrected_e1,corrected_e2,...,e2_start,e2_end,num_verbs,num_nouns,num_prep,num_adj,num_words_btwn,e1_definition_embedding,e2_definition_embedding,bert_sentence
1,the <e1>child</e1> was carefully wrapped and b...,18,child,cradle,the child was carefully wrapped and bound into...,"[0.16503906, -0.063964844, -0.0017852783, 0.18...",0,"[0.15917969, 0.076171875, 0.01953125, 0.21875,...",child,cradle,...,51,57,2,1,1,0,8,"[0.15527344, 0.20898438, -0.15136719, -0.03271...","[0.063964844, -0.024536133, -0.033691406, 0.05...",the child was carefully wrapped and bound into...
3,a misty <e1>ridge</e1> uprises from the <e2>su...,18,ridge,surge,a misty ridge uprises from the surge.,"[-0.09472656, -0.059814453, -0.203125, 0.06298...",0,"[0.104003906, 0.13574219, -0.10644531, -0.0260...",ridge,surge,...,31,36,0,2,1,0,4,"[0.08691406, 0.24414062, 0.05834961, 0.0299072...","[0.028442383, 0.28710938, 0.033691406, -0.1904...",a misty ridge uprises from the surge. What is ...
5,this is the sprawling <e1>complex</e1> that is...,18,complex,producer,this is the sprawling complex that is peru's l...,"[0.013977051, 0.08984375, -0.00062179565, 0.03...",0,"[-0.018188477, -0.2734375, -0.12792969, 0.0101...",complex,producer,...,53,61,0,1,0,1,6,"[0.13867188, 0.16503906, 0.1328125, 0.18066406...","[-0.0859375, -0.060791016, 0.0859375, -0.13769...",this is the sprawling complex that is peru's l...
11,their <e1>composer</e1> has sunk into <e2>obli...,18,composer,oblivion,their composer has sunk into oblivion.,"[0.22363281, -0.24902344, 0.03491211, -0.19238...",0,"[0.18359375, -0.061035156, -0.084472656, 0.261...",composer,oblivion,...,29,37,1,1,1,0,4,"[-0.1171875, -0.18066406, -0.17089844, 0.18164...","[0.09863281, 0.26757812, 0.14648438, 0.1269531...",their composer has sunk into oblivion. What is...
23,his intellectually engaging books and <e1>essa...,18,essays,history,his intellectually engaging books and essays r...,"[-0.08251953, 0.10205078, 0.083496094, 0.21289...",0,"[0.096191406, 0.13574219, 0.13574219, 0.115234...",essays,history,...,91,98,2,1,1,2,6,"[0.16015625, 0.119140625, 0.16503906, -0.10595...","[-0.045166016, -0.07519531, -0.009399414, -0.2...",his intellectually engaging books and essays r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7980,"this is incorrect, and when a <e1>minicab</e1>...",18,minicab,company,"this is incorrect, and when a minicab company ...","[-0.19042969, -0.265625, 0.027954102, 0.060546...",0,"[-0.03564453, -0.13378906, -0.07324219, -0.093...",minicab,company,...,38,45,0,1,0,0,1,"[-0.21679688, -0.1640625, -0.265625, -0.298828...","[0.30273438, -0.20214844, -0.05444336, 0.18554...","this is incorrect, and when a minicab company ..."
7982,a <e1>facilitator</e1> keeps the <e2>discussio...,18,facilitator,discussion,a facilitator keeps the discussion focused and...,"[0.028564453, -0.21386719, 0.08105469, -0.1308...",0,"[-0.0390625, 0.042236328, 0.014465332, 0.11523...",facilitator,discussion,...,24,34,1,1,0,0,3,"[0.040039062, -0.057861328, -0.12109375, 0.075...","[0.14941406, 0.01159668, -0.013305664, 0.26953...",a facilitator keeps the discussion focused and...
7994,a v8 <e1>engine</e1> mated with a manual trans...,18,engine,concept,a v8 engine mated with a manual transmission p...,"[0.33789062, 0.008300781, 0.053222656, -0.0566...",0,"[0.25585938, 0.03564453, 0.041992188, 0.193359...",engine,concept,...,68,75,1,3,1,2,9,"[0.080566406, 0.044189453, 0.19824219, -0.0024...","[0.040771484, 0.21875, -0.1796875, 0.059570312...",a v8 engine mated with a manual transmission p...
7995,when the <e1>notice</e1> is sent by <e2>fax</e...,18,notice,fax,"when the notice is sent by fax, the notice is ...","[-0.19824219, 0.08984375, 0.20898438, -0.16699...",0,"[-0.26171875, -0.039794922, 0.12890625, -0.097...",notice,fax,...,27,30,4,3,3,0,16,"[-0.07128906, 0.0625, 0.10546875, 0.0019378662...","[0.25585938, -0.022094727, 0.029052734, 0.0544...","when the notice is sent by fax, the notice is ..."


In [None]:
sentence_vector=[]
batch_size = 500
for i in range(int(np.ceil(len(train_df)/batch_size))):
    print (i)
    vectorizer = Vectorizer()
    vectorizer.run(train_df[batch_size*i:batch_size*i+batch_size]['bert_sentence'].tolist())
    vectors = vectorizer.vectors
    sentence_vector = sentence_vector+vectors
new_sentence_embedding_new = np.vstack(sentence_vector)

0
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
1
Initializing Bert distilbert-base-uncased
Vectorization done on cpu


In [170]:
from sklearn.preprocessing import OneHotEncoder



# Concatenate vectors from all batches
# new_sentence_embedding_new = np.vstack(vectors)

# new_sentence_embedding_new = np.vstack(train_df['new_sentence_embedding'])
e1_definition_embedding_new = np.vstack(train_df['e1_definition_embedding'])
e2_definition_embedding_new = np.vstack(train_df['e2_definition_embedding'])
entity_diff_vector_new = np.vstack(train_df['entity_diff_vector'])
entity1_vectors_new = np.vstack(train_df['entity1_vectors'])
entity2_vectors_new = np.vstack(train_df['entity2_vectors'])
e1_supersense_new = np.vstack(train_df['e1_supersense'])
e2_supersense_new = np.vstack(train_df['e2_supersense'])
hypernym_new = np.vstack(train_df['hypernym'])
e1_dep_token = np.vstack(train_df['e1_dep_token'])
e2_dep_token = np.vstack(train_df['e2_dep_token'])
e1_prev_token = np.vstack(train_df['e1_prev_token'])
e2_prev_token = np.vstack(train_df['e2_prev_token'])
e1_post_token = np.vstack(train_df['e1_post_token'])
e2_post_token = np.vstack(train_df['e2_post_token'])

onehot_encoder_e1_pos = OneHotEncoder(sparse=False)
e1_pos = onehot_encoder_e1_pos.fit_transform(np.array(train_df['e1_pos']).reshape(-1,1))

e1_dep_noun = np.array(train_df['e1_dep_noun']).reshape(-1,1)
e1_dep_adj = np.array(train_df['e1_dep_adj']).reshape(-1,1)
e1_dep_verb = np.array(train_df['e1_dep_verb']).reshape(-1,1)
e1_dep_prep = np.array(train_df['e1_dep_prep']).reshape(-1,1)
e1_dep_subj = np.array(train_df['e1_dep_subj']).reshape(-1,1)
e1_dep_obj = np.array(train_df['e1_dep_obj']).reshape(-1,1)

e1_start = np.array(train_df['e1_start']).reshape(-1,1)
e1_end = np.array(train_df['e1_end']).reshape(-1,1)

# e2_pos = np.array(train_df['e2_pos']).reshape(-1,1)
onehot_encoder_e2_pos = OneHotEncoder(sparse=False)
e2_pos = onehot_encoder_e2_pos.fit_transform(np.array(train_df['e2_pos']).reshape(-1,1))

e2_dep_noun = np.array(train_df['e2_dep_noun']).reshape(-1,1)
e2_dep_adj = np.array(train_df['e2_dep_adj']).reshape(-1,1)
e2_dep_verb = np.array(train_df['e2_dep_verb']).reshape(-1,1)
e2_dep_prep = np.array(train_df['e2_dep_prep']).reshape(-1,1)
e2_dep_subj = np.array(train_df['e2_dep_subj']).reshape(-1,1)
e2_dep_obj = np.array(train_df['e2_dep_obj']).reshape(-1,1)

e2_start = np.array(train_df['e2_start']).reshape(-1,1)
e2_end = np.array(train_df['e2_end']).reshape(-1,1)
num_verbs = np.array(train_df['num_verbs']).reshape(-1,1)
num_nouns = np.array(train_df['num_nouns']).reshape(-1,1)
num_prep = np.array(train_df['num_prep']).reshape(-1,1)
num_adj = np.array(train_df['num_adj']).reshape(-1,1)
num_words_btwn = np.array(train_df['num_words_btwn']).reshape(-1,1) 



# X = np.concatenate(( new_sentence_embedding_new, entity1_vectors_new, entity2_vectors_new, entity_diff_vector_new, e1_supersense_new, e2_supersense_new, hypernym_new, e1_pos,e1_dep_noun,e1_dep_adj,e1_dep_verb,e1_dep_prep,e1_dep_subj,e1_dep_obj,e1_start,e1_end,e2_pos,e2_dep_adj,e2_dep_verb,e2_dep_prep,e2_dep_subj,e2_dep_obj,e2_start,e2_end,num_verbs,num_nouns,num_prep,num_adj,num_words_btwn), axis=1)
# X = np.concatenate((new_sentence_embedding_new, hypernym_new, entity_diff_vector_new, e1_supersense_new, e2_supersense_new, e1_dep_noun,e1_dep_adj,e1_dep_verb,e1_dep_prep,e2_dep_noun, e2_dep_adj,e2_dep_verb,e2_dep_prep,num_words_btwn), axis=1)
X = np.concatenate((new_sentence_embedding_new, entity1_vectors_new, entity2_vectors_new, entity_diff_vector_new, hypernym_new, e1_dep_token, e2_dep_token, e1_prev_token, e1_post_token, e2_prev_token, e2_post_token, e1_supersense_new, e2_supersense_new,num_words_btwn, num_prep), axis=1)
y = np.array(train_df['relation'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# # Predictions
y_pred = svm_model.predict(X_test)

# # Model evaluation
print(classification_report(y_test, y_pred))



ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [47]:
# print (np.array(sentence_vector).shape)
print (len(new_sentence_embedding_new))

500


In [246]:
import torch
import torch.nn as nn
# from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
# Define your neural network model
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set hyperparameters
input_dim = X_train.shape[1]  # Dimension of input features
hidden_dim = 128  # Number of units in the hidden layer
output_dim = 19#X_train.shape[1] #len(y_train.unique())  # Number of output classes
print (output_dim)

# Initialize the model
model = NeuralNetwork(input_dim, hidden_dim, output_dim+1)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# Create a PyTorch DataLoader for training data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch_inputs, batch_labels in train_dataloader:
        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_inputs)
#         batch_labels = batch_labels.float()
        
        # Calculate loss
        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    # Print average training loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Convert validation data to PyTorch tensors
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)

# Create a PyTorch DataLoader for validation data
val_dataset = TensorDataset(X_val_tensor)
val_dataloader = DataLoader(val_dataset, batch_size=32)

# Put model in evaluation mode
model.eval()

# Make predictions on validation data
all_predictions = []
with torch.no_grad():
    for batch_inputs in val_dataloader:
        outputs = model(batch_inputs[0])
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.tolist())

# Convert predictions to a pandas Series
predictions_series = pd.Series(all_predictions)


19
Epoch 1/20, Average Loss: 1.8512
Epoch 2/20, Average Loss: 1.0340
Epoch 3/20, Average Loss: 0.8037
Epoch 4/20, Average Loss: 0.6690
Epoch 5/20, Average Loss: 0.5740
Epoch 6/20, Average Loss: 0.4993
Epoch 7/20, Average Loss: 0.4352
Epoch 8/20, Average Loss: 0.3753
Epoch 9/20, Average Loss: 0.3269
Epoch 10/20, Average Loss: 0.2824
Epoch 11/20, Average Loss: 0.2486
Epoch 12/20, Average Loss: 0.2110
Epoch 13/20, Average Loss: 0.1882
Epoch 14/20, Average Loss: 0.1577
Epoch 15/20, Average Loss: 0.1328
Epoch 16/20, Average Loss: 0.1121
Epoch 17/20, Average Loss: 0.0976
Epoch 18/20, Average Loss: 0.0821
Epoch 19/20, Average Loss: 0.0699
Epoch 20/20, Average Loss: 0.0594


In [247]:
print("Validation Classification Report:")
print(classification_report(y_test, predictions_series))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.79      0.84        68
           1       0.80      0.81      0.80       131
           2       0.63      0.81      0.71        91
           3       0.68      0.56      0.61        93
           4       0.78      0.88      0.83        74
           5       0.91      0.94      0.93        33
           6       0.86      0.86      0.86       167
           8       0.75      0.68      0.71       112
           9       0.87      0.90      0.88        29
          10       0.75      0.16      0.26        19
          11       0.73      0.73      0.73        79
          12       0.89      0.50      0.64        16
          13       0.89      0.86      0.87       120
          14       0.80      0.76      0.77        98
          15       0.88      0.72      0.79        29
          16       0.67      0.71      0.69        63
          17       0.84      0.67      0.74    

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Concatenate, Flatten
from keras.utils import to_categorical
from keras.utils import to_categorical

# One-hot encode the relation column
# y_onehot = to_categorical(train_df['relation'])

# max_length = max(sentence_embeddings.shape[0], entity1_embeddings.shape[0], entity2_embeddings.shape[0])

# # Pad embeddings arrays to the same length
# sentence_embeddings_padded = np.pad(sentence_embeddings, ((0, max_length - sentence_embeddings.shape[0]), (0, 0)), mode='constant')
# entity1_embeddings_padded = np.pad(entity1_embeddings, ((0, max_length - entity1_embeddings.shape[0]), (0, 0)), mode='constant')
# entity2_embeddings_padded = np.pad(entity2_embeddings, ((0, max_length - entity2_embeddings.shape[0]), (0, 0)), mode='constant')

# # Concatenate embeddings arrays along axis=1
# features = np.concatenate([sentence_embeddings_padded, entity1_embeddings_padded, entity2_embeddings_padded], axis=1)


# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(features, df['relation'], test_size=0.2, random_state=42)

# Define CNN model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(features.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(19, activation='softmax'))

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test Accuracy:', test_acc)
# Predictions
predictions_onehot = model.predict(X_test)
predictions_onehot
# predictions_labels = label_encoder.inverse_transform(predictions_onehot.argmax(axis=1))


In [39]:
relation_name = {
0: 'Cause-Effect(e1,e2)',
1: 'Cause-Effect(e2,e1)',
2: 'Component-Whole(e1,e2)',
3: 'Component-Whole(e2,e1)',
4: 'Content-Container(e1,e2)',
5: 'Content-Container(e2,e1)',
6: 'Entity-Destination(e1,e2)',
7: 'Entity-Destination(e2,e1)',
8: 'Entity-Origin(e1,e2)',
9: 'Entity-Origin(e2,e1)',
10: 'Instrument-Agency(e1,e2)', 
11: 'Instrument-Agency(e2,e1)',
12: 'Member-Collection(e1,e2)',
13: 'Member-Collection(e2,e1)',
14: 'Message-Topic(e1,e2)',
15: 'Message-Topic(e2,e1)',
16: 'Product-Producer(e1,e2)',
17: 'Product-Producer(e2,e1)',
18: 'Other'
}

{'e1_text': 'configuration', 'e2_text': 'elements', 'context': 'The system as described above has its greatest application in an arrayed  of antenna .'}
