# NLP Relation Classification

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import numpy as np
import pickle
import tqdm
import nltk
from io import open
from google.colab import files

from pdb import set_trace as bk

# Dataset Processing

In [None]:
# Read in SemEval-2010 Task 8 Dataset
# https://github.com/sahitya0000/Relation-Classification

In [None]:
train_file = './TRAIN_FILE.TXT'

with open(train_file, 'r') as file:
  lines = [line.strip() for line in file]

for index in range(0, len(lines), 4):
  components = lines[index].split('\t')
  sentence_num = components[0]
  sentence = components[1][1:-1]
  label = lines[index+1]
  print(sentence)
  print(label)
  print()

In [None]:
pad_id = 0

In [None]:
'''
Raw input sample:

1	"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."
Component-Whole(e2,e1)
Comment: Not a collection: there is structure here, organisation.

'''


# Zhenda Li: I think we should process the data in a way similar to hw 3
# The following is copied from hw 3, I put 'pass' in the methods, but I think you can directly copy 
# paste them from hw3, if the 'sentence' input are in the same format 
class Vocabulary:
    def __init__(self):
        self.num_words = 100 # the vocab size, including the start tags, end tags, padding tags, ... our model will need this
    
    def get_ids_from_sentence(self, sentence):
        pass
    
    def tokenized_sentence(self, sentence):
        pass

    def decode_sentence_from_ids(self, sent_ids):
        pass

 
#  instantiate 
vocab = Vocabulary()


In [None]:
# create torch dataset, copied from hw 3
class Relation_Extraction_dataset(Dataset):
    """Single-Turn version of Cornell Movie Dialog Cropus dataset."""

    def __init__(self, vocab, device):
        pass
        
    def __len__(self):
        pass

    def __getitem__(self, idx):
        pass


def collate_fn(data):
    """Creates mini-batch tensors
    Args:
        data: ...
    Returns: dict { "ids":     (ids_list, e1_list, e2_list, tgts_list), 
                    "tokens":  (tokens_list, e1_list, e2_list, tgts_list), 
                    "tensors": (seqs, e1s, e2s, tgts)}  
            seqs: torch int tensor of shape [padded_length, batch_size].
            padded_length = length of the longest src sequence from src_ids
            e1s: torch int tensor of shape [batch_size]
            e2s: torch int tensor of shape [batch_size]
            tgts: torch int tensor of shape [batch_size]
    """
    return { 
            "ids": (ids_list, e1_list, e2_list, tgts_list), 
            "tokens":  (tokens_list, e1_list, e2_list, tgts_list), 
            "tensors": (seqs, e1s, e2s, tgts), # for now, this tuple is the most important one
        }


In [None]:
# Create the DataLoader for all_conversations
dataset = Relation_Extraction_dataset(...)

batch_size = 32

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

# Evaluation given model/outputs

In [None]:
# codes to get F1 score given a model
def eval_model():
    pass

def print_evaluation():
    pass

# Modeling

In [None]:
# BiLSTM Relation Classifier
# https://www.aclweb.org/anthology/Y15-1009.pdf

In [None]:
# baseline model: pure work embedding, BLSTM

class Baseline_BLSTM(nn.Module):
    def __init__(self, vocab, emb_dim=300, hidden_dim=300, num_layers=1, dropout=0.2, tgts_dim=10):
        super().__init__()

        self.num_words = num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers            
        self.tgts_dim = tgts_dim

        # archeture components
        self.embed = nn.Embedding(num_words, emb_dim)
        self.blstm = nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, bidirectional=True) 
        self.linear = nn.Linear(4*hidden_dim, tgts_dim) # 4 = 2 * 2, bidirectional and concate, 

        self.loss_fn = nn.CrossEntropyLoss(reduction='mean')


    def compute_loss(self, seqs, e1s, e2s, tgts):
        '''
        Args:
            seqs: int tensor [padded_length, batch_size]
            e1s: int tensor [batch_size]
            e2s: int tensor [batch_size]
            tgts: int tensor [batch_size]
        Returns:
            loss: torch scaler
        '''
        logits = self.forward(seqs, e1s, e2s) # [batch, tgts_dim]
        return self.loss_fn(logits, tgts)


    def forward(self, seqs, e1s, e2s):
        '''
        Args:
            seqs: int tensor [padded_length, batch_size]
            e1s: int tensor [batch_size]
            e2s: int tensor [batch_size]
        Returns:
            logits: tensor []
        '''
        # Compute a tensor containing the length of each source sequence.
        seq_lengths = torch.sum(seqs != pad_id, axis=0).cpu()
        batch_size = seqs.size(1)
        batch_indices = torch.tensor([i for i in range(batch_size)])

        embeddings = self.embed(seqs)
        packed = pack_padded_sequence(embeddings, seq_lengths) # enforce_sorted=False

        rnn_out, _ = self.blstm(packed)
        unpacked, _ = pad_packed_sequence(rnn_out) # unpacked: [padded_length, batch, 2*hidden_dim]

        e1_hiddens = unpacked[e1s, batch_indices, :] # [batch, 2*hidden_dim]
        e2_hiddens = unpacked[e2s, batch_indices, :] # [batch, 2*hidden_dim]

        linear_in = torch.cat([e1_hiddens, e2_hiddens], dim=1)
        
        return self.linear(linear_in) # [batch, tgts_dim]



In [None]:
baseline_model = Baseline_BLSTM(vocab, emb_dim=300, hidden_dim=300, num_layers=1, dropout=0.2, tgts_dim=10)

In [None]:
# TODO: testing starts here, remove below later
fake_padded_length = 21
fake_batch_size = 32
fake_input = {
    'seqs': torch.randint(0, vocab.num_words, (fake_padded_length, fake_batch_size)), 
    'e1s': torch.randint(0, fake_padded_length, (fake_batch_size, )), 
    'e2s': torch.randint(0, fake_padded_length, (fake_batch_size, )),
    'tgts': torch.randint(0, 10, (fake_batch_size, )),
    }

fake_out = baseline_model(fake_input['seqs'], fake_input['e1s'], fake_input['e2s'])
fake_loss = baseline_model.compute_loss(fake_input['seqs'], fake_input['e1s'], fake_input['e2s'], fake_input['tgts'])

print(fake_loss)
# TODO: testing ends here, remove above later

Train model

In [None]:
# modified from hw 3
def train(model, data_loader, num_epochs, model_file, optimizer = None):
    if !optimizer: 
        optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                seqs, e1s, e2s, tgts = batch_data["tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(seqs, e1s, e2s, tgts)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
            
            print('total loss: ', total_loss)

    # Save the model after training         
    torch.save(model.state_dict(), model_file)


In [None]:
# hyper parameters
learning_rate = 0.0005
num_epochs = 10
model_file = 'baseline_model.pt'
optimizer = torch.optim.Adam(baseline_model.parameters(), lr = learning_rate)

train(baseline_model, data_loader, num_epochs, model_file, optimizer)


# Evaluation with Pretrained Models and OpenIE Methods

In [None]:
The man took <e1>driver</e1>'s <e2>keys</e2> until the arrival of police, thus preventing him from leaving.
Entity-Origin(e2,e1)

Another example is the hammered dulcimer, where the <e1>player</e1> holds the <e2>hammers</e2>.
Instrument-Agency(e2,e1)

The <e1>campus</e1> comprises the most noteworthy <e2>buildings</e2> including the Rector Tower, the Central Library and the University Olympic Stadium used for the 1968 Olympic Games and the 1986 soccer World Cup.
Component-Whole(e2,e1)

This <e1>generation</e1> of unregenerated <e2>vipers</e2> was still perverse, stiff-necked, and hardened in their iniquity.
Member-Collection(e2,e1)

In one of the scenes when Robert and Francessca are talking in front of his truck, a reflection of one of the movie set lights is seen on the <e1>hood</e1> of the <e2>truck</e2>.
Component-Whole(e1,e2)

In [2]:
sentence1 = "The man took driver's keys until the arrival of police, thus preventing him from leaving."
sentence2 = "Another example is the hammered dulcimer, where the player holds the hammers."
sentence3 = "The campus comprises the most noteworthy buildings including the Rector Tower, the Central Library and the University Olympic Stadium used for the 1968 Olympic Games and the 1986 soccer World Cup."
sentence4 = "This generation of unregenerated vipers was still perverse, stiff-necked, and hardened in their iniquity."
sentence5 = "In one of the scenes when Robert and Francessca are talking in front of his truck, a reflection of one of the movie set lights is seen on the hood of the truck."

### MinIE: Open Information Extraction System

In [1]:
import os
os.environ['CLASSPATH'] = './minie/minie-0.0.1-SNAPSHOT.jar'
from miniepy import *

minie = MinIE()

In [9]:
triples = [p.triple for p in minie.get_propositions(sentence1)]

print("Original text:")
print('\t{}\n'.format(sentence1))
print('\tEntities: {}, {}'.format('driver', 'keys'))
print('\tGold Relation: {}\n'.format('Instrument-Agency'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Original text:
	The man took driver's keys until the arrival of police, thus preventing him from leaving.

	Entities: driver, keys
	Gold Relation: Instrument-Agency

Extracted triples:
	('man', "took driver 's keys until", 'arrival of police thus preventing him from leaving')
	('man', "took driver 's keys until arrival of", 'police')
	('man', 'be preventing him from', 'leaving')
	('man', 'be preventing', 'him')
	('driver', 'has', 'keys')


In [10]:
triples = [p.triple for p in minie.get_propositions(sentence2)]

print("Original text:")
print('\t{}\n'.format(sentence2))
print('\tEntities: {}, {}'.format('player', 'hammers'))
print('\tGold Relation: {}\n'.format('Entity-Origin'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Original text:
	Another example is the hammered dulcimer, where the player holds the hammers.

	Entities: player, hammers
	Gold Relation: Entity-Origin

Extracted triples:
	('example', 'is', 'hammered dulcimer')
	('player', 'holds hammers', 'hammered dulcimer')


In [11]:
triples = [p.triple for p in minie.get_propositions(sentence3)]

print("Original text:")
print('\t{}\n'.format(sentence3))
print('\tEntities: {}, {}'.format('campus', 'buildings'))
print('\tGold Relation: {}\n'.format('Component-Whole'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Original text:
	The campus comprises the most noteworthy buildings including the Rector Tower, the Central Library and the University Olympic Stadium used for the 1968 Olympic Games and the 1986 soccer World Cup.

	Entities: campus, buildings
	Gold Relation: Component-Whole

Extracted triples:
	('most noteworthy buildings including Rector Tower', 'used for', '1968 Olympic Games')
	('most noteworthy buildings including Rector Tower', 'used for', '1986 soccer World Cup')
	('most noteworthy buildings including Central Library', 'used for', '1968 Olympic Games')
	('most noteworthy buildings including Central Library', 'used for', '1986 soccer World Cup')
	('most noteworthy buildings including University Olympic Stadium', 'used for', '1968 Olympic Games')
	('most noteworthy buildings including University Olympic Stadium', 'used for', '1986 soccer World Cup')


### Stanford OpenIE

In [12]:
from openie import StanfordOpenIE

In [13]:
triples = []
with StanfordOpenIE() as client:
    for triple in client.annotate(sentence1):
        triples.append(triple)
        
print("Original text:")
print('\t{}\n'.format(sentence1))
print('\tEntities: {}, {}'.format('driver', 'keys'))
print('\tGold Relation: {}\n'.format('Instrument-Agency'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-5ac9ba2e4527465f.props -preload openie
Original text:
	The man took driver's keys until the arrival of police, thus preventing him from leaving.

	Entities: driver, keys
	Gold Relation: Instrument-Agency

Extracted triples:
	{'subject': 'man', 'relation': 'took', 'object': "driver 's keys"}
	{'subject': 'man', 'relation': 'thus preventing', 'object': 'him'}
	{'subject': 'man', 'relation': 'preventing', 'object': 'him'}
	{'subject': 'driver', 'relation': 'has', 'object': 'keys'}


In [17]:
triples = []
with StanfordOpenIE() as client:
    for triple in client.annotate(sentence2):
        triples.append(triple)
        
print("Original text:")
print('\t{}\n'.format(sentence2))
print('\tEntities: {}, {}'.format('player', 'hammers'))
print('\tGold Relation: {}\n'.format('Entity-Origin'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-70814b10053c4b95.props -preload openie
Original text:
	Another example is the hammered dulcimer, where the player holds the hammers.

	Entities: player, hammers
	Gold Relation: Entity-Origin

Extracted triples:
	{'subject': 'player', 'relation': 'holds', 'object': 'hammers'}


In [18]:
triples = []
with StanfordOpenIE() as client:
    for triple in client.annotate(sentence3):
        triples.append(triple)
        
print("Original text:")
print('\t{}\n'.format(sentence3))
print('\tEntities: {}, {}'.format('campus', 'buildings'))
print('\tGold Relation: {}\n'.format('Component-Whole'))

print("Extracted triples:")
for t in triples:
    print("\t{}".format(t))

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-dc0f629111cb462a.props -preload openie
Original text:
	The campus comprises the most noteworthy buildings including the Rector Tower, the Central Library and the University Olympic Stadium used for the 1968 Olympic Games and the 1986 soccer World Cup.

	Entities: campus, buildings
	Gold Relation: Component-Whole

Extracted triples:


### Hearst Patterns

In [1]:
from hearstPatterns.hearstPatterns import HearstPatterns

In [2]:
hp = HearstPatterns(extended=True)

In [3]:
hp.find_hyponyms('I am a chair')

[]

In [23]:
hp.find_hyponyms(sentence2)

[]

### OpenNRE: Neural Relation Extraction

In [4]:
import pandas as pd
import spacy
import opennre
import re

In [5]:
nlp = spacy.load("en_core_web_lg")
model = opennre.get_model('wiki80_bertentity_softmax')

2021-04-27 16:26:35,367 - root - INFO - Loading BERT pre-trained checkpoint.


In [42]:
e1 = re.search(r'driver', sentence1).span()
e2 = re.search(r'keys', sentence1).span()
pred = model.infer({'text': sentence1, 'h': {'pos': e1}, 't': {'pos': e2}})

print("Original text:")
print('\t{}\n'.format(sentence1))
print('Entities: {}, {}'.format('driver', 'keys'))
print('Gold Relation: {}\n'.format('Instrument-Agency'))

print("Predicted Relation: {}".format(pred))

Original text:
	The man took driver's keys until the arrival of police, thus preventing him from leaving.

Entities: driver, keys
Gold Relation: Instrument-Agency

Predicted Relation: ('said to be the same as', 0.11592283844947815)


In [45]:
e1 = re.search(r'player', sentence2).span()
e2 = re.search(r'hammers', sentence2).span()
pred = model.infer({'text': sentence2, 'h': {'pos': e1}, 't': {'pos': e2}})

print("Original text:")
print('\t{}\n'.format(sentence2))
print('Entities: {}, {}'.format('player', 'hammers'))
print('Gold Relation: {}\n'.format('Entity-Origin'))

print("Predicted Relation: {}".format(pred))

Original text:
	Another example is the hammered dulcimer, where the player holds the hammers.

Entities: player, hammers
Gold Relation: Entity-Origin

Predicted Relation: ('instrument', 0.41270914673805237)


In [46]:
e1 = re.search(r'campus', sentence3).span()
e2 = re.search(r'buildings', sentence3).span()
pred = model.infer({'text': sentence3, 'h': {'pos': e1}, 't': {'pos': e2}})

print("Original text:")
print('\t{}\n'.format(sentence3))
print('Entities: {}, {}'.format('campus', 'buildings'))
print('Gold Relation: {}\n'.format('Component-Whole'))

print("Predicted Relation: {}".format(pred))

Original text:
	The campus comprises the most noteworthy buildings including the Rector Tower, the Central Library and the University Olympic Stadium used for the 1968 Olympic Games and the 1986 soccer World Cup.

Entities: campus, buildings
Gold Relation: Component-Whole

Predicted Relation: ('instance of', 0.9435627460479736)


In [None]:
relation_to_idx = {
    "Cause-Effect": 0,
    "Instrument-Agency": 1,
    "Product-Producer": 2,
    "Content-Container": 3,
    "Entity-Origin": 4,
    "Entity-Destination": 5,
    "Component-Whole": 6,
    "Member-Collection": 7,
    "Message-Topic": 8,
    "Other": 9,
}

In [7]:
model.rel2id

{'place served by transport hub': 0,
 'mountain range': 1,
 'religion': 2,
 'participating team': 3,
 'contains administrative territorial entity': 4,
 'head of government': 5,
 'country of citizenship': 6,
 'original network': 7,
 'heritage designation': 8,
 'performer': 9,
 'participant of': 10,
 'position held': 11,
 'has part': 12,
 'location of formation': 13,
 'located on terrain feature': 14,
 'architect': 15,
 'country of origin': 16,
 'publisher': 17,
 'director': 18,
 'father': 19,
 'developer': 20,
 'military branch': 21,
 'mouth of the watercourse': 22,
 'nominated for': 23,
 'movement': 24,
 'successful candidate': 25,
 'followed by': 26,
 'manufacturer': 27,
 'instance of': 28,
 'after a work by': 29,
 'member of political party': 30,
 'licensed to broadcast to': 31,
 'headquarters location': 32,
 'sibling': 33,
 'instrument': 34,
 'country': 35,
 'occupation': 36,
 'residence': 37,
 'work location': 38,
 'subsidiary': 39,
 'participant': 40,
 'operator': 41,
 'characters