In [5]:
# NOTE(review): absolute, machine-specific path. Later cells import local
# modules (spanbert, spacy_help_functions) and load "./pretrained_spanbert",
# so presumably they rely on this working directory; adjust for your checkout.
%cd /home/aditya/git/columbia_advanced_database_project_2/SpanBERT

/home/aditya/git/columbia_advanced_database_project_2/SpanBERT


In [6]:
import spacy
from spanbert import SpanBERT
from spacy_help_functions import get_entities, create_entity_pairs

In [7]:
# Two-sentence sample passage used as input for the whole walkthrough.
# Adjacent literals are concatenated by the parser; the trailing space after
# the final period is part of the text.
raw_text = (
    "Zuckerberg attended Harvard University, where he launched the Facebook "
    "social networking service from his dormitory room on February 4, 2004, "
    "with college roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, "
    "and Chris Hughes. "
    "Bill Gates stepped down as chairman of Microsoft in February 2014 and "
    "assumed a new post as technology adviser to support the newly appointed "
    "CEO Satya Nadella. "
)

In [8]:
# TODO: filter entities of interest based on target relation
# Entity-type labels passed to get_entities / create_entity_pairs below.
entities_of_interest = [
    "ORGANIZATION",
    "PERSON",
    "LOCATION",
    "CITY",
    "STATE_OR_PROVINCE",
    "COUNTRY",
]

In [9]:
# Load spacy model
# (the "en_core_web_lg" pipeline; it is applied to raw_text in a later cell)
nlp = spacy.load("en_core_web_lg") 
# Load pre-trained SpanBERT model
# (relative path, so it is resolved against the current working directory)
spanbert = SpanBERT("./pretrained_spanbert")  


100%|██████████| 213450/213450 [00:00<00:00, 7125880.03B/s]


Loading pre-trained spanBERT from ./pretrained_spanbert


In [10]:
# Apply spacy model to raw text (to split to sentences, tokenize, extract entities etc.)
# The resulting `doc` is what later cells iterate over via `doc.sents`.
doc = nlp(raw_text)  

In [13]:
# Take only the first sentence of the document for the step-by-step cells.
sentence = next(iter(doc.sents))

In [14]:
# Show which sentence the following cells operate on.
print("\n\nProcessing sentence: {}".format(sentence))



Processing entence: Zuckerberg attended Harvard University, where he launched the Facebook social networking service from his dormitory room on February 4, 2004, with college roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes.


In [15]:
# Collect the surface text of every token in the sentence, then report it.
token_texts = [tok.text for tok in sentence]
print("Tokenized sentence: {}".format(token_texts))

Tokenized sentence: ['Zuckerberg', 'attended', 'Harvard', 'University', ',', 'where', 'he', 'launched', 'the', 'Facebook', 'social', 'networking', 'service', 'from', 'his', 'dormitory', 'room', 'on', 'February', '4', ',', '2004', ',', 'with', 'college', 'roommates', 'Eduardo', 'Saverin', ',', 'Andrew', 'McCollum', ',', 'Dustin', 'Moskovitz', ',', 'and', 'Chris', 'Hughes', '.']


In [17]:
# Extract the named entities of the sentence as (text, type) tuples.
# NOTE(review): the recorded output contains a 'DATE' entity although "DATE"
# is not listed in entities_of_interest — confirm how get_entities applies
# the filter.
ents = get_entities(sentence, entities_of_interest)
ents

[('Zuckerberg', 'PERSON'),
 ('Harvard University', 'ORGANIZATION'),
 ('Facebook', 'ORGANIZATION'),
 ('February 4, 2004', 'DATE'),
 ('Eduardo Saverin', 'PERSON'),
 ('Andrew McCollum', 'PERSON'),
 ('Dustin Moskovitz', 'PERSON'),
 ('Chris Hughes', 'PERSON')]

In [18]:
# Same entity list as above, rendered as the one-line log message.
entities_message = "spaCy extracted entities: {}".format(ents)
print(entities_message)

spaCy extracted entities: [('Zuckerberg', 'PERSON'), ('Harvard University', 'ORGANIZATION'), ('Facebook', 'ORGANIZATION'), ('February 4, 2004', 'DATE'), ('Eduardo Saverin', 'PERSON'), ('Andrew McCollum', 'PERSON'), ('Dustin Moskovitz', 'PERSON'), ('Chris Hughes', 'PERSON')]


In [21]:
# Start with an empty candidate list; a later cell fills it from the pairs.
candidate_pairs = []
# Per the recorded output, each element is a 3-tuple:
#   (token list, (text, type, (start, end)), (text, type, (start, end)))
# where (start, end) appear to be token indices into the token list.
sentence_entity_pairs = create_entity_pairs(sentence, entities_of_interest)
sentence_entity_pairs

[(['Zuckerberg', 'attended', 'Harvard', 'University', ','],
  ('Zuckerberg', 'PERSON', (0, 0)),
  ('Harvard University', 'ORGANIZATION', (2, 3))),
 (['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ','],
  ('Zuckerberg', 'PERSON', (0, 0)),
  ('Facebook', 'ORGANIZATION', (9, 9))),
 (['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ',',
   '2004',
   ',',
   'with',
   'college',
   'roommates',
   'Eduardo',
   'Saverin',
   ','],
  ('Zuckerberg', 'PERSON', (0, 0)),
  ('Eduardo Saverin', 'PERSON', (26, 27))),
 (['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',

In [23]:
# Build the SpanBERT inputs: every entity pair is emitted twice, once per
# direction, because either entity may be the subject of a relation.
# The list is reset here so that re-running this cell does not append a
# second copy of every pair to the list created in an earlier cell.
candidate_pairs = []
for ep in sentence_entity_pairs:
    tokens, first_entity, second_entity = ep[0], ep[1], ep[2]
    # TODO: keep subject-object pairs of the right type for the target relation (e.g., Person:Organization for the "Work_For" relation)
    candidate_pairs.append({"tokens": tokens, "subj": first_entity, "obj": second_entity})  # e1=Subject, e2=Object
    candidate_pairs.append({"tokens": tokens, "subj": second_entity, "obj": first_entity})  # e1=Object, e2=Subject

candidate_pairs

[{'tokens': ['Zuckerberg', 'attended', 'Harvard', 'University', ','],
  'subj': ('Zuckerberg', 'PERSON', (0, 0)),
  'obj': ('Harvard University', 'ORGANIZATION', (2, 3))},
 {'tokens': ['Zuckerberg', 'attended', 'Harvard', 'University', ','],
  'subj': ('Harvard University', 'ORGANIZATION', (2, 3)),
  'obj': ('Zuckerberg', 'PERSON', (0, 0))},
 {'tokens': ['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ','],
  'subj': ('Zuckerberg', 'PERSON', (0, 0)),
  'obj': ('Facebook', 'ORGANIZATION', (9, 9))},
 {'tokens': ['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ','],
  'subj': ('Fa

In [24]:
# Prepare the candidate pairs for SpanBERT classification:
# drop every pair whose subject entity is a date or a location, then list
# what remains.
excluded_subject_types = ["DATE", "LOCATION"]
candidate_pairs = [
    pair for pair in candidate_pairs
    if pair["subj"][1] not in excluded_subject_types
]
print("Candidate entity pairs:")
for pair in candidate_pairs:
    subject_summary = pair["subj"][0:2]  # (text, type) without the span
    object_summary = pair["obj"][0:2]
    print("Subject: {}\tObject: {}".format(subject_summary, object_summary))
pair_count = len(candidate_pairs)
print("Applying SpanBERT for each of the {} candidate pairs. This should take some time...".format(pair_count))


Candidate entity pairs:
Subject: ('Zuckerberg', 'PERSON')	Object: ('Harvard University', 'ORGANIZATION')
Subject: ('Harvard University', 'ORGANIZATION')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Zuckerberg', 'PERSON')	Object: ('Facebook', 'ORGANIZATION')
Subject: ('Facebook', 'ORGANIZATION')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Zuckerberg', 'PERSON')	Object: ('Eduardo Saverin', 'PERSON')
Subject: ('Eduardo Saverin', 'PERSON')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Zuckerberg', 'PERSON')	Object: ('Andrew McCollum', 'PERSON')
Subject: ('Andrew McCollum', 'PERSON')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Zuckerberg', 'PERSON')	Object: ('Dustin Moskovitz', 'PERSON')
Subject: ('Dustin Moskovitz', 'PERSON')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Zuckerberg', 'PERSON')	Object: ('Chris Hughes', 'PERSON')
Subject: ('Chris Hughes', 'PERSON')	Object: ('Zuckerberg', 'PERSON')
Subject: ('Harvard University', 'ORGANIZATION')	Object: ('Facebook', 'ORGANIZATION')
Subject: ('Face

In [26]:
# Run SpanBERT on every candidate pair; the result is aligned with
# candidate_pairs (it is zipped with it when the relations are printed).
# NOTE(review): the per-sentence loop at the end of the notebook skips this
# call when candidate_pairs is empty; this cell has no such guard.
relation_preds = spanbert.predict(candidate_pairs)  # get predictions: list of (relation, confidence) pairs


In [28]:
# Inspect the raw predictions: one (relation label, confidence) tuple per pair.
relation_preds

[('no_relation', 0.9982001),
 ('no_relation', 0.99973303),
 ('no_relation', 0.965565),
 ('no_relation', 0.9991116),
 ('no_relation', 0.99960905),
 ('no_relation', 0.9988648),
 ('no_relation', 0.99946415),
 ('no_relation', 0.9993222),
 ('no_relation', 0.99741983),
 ('no_relation', 0.99873334),
 ('no_relation', 0.9987677),
 ('no_relation', 0.9981373),
 ('no_relation', 0.9991116),
 ('no_relation', 0.99746644),
 ('no_relation', 0.9997883),
 ('no_relation', 0.96516734),
 ('no_relation', 0.999672),
 ('no_relation', 0.94366294),
 ('no_relation', 0.9994394),
 ('per:schools_attended', 0.5659972),
 ('no_relation', 0.9994556),
 ('per:schools_attended', 0.5837169),
 ('no_relation', 0.99886674),
 ('no_relation', 0.9284509),
 ('no_relation', 0.99821824),
 ('no_relation', 0.9071047),
 ('no_relation', 0.9977081),
 ('per:employee_of', 0.8343476),
 ('no_relation', 0.99783653),
 ('per:employee_of', 0.63332117),
 ('no_relation', 0.97285414),
 ('no_relation', 0.99438155),
 ('no_relation', 0.9804151),
 ('no

In [29]:
# Re-inspect the candidate pairs that were passed to SpanBERT.
candidate_pairs

[{'tokens': ['Zuckerberg', 'attended', 'Harvard', 'University', ','],
  'subj': ('Zuckerberg', 'PERSON', (0, 0)),
  'obj': ('Harvard University', 'ORGANIZATION', (2, 3))},
 {'tokens': ['Zuckerberg', 'attended', 'Harvard', 'University', ','],
  'subj': ('Harvard University', 'ORGANIZATION', (2, 3)),
  'obj': ('Zuckerberg', 'PERSON', (0, 0))},
 {'tokens': ['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ','],
  'subj': ('Zuckerberg', 'PERSON', (0, 0)),
  'obj': ('Facebook', 'ORGANIZATION', (9, 9))},
 {'tokens': ['Zuckerberg',
   'attended',
   'Harvard',
   'University',
   ',',
   'where',
   'he',
   'launched',
   'the',
   'Facebook',
   'social',
   'networking',
   'service',
   'from',
   'his',
   'dormitory',
   'room',
   'on',
   'February',
   '4',
   ','],
  'subj': ('Fa

In [27]:
# Report each candidate pair next to the relation SpanBERT predicted for it.
print("\nExtracted relations:")
for example, (relation, confidence) in zip(candidate_pairs, relation_preds):
    subject_text = example["subj"][0]
    object_text = example["obj"][0]
    print("\tSubject: {}\tObject: {}\tRelation: {}\tConfidence: {:.2f}".format(subject_text, object_text, relation, confidence))



Extracted relations:
	Subject: Zuckerberg	Object: Harvard University	Relation: no_relation	Confidence: 1.00
	Subject: Harvard University	Object: Zuckerberg	Relation: no_relation	Confidence: 1.00
	Subject: Zuckerberg	Object: Facebook	Relation: no_relation	Confidence: 0.97
	Subject: Facebook	Object: Zuckerberg	Relation: no_relation	Confidence: 1.00
	Subject: Zuckerberg	Object: Eduardo Saverin	Relation: no_relation	Confidence: 1.00
	Subject: Eduardo Saverin	Object: Zuckerberg	Relation: no_relation	Confidence: 1.00
	Subject: Zuckerberg	Object: Andrew McCollum	Relation: no_relation	Confidence: 1.00
	Subject: Andrew McCollum	Object: Zuckerberg	Relation: no_relation	Confidence: 1.00
	Subject: Zuckerberg	Object: Dustin Moskovitz	Relation: no_relation	Confidence: 1.00
	Subject: Dustin Moskovitz	Object: Zuckerberg	Relation: no_relation	Confidence: 1.00
	Subject: Zuckerberg	Object: Chris Hughes	Relation: no_relation	Confidence: 1.00
	Subject: Chris Hughes	Object: Zuckerberg	Relation: no_relation

In [None]:
# End-to-end pipeline: for every sentence of the document, extract entities,
# build directed subject/object candidate pairs, classify them with SpanBERT
# and print the predicted relation with its confidence.
for sentence in doc.sents:
    print("\n\nProcessing sentence: {}".format(sentence))
    print("Tokenized sentence: {}".format([token.text for token in sentence]))
    ents = get_entities(sentence, entities_of_interest)
    print("spaCy extracted entities: {}".format(ents))

    # create entity pairs (each pair is added in both directions, since
    # either entity may act as the subject)
    candidate_pairs = []
    sentence_entity_pairs = create_entity_pairs(sentence, entities_of_interest)
    for ep in sentence_entity_pairs:
        # TODO: keep subject-object pairs of the right type for the target relation (e.g., Person:Organization for the "Work_For" relation)
        candidate_pairs.append({"tokens": ep[0], "subj": ep[1], "obj": ep[2]})  # e1=Subject, e2=Object
        candidate_pairs.append({"tokens": ep[0], "subj": ep[2], "obj": ep[1]})  # e1=Object, e2=Subject

    # Classify Relations for all Candidate Entity Pairs using SpanBERT
    candidate_pairs = [p for p in candidate_pairs if p["subj"][1] not in ["DATE", "LOCATION"]]  # ignore subject entities with date/location type
    print("Candidate entity pairs:")
    for p in candidate_pairs:
        print("Subject: {}\tObject: {}".format(p["subj"][0:2], p["obj"][0:2]))

    # Nothing to classify for this sentence: move on before announcing a
    # SpanBERT run that would never happen.
    if not candidate_pairs:
        continue

    print("Applying SpanBERT for each of the {} candidate pairs. This should take some time...".format(len(candidate_pairs)))
    relation_preds = spanbert.predict(candidate_pairs)  # get predictions: list of (relation, confidence) pairs

    # Print Extracted Relations
    print("\nExtracted relations:")
    for ex, pred in zip(candidate_pairs, relation_preds):
        print("\tSubject: {}\tObject: {}\tRelation: {}\tConfidence: {:.2f}".format(ex["subj"][0], ex["obj"][0], pred[0], pred[1]))

        # TODO: focus on target relations
        # '1':"per:schools_attended"
        # '2':"per:employee_of"
        # '3':"per:cities_of_residence"
        # '4':"org:top_members/employees"

