In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle
from pathlib import Path
import time, os, csv, srsly, collections
import spacy
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
import re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /Users/ash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ash/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# creating a folder to hold the data files, model results, output etc.
model_dir_name = "custom_entity_extractor_spacy_change_dir"
model_dir_parent = "/Users/ash/Desktop"
output_dir = os.path.join(model_dir_parent, model_dir_name)
os.makedirs(output_dir, exist_ok=True)
print("created", output_dir)

created /Users/ash/Desktop/custom_entity_extractor_spacy_change_dir


In [3]:
label = ["change_direction", "base"]

In [4]:
def prep_data(labels):
    file_name_answers = "main_3_per_cluster_download.66081fcf-3ef5-48ea-97e0-49298d29b477"
    file_path_answers = "/Users/ash/Desktop/" + file_name_answers + ".jsonl"
    data = srsly.read_jsonl(file_path_answers)
    final_sent = []

    for entry in data:
        if "text" in entry:
            text = entry["text"]
        label_arr = []
        label_tup = ()
        if entry['answer'] == "accept":
            if entry['_session_id'] == "main_3_per_cluster-Kameron":
                for relation in entry['spans']:
                    if ("label" in relation) and ("start" in relation) and ("end" in relation):
                        child_span_start = relation["start"]
                        child_span_end = relation["end"]
                        word = text[child_span_start:child_span_end]
                        if relation["label"] in labels:
                            tmp_tuple = (child_span_start, child_span_end, relation["label"])
                            label_arr.append(tmp_tuple)
                if len(label_arr) > 0:
                    label_tup = (text, {"entities": label_arr})
                    final_sent.append(label_tup)
                else:
                    pass
                    #print(text)
    return final_sent
                    
final_sent = prep_data(label)
training_data_index = int(len(final_sent)*0.7)

with open(os.path.join(output_dir, 'training_data.txt'), 'wb') as file:
    pickle.dump(final_sent[:training_data_index], file)
    
with open(os.path.join(output_dir, 'testing_data.txt'), 'wb') as file:
    pickle.dump(final_sent[training_data_index:], file)

In [5]:
with open (os.path.join(output_dir, 'training_data.txt'), 'rb') as file:
    TRAIN_DATA = pickle.load(file)

with open (os.path.join(output_dir, 'testing_data.txt'), 'rb') as file:
    TEST_DATA = pickle.load(file)

In [6]:
TRAIN_DATA

[('These risks are crop insurance, health care, wildfire suppression, hurricane-related disaster relief, and federal facility flood risk, all of which are anticipated to cost billions of dollars more by the end of the century due to the impacts of climate change.23 IMPLIED_BASE IMPLIED_BASE',
  {'entities': [(21, 30, 'base'),
    (32, 43, 'base'),
    (45, 65, 'base'),
    (85, 100, 'base'),
    (123, 128, 'base'),
    (192, 196, 'change_direction'),
    (245, 262, 'base')]}),
 ('The IPCC reported that Alexandria’s beaches would be submerged even with a 0.5-metre sea-level rise, while 8 million people would be displaced by flooding in Alexandria and the Nile Delta if no protective measures are taken. IMPLIED_BASE IMPLIED_BASE',
  {'entities': [(36, 43, 'base'),
    (85, 88, 'base'),
    (95, 99, 'change_direction'),
    (117, 123, 'base'),
    (133, 142, 'change_direction'),
    (146, 154, 'base')]}),
 ('(Later, a severe mid-summer Arctic heat wave contributed to historic melting of th

In [7]:
TEST_DATA

[('Around the wine-growing world, smart producers have contemplated and experimented with adaptations, not only to hotter summers, but also to warmer winters, droughts and the sort of unexpected, sometimes violent events that stem from climate change: freak hailstorms, spring frosts, flooding and forest fires, just to name a few. IMPLIED_BASE IMPLIED_BASE',
  {'entities': [(112, 118, 'change_direction'),
    (119, 126, 'base'),
    (140, 146, 'change_direction'),
    (147, 154, 'base'),
    (156, 164, 'base'),
    (233, 247, 'base'),
    (255, 265, 'base'),
    (274, 280, 'base'),
    (282, 290, 'base'),
    (295, 307, 'base')]}),
 ('“Extreme heat, powerful storms and floods, year-round wildfires, droughts, and other climate-related events have already caused thousands of deaths and displaced tens of thousands of people in the U.S. from their homes, with significant personal loss and mental health impacts especially for first responders and children,” the letter warns. IMPLIED_BASE IMP

# Finding POS Tags patterns

In [8]:
def get_ent_from_sent(x, label):
    ent = []
    sent = x[0].replace('IMPLIED_BASE', '')

    for y in x[1]['entities']:
        if y[2] == label:
            start_index_base = y[0]
            end_index_base = y[1]
            word_base = sent[start_index_base:end_index_base]
            ent.append(word_base)
    return ent

In [9]:
def get_insights(data, label):
    pos_patterns = []
    freq = {}
    count = 0
    for x in data:
        words = []
        sent = x[0].replace('IMPLIED_BASE', '')
        for e in x[1]['entities']:
            if e[2] == label:
                start_index = e[0]
                end_index = e[1]
                word = sent[start_index:end_index]
                words.append(word)
        doc = nlp(sent)
        
        for t in doc:
            for w in words:
                if len(w.split(" ")) > 0:
                    for w1 in w.split(" "):
                        if t.text == w1 and t.pos_ in ["VERB", "ADV", "ADJ"]:
                            #print(t.text, t.pos_, t.dep_, t.head.text, t.head.pos_, [child for child in t.children])
                            if [t.pos_, t.dep_, t.head.pos_] not in pos_patterns:
                                pos_patterns.append([t.pos_, t.dep_, t.head.pos_])
                            if (t.pos_, t.dep_, t.head.pos_) in freq:
                                freq[(t.pos_, t.dep_, t.head.pos_)] += 1
                            else:
                                freq[(t.pos_, t.dep_, t.head.pos_)] = 1
                else:
                    if t.text == w and t.pos_ in ["VERB", "ADV", "ADJ"]:
                        #print(t.text, t.pos_, t.dep_, t.head.text, t.head.pos_, [child for child in t.children])
                        if [t.pos_, t.dep_, t.head.pos_] not in pos_patterns:
                            pos_patterns.append([t.pos_, t.dep_, t.head.pos_])
                        if (t.pos_, t.dep_, t.head.pos_) in freq:
                            freq[(t.pos_, t.dep_, t.head.pos_)] += 1
                        else:
                            freq[(t.pos_, t.dep_, t.head.pos_)] = 1
    
    # deleting keys where value is <=1
    freq = {k: v for k, v in freq.items() if v > 1}
    
    # sorting the dict
    freq = {k: v for k, v in sorted(freq.items(), key=lambda v: v[1], reverse=True)}
    return pos_patterns, freq

In [10]:
insight_label = "change_direction"
print(get_insights(TRAIN_DATA, insight_label)[1])
print("------")
print(get_insights(TEST_DATA, insight_label)[1])
print()

pos_p = get_insights(TEST_DATA, insight_label)[0]
#for x in get_insights(TEST_DATA)[0]:
#    if x not in get_insights(TRAIN_DATA)[0]:
#        print(x)

{('ADJ', 'amod', 'NOUN'): 62, ('VERB', 'ROOT', 'VERB'): 40, ('VERB', 'amod', 'NOUN'): 30, ('ADV', 'advmod', 'ADJ'): 26, ('VERB', 'ccomp', 'VERB'): 20, ('VERB', 'advcl', 'VERB'): 18, ('VERB', 'relcl', 'NOUN'): 18, ('VERB', 'conj', 'VERB'): 18, ('VERB', 'xcomp', 'VERB'): 12, ('ADJ', 'acomp', 'AUX'): 11, ('VERB', 'acl', 'NOUN'): 9, ('VERB', 'pcomp', 'ADP'): 8, ('ADV', 'advmod', 'VERB'): 7, ('ADJ', 'conj', 'ADJ'): 5, ('ADJ', 'ccomp', 'VERB'): 5, ('ADV', 'dobj', 'VERB'): 4, ('ADJ', 'conj', 'ADV'): 4, ('ADV', 'amod', 'NOUN'): 4, ('VERB', 'relcl', 'PROPN'): 3, ('VERB', 'ccomp', 'ADJ'): 3, ('ADV', 'advmod', 'ADV'): 3, ('ADV', 'ccomp', 'VERB'): 2, ('VERB', 'conj', 'NOUN'): 2, ('VERB', 'ccomp', 'AUX'): 2, ('VERB', 'pobj', 'ADP'): 2, ('ADJ', 'acomp', 'VERB'): 2}
------
{('VERB', 'ROOT', 'VERB'): 22, ('ADJ', 'amod', 'NOUN'): 17, ('VERB', 'ccomp', 'VERB'): 10, ('VERB', 'amod', 'NOUN'): 9, ('VERB', 'conj', 'VERB'): 8, ('ADV', 'advmod', 'ADJ'): 8, ('VERB', 'relcl', 'NOUN'): 8, ('VERB', 'xcomp', 'VERB

In [11]:
pos_p

[['ADJ', 'amod', 'NOUN'],
 ['VERB', 'conj', 'VERB'],
 ['VERB', 'ROOT', 'VERB'],
 ['VERB', 'amod', 'NOUN'],
 ['VERB', 'xcomp', 'VERB'],
 ['ADV', 'advmod', 'ADV'],
 ['ADV', 'advmod', 'VERB'],
 ['VERB', 'ccomp', 'VERB'],
 ['ADV', 'advmod', 'ADJ'],
 ['ADJ', 'conj', 'ADJ'],
 ['ADV', 'conj', 'ADJ'],
 ['VERB', 'conj', 'ADJ'],
 ['VERB', 'relcl', 'PROPN'],
 ['VERB', 'acl', 'PROPN'],
 ['VERB', 'pcomp', 'ADP'],
 ['VERB', 'relcl', 'NOUN'],
 ['ADJ', 'acomp', 'VERB'],
 ['VERB', 'acl', 'NOUN'],
 ['VERB', 'advcl', 'VERB'],
 ['ADJ', 'amod', 'ADV'],
 ['ADV', 'neg', 'ADV'],
 ['ADV', 'conj', 'NOUN'],
 ['VERB', 'ccomp', 'NOUN'],
 ['VERB', 'csubj', 'AUX'],
 ['ADJ', 'ccomp', 'VERB'],
 ['VERB', 'compound', 'NOUN'],
 ['VERB', 'amod', 'SPACE'],
 ['ADJ', 'dobj', 'VERB'],
 ['ADV', 'advmod', 'AUX']]

In [12]:
ans_dict_pos = {}
for x in TEST_DATA:
    sent = x[0].replace('IMPLIED_BASE', '')
    doc = nlp(sent)
    tmp = []
    
    for t in doc:
        head = t.head.text
        for v in pos_p:
            if [t.pos_, t.dep_, t.head.pos_] == v:
                #if t.text not in get_ent_from_sent(x, "base") and t.text.isalpha():
                tmp.append(t.text)

    ans_dict_pos[sent] = tmp
print(ans_dict_pos)

{'Around the wine-growing world, smart producers have contemplated and experimented with adaptations, not only to hotter summers, but also to warmer winters, droughts and the sort of unexpected, sometimes violent events that stem from climate change: freak hailstorms, spring frosts, flooding and forest fires, just to name a few.  ': ['growing', 'smart', 'contemplated', 'experimented', 'hotter', 'warmer', 'unexpected', 'sometimes', 'violent', 'stem', 'just', 'name', 'few'], '“Extreme heat, powerful storms and floods, year-round wildfires, droughts, and other climate-related events have already caused thousands of deaths and displaced tens of thousands of people in the U.S. from their homes, with significant personal loss and mental health impacts especially for first responders and children,” the letter warns.  ': ['Extreme', 'powerful', 'other', 'related', 'already', 'caused', 'displaced', 'significant', 'personal', 'mental', 'first', 'warns'], 'Historical warming has likely increased 

In [13]:
"""
s = "Antarctic glaciers have been melting at an accelerating pace over the past four decades thanks to an influx of warm ocean water — a startling new finding that researchers say could mean sea levels are poised to rise more quickly than predicted in coming decades. IMPLIED_BASE IMPLIED_BASE"
words = ['Antarctic', 'melting', 'accelerating', 'past', 'warm', 'startling', 'new', 'say', 'mean', 'poised', 'rise', 'more', 'quickly', 'predicted', 'coming']

full_word = []

for w in range(0, len(words)):
    for v in range(w, len(words)):
        new_word = " ".join(words[w:v])
        if new_word in s and len(new_word.split(" ")) > 1:
            full_word.append(new_word)  

print(full_word)
"""


'\ns = "Antarctic glaciers have been melting at an accelerating pace over the past four decades thanks to an influx of warm ocean water — a startling new finding that researchers say could mean sea levels are poised to rise more quickly than predicted in coming decades. IMPLIED_BASE IMPLIED_BASE"\nwords = [\'Antarctic\', \'melting\', \'accelerating\', \'past\', \'warm\', \'startling\', \'new\', \'say\', \'mean\', \'poised\', \'rise\', \'more\', \'quickly\', \'predicted\', \'coming\']\n\nfull_word = []\n\nfor w in range(0, len(words)):\n    for v in range(w, len(words)):\n        new_word = " ".join(words[w:v])\n        if new_word in s and len(new_word.split(" ")) > 1:\n            full_word.append(new_word)  \n\nprint(full_word)\n'

# Calculate results

In [14]:
def del_from_list(del_arr, main_arr):
    for x in del_arr:
        if x in main_arr:
            main_arr.remove(x)
    return main_arr

def calc_res(user_ans, answers):    
    final_res = []
    grade_ent_res = {}
    tmp_del = []
    user = "machine"

    true_positive = 0
    false_negative = 0
    false_positive = 0
    complete = 0
    ent = "change_direction"
    grade_ent_res[user] = {}
    if ent not in grade_ent_res[user]:
        grade_ent_res[user][ent] = {"tp": 0, "fp": 0, "fn": 0, "complete": 0}

    if answers == user_ans:
        # 1. when both arrays are exactly the same
        for ans in user_ans:
            tmp_del.append(ans)
            true_positive += 1
            complete += 1
        answers = del_from_list(tmp_del, answers)
        user_ans = del_from_list(tmp_del, user_ans)
        tmp_del = []
    else:
        # 2. when some elements are equal in both user and actual ans
        for ans in user_ans:
            if ans in answers:
                tmp_del.append(ans)
                true_positive += 1
                complete += 1
        answers = del_from_list(tmp_del, answers)
        user_ans = del_from_list(tmp_del, user_ans)
        tmp_del = []

        # 3. when user ans is partially correct
        # - if user ans is contained in actual ans
        if user_ans:
            for ans in user_ans:
                substring_user = [string for string in answers if ans in string]
                if substring_user:
                    tmp_del.append(ans)
                    true_positive += 1
                    false_positive += 1
        answers = del_from_list(tmp_del, answers)
        user_ans = del_from_list(tmp_del, user_ans)
        tmp_del = []

        # - if actual ans is contained in user ans
        if answers:
            for ans in answers:
                substring_ans = [string for string in user_ans if ans in string]
                if substring_ans:
                    tmp_del.append(ans)
                    true_positive += 1
                    false_positive += 1
        answers = del_from_list(tmp_del, answers)
        user_ans = del_from_list(tmp_del, user_ans)
        tmp_del = []

        # 4. when actual ans is missing from user ans
        if answers and user_ans:
            set_difference_actual = set(answers) - set(user_ans)
            for ans in list(set_difference_actual):
                tmp_del.append(ans)
                false_negative += 1
        answers = del_from_list(tmp_del, answers)
        tmp_del = []

        # 5. when user ans is missing from actual ans
        if answers and user_ans:
            set_difference_user = set(user_ans) - set(answers)
            for ans in list(set_difference_user):
                tmp_del.append(ans)
                false_positive += 1
        user_ans = del_from_list(tmp_del, user_ans)
        tmp_del = []

    #print("tp: {}, fp: {}, fn: {}".format(true_positive, false_positive, false_negative))
    grade_ent_res[user][ent]["tp"] += true_positive
    grade_ent_res[user][ent]["fp"] += false_positive
    grade_ent_res[user][ent]["fn"] += false_negative
    grade_ent_res[user][ent]["complete"] += complete

    return grade_ent_res

In [15]:
label_check = "change_direction"
result = os.path.join(output_dir, "spacy_result_change_dir.csv")
true_pos = 0
all_cd = 0
correct_cd = 0
false_pos = 0
false_neg = 0

headers = ["text", "predicted_ent", "actual_ent"]
final_res = []
final_res.append(headers)
for sent in TEST_DATA:
    #res = ans_dict_pos_cd[sent[0]]
    res = ans_dict_pos[sent[0].replace('IMPLIED_BASE', '')]
    actual_ent = []
    for tokens in sent[1]['entities']:
        if tokens[2] == label_check:
            word = sent[0][tokens[0]:tokens[1]]
            if len(word.split(" ")) > 0:
                actual_ent.extend(word.split(" "))
            else:
                actual_ent.append(word)
    final_res.append([sent[0], res, actual_ent])
    
with open(result, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(final_res)

print("created result file", result)

created result file /Users/ash/Desktop/custom_entity_extractor_spacy_change_dir/spacy_result_change_dir.csv


In [16]:
true_pos = 0
all_ents = 0
correct_ents = 0
false_pos = 0
false_neg = 0

for sent in TEST_DATA:
    #res = ans_dict_pos_cd[sent[0]]
    res = ans_dict_pos[sent[0].replace('IMPLIED_BASE', '')]
    actual_ent = []
    for tokens in sent[1]['entities']:
        if tokens[2] == label_check:
            word = sent[0][tokens[0]:tokens[1]]
            if len(word.split(" ")) > 0:
                actual_ent.extend(word.split(" "))
            else:
                actual_ent.append(word)
    all_ents += len(actual_ent)
    res_dict = calc_res(res, actual_ent)
    true_pos += res_dict["machine"]['change_direction']['tp']
    false_pos += res_dict["machine"]['change_direction']['fp']
    false_neg += res_dict["machine"]['change_direction']['fn']
    correct_ents += res_dict["machine"]['change_direction']['complete']
    
acc = ((correct_ents*100)/all_ents)
print("False pos: ", false_pos)
print("False neg: ", false_neg)
print("Marked {} out of {} cd correctly. Accuracy: {}".format(correct_ents, all_ents, acc))

False pos:  7
False neg:  45
Marked 125 out of 175 cd correctly. Accuracy: 71.42857142857143
