In [1]:
import srsly
import numpy as np
import spacy
import pandas as pd

!pip install benepar
#!python -m spacy download en_core_web_md
#!python -m spacy download en
# Load the spaCy English pipeline registered under the shortcut name 'en'.
# NOTE(review): shortcut names such as 'en' presumably only resolve on
# spaCy 2.x installs - confirm before upgrading spaCy.
nlp = spacy.load('en')

import benepar
# Fetch the benepar_en3 parser model (reported as already up to date in the
# captured output when it is present locally).
benepar.download('benepar_en3')
# Attach the benepar component to the pipeline. The registration call depends
# on the spaCy major version: a component instance on 2.x, a registered
# factory name plus config otherwise.
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

!pip install node2vec
#!pip install xgboost

!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

!pip install stellargraph[demos]

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.2 MB/s  eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125937 sha256=c3e793ae5a17572605874772f339a2824abe5caaf682d00e233b9eac5f9f7b5a
  Stored in directory: /home/azureuser/.cache/pip/wheels/5e/6f/8c/d88aec621f3f542d26fac0342bef5e693335d125f4e54aeffe
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.2.0
    Uninstalling sentence-transformers-2.2.0:
      Successfully uninstalled sentence-transformers-2.2.0
Successfully installed sentence-transformers-2.2.2


[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [2]:
# Relation labels kept when reading the annotation export (argument of prep_rel_data).
rel_label = ["Contributes_To"]
# Span labels kept when collecting entity mentions (argument of prep_ent_data).
ents = ["base", "change_direction", "predicate", "aspect_changing"]

In [3]:
def prep_rel_data(rel_labels, file_path="checkin_answers.jsonl",
                  session_id="main_3_per_cluster-Kameron"):
    """Collect accepted relation annotations from a JSONL annotation export.

    Parameters
    ----------
    rel_labels : container of str
        Relation labels to keep; relations with any other label are ignored.
    file_path : str, optional
        JSONL file to read. Defaults to the export this notebook was written
        against.
    session_id : str, optional
        Only records whose ``"_session_id"`` equals this value are used.

    Returns
    -------
    list of tuple
        One ``(text, {"entities": [...]})`` pair per record that yielded at
        least one matching relation. Every item in the ``"entities"`` list is
        a dict with the keys ``"head"``, ``"child"`` (substrings of ``text``
        cut out with the annotated character offsets) and ``"label"``.

    Notes
    -----
    Records without a ``"text"`` key are skipped. Previously such a record
    was paired with the text of whichever record had been read before it.
    Records lacking ``"answer"`` or ``"relations"`` are treated as "not
    accepted" / "no relations" instead of raising ``KeyError``.
    """
    required_keys = ("label", "head_span", "child_span")
    final_rel_sent = []

    for entry in srsly.read_jsonl(file_path):
        text = entry.get("text")
        if text is None:
            # Offsets cannot be resolved without the text they index into.
            continue
        if entry.get("answer") != "accept" or entry.get("_session_id") != session_id:
            continue

        label_arr = []
        for relation in entry.get("relations") or []:
            if not all(key in relation for key in required_keys):
                continue
            if relation["label"] not in rel_labels:
                continue
            head_span = relation["head_span"]
            child_span = relation["child_span"]
            label_arr.append({
                "head": text[head_span["start"]:head_span["end"]],
                "child": text[child_span["start"]:child_span["end"]],
                "label": relation["label"],
            })

        # Records without any matching relation contribute nothing.
        if label_arr:
            final_rel_sent.append((text, {"entities": label_arr}))

    return final_rel_sent
                    
# Load the accepted relations for the labels in rel_label once; functions and
# cells further down read this module-level list directly.
final_rel_sent = prep_rel_data(rel_label)
#print(final_rel_sent)

In [4]:
def prep_ent_data(ents, file_path="checkin_answers.jsonl",
                  session_id="main_3_per_cluster-Kameron"):
    """Collect accepted span annotations per sentence from a JSONL export.

    Parameters
    ----------
    ents : container of str
        Span labels to keep; spans with any other label are ignored.
    file_path : str, optional
        JSONL file to read. Defaults to the export this notebook was written
        against.
    session_id : str, optional
        Only records whose ``"_session_id"`` equals this value are used.

    Returns
    -------
    dict
        Maps each record's text to a dict with exactly the keys ``"base"``,
        ``"change_direction"``, ``"predicate"`` and ``"aspect_changing"``,
        each holding the list of annotated substrings carrying that label.
        If the same text occurs in several records, the last one wins.

    Notes
    -----
    Records without a ``"text"`` key are skipped. Previously such a record
    overwrote the entry of whichever record had been read before it.
    Records lacking ``"answer"`` or ``"spans"`` are treated as "not
    accepted" / "no spans" instead of raising ``KeyError``.
    """
    # The result always exposes these four buckets, whatever `ents` holds.
    known_labels = ("base", "change_direction", "predicate", "aspect_changing")
    required_keys = ("label", "start", "end")
    label_dict = {}

    for entry in srsly.read_jsonl(file_path):
        text = entry.get("text")
        if text is None:
            # Offsets cannot be resolved without the text they index into.
            continue
        if entry.get("answer") != "accept" or entry.get("_session_id") != session_id:
            continue

        spans_by_label = {label: [] for label in known_labels}
        for span in entry.get("spans") or []:
            if not all(key in span for key in required_keys):
                continue
            label = span["label"]
            if label in ents and label in spans_by_label:
                spans_by_label[label].append(text[span["start"]:span["end"]])

        label_dict[text] = spans_by_label

    return label_dict

# Per-sentence entity mentions for the labels in ents. In the visible part of
# this notebook the result is only referenced from commented-out experiments.
final_sent_ent = prep_ent_data(ents)
#print(final_sent_ent)

In [5]:
def get_direction_counter():
    """Tally relation directions over the module-level ``final_rel_sent``.

    For every relation whose head and child are both explicit (neither is
    the ``"IMPLIED_BASE"`` placeholder), the first occurrence of the head
    text and of the child text are located in the sentence with
    ``str.find``. A head found strictly before the child is counted under
    ``"right_to_left"``; every other case under ``"left_to_right"``.

    NOTE(review): this mapping is the one the notebook uses throughout
    (the training-table cell encodes labels the same way); confirm it is
    the intended naming.

    Returns
    -------
    dict
        ``{"left_to_right": int, "right_to_left": int}``
    """
    tally = {"left_to_right": 0, "right_to_left": 0}
    for record in final_rel_sent:
        sentence = record[0]
        for relation in record[1]['entities']:
            head_text = relation['head']
            child_text = relation['child']
            if "IMPLIED_BASE" in (head_text, child_text):
                continue
            head_pos = sentence.find(head_text)
            child_pos = sentence.find(child_text)
            bucket = "right_to_left" if head_pos < child_pos else "left_to_right"
            tally[bucket] += 1
    return tally

def get_num_unique_sent():
    """Return the number of distinct sentence texts in ``final_rel_sent``.

    Reads the module-level ``final_rel_sent`` list and looks only at the
    first element (the text) of each record.
    """
    return len({record[0] for record in final_rel_sent})

In [6]:
# Report the direction tally and the number of distinct annotated sentences.
print(get_direction_counter())
print(get_num_unique_sent())

{'left_to_right': 246, 'right_to_left': 447}
305


In [7]:
from nltk.tree import Tree
import networkx as nx
from collections import defaultdict
import itertools

def convertToGraph(edges):
    """Build an undirected networkx graph from a parse tree.

    Despite the parameter name, ``edges`` is passed straight to
    ``tree_to_edges``, so it is expected to be the tree itself. Each node
    tuple returned by ``tree_to_edges`` is connected pairwise; the
    self-loops that the pairwise product creates are removed afterwards.

    Returns
    -------
    networkx.Graph
    """
    graph = nx.Graph()
    for pair in tree_to_edges(edges):
        # product(pair, repeat=2) also yields (a, a) and (b, b); those
        # self-loops are stripped again below.
        graph.add_edges_from(itertools.product(pair, repeat=2))
    graph.remove_edges_from(nx.selfloop_edges(graph))
    return graph
    


def get_edges(tree, i):
    """Return the ``(parent, child)`` name pairs for one tree node.

    The parent is named ``"<label><i>"``. Child subtrees of height 2 are
    named ``"<label> <first leaf>"``; taller child subtrees are named
    ``"<label><i+1>"``. Children that are not ``Tree`` instances are
    ignored. Height-2 children are listed before the taller ones.
    """
    parent_name = f"{tree.label()}{i}"
    subtrees = [node for node in tree if isinstance(node, Tree)]
    word_level = [
        f"{node.label()} {node.leaves()[0]}" for node in subtrees if node.height() == 2
    ]
    phrase_level = [f"{node.label()}{i+1}" for node in subtrees if node.height() > 2]
    return [(parent_name, child_name) for child_name in word_level + phrase_level]

def tree_to_edges(tree):
    """Walk a parse tree breadth-first and return all ``(parent, child)`` pairs.

    Every node taller than a word-level (height 2) subtree is expanded with
    ``get_edges(node, depth)``, where ``depth`` is the node's distance from
    the root.

    ``get_edges`` names a node ``"<label><i>"`` and its tall children
    ``"<label><i+1>"``, so the number passed for a child must be exactly one
    more than its parent's. The previous implementation passed a counter
    that grew with every node popped from the queue. That only matched for
    the first child of the root; any other phrase was linked from its parent
    under one name (e.g. ``VP1``) but expanded under another (``VP2``),
    leaving its subtree disconnected. The depth is now carried along with
    each queued node.

    Nodes sharing a label at the same depth still share a name, because the
    names are built from label and depth only.

    Returns
    -------
    list of tuple of str
    """
    from collections import deque

    rv = []
    to_check = deque([(tree, 0)])
    while to_check:
        subtree, depth = to_check.popleft()
        rv.extend(get_edges(subtree, depth))
        to_check.extend(
            (child, depth + 1)
            for child in subtree
            if isinstance(child, Tree) and child.height() > 2
        )
    return rv

def generate_graph(tmp_ans):
    """Parse a bracketed tree string and convert it into a graph.

    ``tmp_ans`` is read with ``Tree.fromstring`` and the resulting tree is
    handed to ``convertToGraph``, whose return value is passed through.
    """
    return convertToGraph(Tree.fromstring(tmp_ans))

"""
tmp_sent = 'As freshwater lakes get hotter in the summer that leads to more amoebae in the water and increased human risk said Sonia Altizer an associate professor of ecology at the University of Georgia'
tmp_doc = nlp(tmp_sent)        
tmp_ans = list(tmp_doc.sents)[0]
parse = tmp_ans._.parse_string
final_graph = generate_graph(parse)
print(final_graph)
"""

"\ntmp_sent = 'As freshwater lakes get hotter in the summer that leads to more amoebae in the water and increased human risk said Sonia Altizer an associate professor of ecology at the University of Georgia'\ntmp_doc = nlp(tmp_sent)        \ntmp_ans = list(tmp_doc.sents)[0]\nparse = tmp_ans._.parse_string\nfinal_graph = generate_graph(parse)\nprint(final_graph)\n"

In [8]:
from node2vec import Node2Vec
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

def generate_embeddings(final_graph):
    """Embed the nodes of ``final_graph`` with node2vec.

    Uses 64 dimensions, walks of length 30, 200 walks per node and 4
    workers, then fits with ``window=10, min_count=1, batch_words=4``.

    Returns the fitted model's ``wv.vectors``. The node keys are not
    returned, so callers cannot tell from the result alone which row
    belongs to which node.
    """
    walker = Node2Vec(final_graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
    return walker.fit(window=10, min_count=1, batch_words=4).wv.vectors

def generate_embeddings_new(final_graph):
    """Embed graph nodes with StellarGraph biased random walks + Word2Vec.

    The networkx graph is wrapped in a ``StellarGraph``; 10 biased walks of
    at most 100 steps are started from every node, and a skip-gram Word2Vec
    model (128 dimensions, window 5, a single epoch) is trained on the
    walks.

    Returns
    -------
    The trained model's ``wv.vectors`` (one row per vocabulary entry).
    NOTE(review): rows presumably follow ``model.wv.index_to_key``, which is
    not returned - confirm before matching rows to nodes.
    """
    stellar_graph = StellarGraph.from_networkx(final_graph)
    walker = BiasedRandomWalk(stellar_graph)

    walks = walker.run(
        nodes=list(stellar_graph.nodes()),  # every node is a walk root
        length=100,  # maximum length of a random walk
        n=10,  # number of random walks per root node
        p=0.5,  # (unnormalised) probability 1/p of returning to the source node
        q=2.0,  # (unnormalised) probability 1/q of moving away from the source node
    )

    # Node ids become the string "words" of the Word2Vec corpus.
    sentences = [list(map(str, walk)) for walk in walks]
    skipgram = Word2Vec(sentences, vector_size=128, window=5, min_count=0, sg=1, workers=2, epochs=1)

    return skipgram.wv.vectors

"""
# Create a graph
graph = nx.fast_gnp_random_graph(n=100, p=0.5)

# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs

# Embed nodes
#model = node2vec.fit(window=10, min_count=1, batch_words=2)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

# Look for most similar nodes
model.wv.most_similar('2')  # Output node names are always strings

# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)

# Embed edges using Hadamard method
from node2vec.edges import HadamardEmbedder

edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

# Look for embeddings on the fly - here we pass normal tuples
edges_embs[('1', '2')]
''' OUTPUT
array([ 5.75068220e-03, -1.10937878e-02,  3.76693785e-01,  2.69105062e-02,
       ... ... ....
       ..................................................................],
      dtype=float32)
'''

# Get all edges in a separate KeyedVectors instance - use with caution could be huge for big networks
edges_kv = edges_embs.as_keyed_vectors()

# Look for most similar edges - this time tuples must be sorted and as str
edges_kv.most_similar(str(('1', '2')))

# Save embeddings for later use
edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
"""

"\n# Create a graph\ngraph = nx.fast_gnp_random_graph(n=100, p=0.5)\n\n# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**\nnode2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs\n\n# Embed nodes\n#model = node2vec.fit(window=10, min_count=1, batch_words=2)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)\n\n# Look for most similar nodes\nmodel.wv.most_similar('2')  # Output node names are always strings\n\n# Save embeddings for later use\nmodel.wv.save_word2vec_format(EMBEDDING_FILENAME)\n\n# Save model for later use\nmodel.save(EMBEDDING_MODEL_FILENAME)\n\n# Embed edges using Hadamard method\nfrom node2vec.edges import HadamardEmbedder\n\nedges_embs = HadamardEmbedder(keyed_vectors=model.wv)\n\n# Look for embeddings on the fly - here we pass normal tuples\nedges_embs[('1', '2')]\n''' OUTPUT\

In [2]:
def get_dep_tree(new_sent):
    """Parse ``new_sent`` and return ``{new_sent: parse_string}``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``_.parse_string`` extension of the FIRST detected sentence is used;
    any further sentences in the text are ignored. An input with no
    sentences raises ``IndexError``.
    """
    first_sentence = list(nlp(new_sent).sents)[0]
    return {new_sent: first_sentence._.parse_string}

In [7]:
# Smoke test: parse one example sentence and print its bracketing with square
# brackets in place of the parentheses used in the parse string.
tmp_sent = 'As freshwater lakes get hotter in the summer that leads to more amoebae in the water and increased human risk said Sonia Altizer an associate professor of ecology at the University of Georgia'
res = get_dep_tree(tmp_sent)[tmp_sent]
print(res.replace("(", "[").replace(")", "]"))

[S [SBAR [IN As] [S [NP [NN freshwater] [NNS lakes]] [VP [VBP get] [ADJP [JJR hotter]] [PP [IN in] [NP [DT the] [NN summer]]]]]] [NP [WDT that]] [VP [VBZ leads] [PP [PP [IN to] [NP [NP [NP [JJR more] [NNS amoebae]] [PP [IN in] [NP [DT the] [NN water]]]] [CC and] [NP [VBN increased] [JJ human] [NN risk]]]] [PP [VP [VBD said]] [NP [NP [NNP Sonia] [NNP Altizer]] [NP [NP [DT an] [JJ associate] [NN professor]] [PP [IN of] [NP [NN ecology]]] [PP [IN at] [NP [NP [DT the] [NNP University]] [PP [IN of] [NP [NNP Georgia]]]]]]]]]]]


In [None]:
# Build the training table as a list of rows; the first row is the header.
# One data row is produced per annotated relation whose head and child are
# both explicit spans (relations involving "IMPLIED_BASE" are skipped).
#
# Earlier experiments, currently switched off, added further columns: the
# annotated span lists from final_sent_ent (base / change_direction /
# predicate / aspect_changing), SBERT sentence embeddings of the masked
# sentence, and graph embeddings of the parse.
csv = []
features = ["sent", "base1", "base2", "dep_tree", "left_to_right"]
csv.append(features)

# The same masked sentence can come up for several relations; parse it once.
parse_cache = {}

# enumerate() reports the real position of each record. list.index() rescans
# the list on every iteration and returns the first match for duplicates.
for position, x in enumerate(final_rel_sent):
    print(position)
    sent = x[0]
    for y in x[1]['entities']:
        if y["head"] == "IMPLIED_BASE" or y["child"] == "IMPLIED_BASE":
            continue

        # Label: 0 when the first occurrence of the head text precedes the
        # first occurrence of the child text, 1 otherwise.
        # NOTE(review): same convention as get_direction_counter, where
        # head-before-child is counted as right_to_left - confirm the naming.
        start_index = sent.find(y["head"])
        end_index = sent.find(y["child"])
        left_to_right = 0 if start_index < end_index else 1

        # Swap both argument texts for the placeholder "BASE" to see whether
        # the model learns from the grammar and not from the words.
        # str.replace substitutes every occurrence, not just the annotated one.
        new_sent = sent.replace(y["head"], "BASE").replace(y["child"], "BASE")

        if new_sent not in parse_cache:
            parse_cache[new_sent] = get_dep_tree(new_sent)[new_sent]
        dep_tree = parse_cache[new_sent]

        csv.append([new_sent, "BASE", "BASE", dep_tree, left_to_right])

print(len(csv))

In [11]:
len(csv)

694

In [12]:
# csv[0] is the header row, everything after it is data. The index starts at
# 1 so that row labels equal the rows' positions in `csv`.
df = pd.DataFrame(csv[1:], columns=csv[0], index=range(1, len(csv)))
df['left_to_right'] = df['left_to_right'].astype(int)
#df.to_csv('data.csv')

In [13]:
df.head()

Unnamed: 0,sent,base1,base2,dep_tree,left_to_right
1,"[-0.3926697, 0.29174235, 0.34444395, 0.1115106...",BASE,BASE,(S (NP (DT These) (NNS risks)) (VP (VBP are) (...,1
2,"[-0.4879627, 0.20637694, 0.35121828, 0.1713416...",BASE,BASE,(S (NP (DT These) (NNS risks)) (VP (VBP are) (...,1
3,"[-0.2728444, 0.3277352, 0.4151958, 0.036221836...",BASE,BASE,(S (NP (DT These) (NNS risks)) (VP (VBP are) (...,1
4,"[-0.2765118, 0.2526166, 0.36175925, 0.10105675...",BASE,BASE,(S (NP (DT These) (NNS risks)) (VP (VBP are) (...,1
5,"[-0.2971289, 0.28096542, 0.47723004, 0.0929106...",BASE,BASE,(S (NP (DT These) (NNS risks)) (VP (VBP are) (...,1


In [16]:
"""
square_edges = pd.DataFrame(
    {"source": df.dep_tree, "target": df.left_to_right}
)
square = StellarGraph(edges=square_edges)
print(square.info())
"""

In [None]:
# Integer-encode the two placeholder columns through pandas' categorical
# codes. The same treatment for sent, all_bases, all_cd, all_pred, dep_tree
# and all_ac was tried earlier and is switched off.
for column in ("base1", "base2"):
    df[column] = df[column].astype('category').cat.codes

# Display the table--notice how base1 and base2 are integers now.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Columns used as model inputs. Earlier runs also tried base1, base2,
# all_bases, all_cd, all_pred and all_ac.
feature_cols = ["sent", "dep_tree"]

X = df[feature_cols]
y = df["left_to_right"]

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
)
print("Training size: {}".format(len(X_train)))
print("Testing size: {}".format(len(X_test)))

In [None]:
def _flatten_column(column):
    """Return every cell of ``column`` as a flat 1-D numpy array."""
    return [np.asarray(cell).flatten() for cell in column]


def _fit_to_width(arrays, width):
    """Right-pad with 0.0, or cut, each array so it has exactly ``width`` items."""
    fitted = []
    for arr in arrays:
        missing = width - len(arr)
        if missing > 0:
            arr = np.concatenate((arr, np.zeros(missing)))
        fitted.append(arr[:width])
    return fitted


# Flatten the training cells and pad them to the longest training row.
new_list = _flatten_column(X_train["dep_tree"])
max_len = max((len(arr) for arr in new_list), default=0)
padded_list = _fit_to_width(new_list, max_len)

# Work on copies so the new column is not written into a slice of `df`.
X_train = X_train.copy()
X_test = X_test.copy()

X_train["dep_tree_flatten"] = padded_list
# The test split needs the same column, at the training width, so that the
# prediction step has features to read. Longer test rows are cut to fit.
X_test["dep_tree_flatten"] = _fit_to_width(_flatten_column(X_test["dep_tree"]), max_len)

In [None]:
# Remove the raw 'dep_tree' column from the training frame in place. Running
# this cell a second time raises, because the column is already gone.
X_train.drop('dep_tree', axis=1, inplace=True)

# Show the training frame after the drop.
X_train

In [None]:

import xgboost as xgb

# Stack the equally long per-row vectors into one (rows, features) matrix.
# NOTE(review): this only yields a numeric matrix if the 'dep_tree' cells were
# numeric vectors to begin with - confirm against the table-building cell.
new_dep_ser = pd.Series(X_train["dep_tree_flatten"])
new_dep_new = np.stack(new_dep_ser.tolist())

# Wrap features and labels in XGBoost's native container.
my_dmat = xgb.DMatrix(new_dep_new, label=y_train)

# The DMatrix is deliberately NOT written into X_train. Storing the handle in
# a DataFrame column adds no information and leaves a non-numeric column in
# the feature frame.
new_dep_new.shape

In [None]:
#np.vstack(X_train["dep_tree"])
#np.array(X_train["dep_tree_flatten"], dtype=object).reshape((1,-1))
X_train

In [None]:
# NOTE(review): at this point X_train still carries the text column 'sent' and
# the array-valued column 'dep_tree_flatten'; DMatrix presumably rejects
# object-dtype columns like these - confirm, or build the DMatrix from a
# stacked numeric matrix.
xgb.DMatrix(X_train, y_train)

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def _feature_matrix(column, width=None):
    """Stack a column of per-row vectors into a 2-D float matrix.

    Rows shorter than ``width`` are right-padded with zeros and longer rows
    are cut; ``width`` defaults to the longest row. Raises ``ValueError``
    when a cell cannot be converted to floats (e.g. it still holds a raw
    parse string instead of a numeric vector).
    """
    rows = [np.asarray(cell, dtype=float).ravel() for cell in column]
    if width is None:
        width = max((len(row) for row in rows), default=0)
    matrix = np.zeros((len(rows), width))
    for i, row in enumerate(rows):
        usable = min(len(row), width)
        matrix[i, :usable] = row[:usable]
    return matrix


# One row per sample, one column per feature. The previous call reshaped a
# DMatrix handle to a single row, which cannot line up with y_train.
train_features = _feature_matrix(X_train["dep_tree_flatten"])

# The test frame may not have the flattened column; fall back to the raw one.
test_column = "dep_tree_flatten" if "dep_tree_flatten" in X_test else "dep_tree"
test_features = _feature_matrix(X_test[test_column], width=train_features.shape[1])

# Early stopping needs a validation set to monitor. It is carved out of the
# training data so the test split stays untouched.
fit_features, val_features, fit_labels, val_labels = train_test_split(
    train_features, y_train, test_size=0.1, random_state=10
)

# Create an instance of the classifier
model = XGBClassifier(early_stopping_rounds = 10, n_estimators=300, learning_rate = 0.1, max_depth=5, use_label_encoder=False)

# Train it on the training set, monitoring the validation part.
model.fit(fit_features, fit_labels, eval_set=[(val_features, val_labels)], verbose=False)

# Use the trained model to predict the labels for the test set.
predictions = model.predict(test_features)
print(predictions)

# Calculate the F1 score.
f1 = f1_score(y_true = y_test,
              y_pred = predictions)

print('F1: {}'.format(f1))

In [None]:
from sklearn import metrics
# Confusion matrix of the held-out labels (first argument) against the
# model's predictions (second argument); shown as the cell's output.
cnf_matrix = metrics.confusion_matrix(y_test, predictions)
cnf_matrix

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions on the held-out labels.
print(accuracy_score(y_test, predictions))

In [None]:
predictions

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot

# plot feature importance of the fitted classifier with xgboost's built-in helper
plot_importance(model)
pyplot.show()

In [None]:
# Gain-based importance score per feature, as reported by the booster.
feat_gains = model.get_booster().get_score(importance_type="gain")

# Explicit figure/axes, concrete lists instead of dict views, and labels so
# the chart can be read on its own.
fig, ax = pyplot.subplots()
ax.bar(list(feat_gains.keys()), list(feat_gains.values()))
ax.set(title="Feature importance (gain)", xlabel="feature", ylabel="gain")
ax.tick_params(axis="x", rotation=90)
pyplot.show()