In [1]:
import pandas as pd
from pathlib import Path
import spacy
import dgl
import torch
import numpy as np

Using backend: pytorch


In [2]:
# Path to the Mohler CSV, relative to the notebook's working directory.
data_csv_file = Path("../Dataset/mohler/mohler_formatted.csv")

# Only the columns listed here are read; anything else in the CSV is
# dropped at load time through `usecols`.
column_to_keep = [
    'id',
    'question',
    'desired_answer',
    'student_answer',
    'score_me',
    'score_other',
    'score_avg',
]

df = pd.read_csv(
    data_csv_file,
    sep=",",
    encoding='utf8',
    usecols=column_to_keep,
)

In [3]:
# Preview the first rows to sanity-check that the expected columns were loaded.
df.head()

Unnamed: 0,id,question,desired_answer,student_answer,score_me,score_other,score_avg
0,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,High risk problems are address in the prototyp...,4.0,3.0,3.5
1,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,To simulate portions of the desired final prod...,5.0,5.0,5.0
2,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,A prototype program simulates the behaviors of...,5.0,3.0,4.0
3,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,Defined in the Specification phase a prototype...,5.0,5.0,5.0
4,1.1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,It is used to let the users have a first idea ...,3.0,3.0,3.0


In [4]:
# Load the spaCy pipeline named 'en_core_web_md'. Later cells use both its
# parse output (token.dep_ / token.head) and its vocab vector table.
nlp = spacy.load('en_core_web_md')

In [5]:
# Take the first student answer in the frame as the running example for the
# graph-construction cells that follow.
sample_text = df['student_answer'].iat[0]
sample_text

'High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.'

In [6]:
# Run the spaCy pipeline on the sample answer; the resulting Doc supplies the
# tokens, dependency labels and heads used to build the graph in later cells.
sample_doc = nlp(sample_text)
sample_doc

High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.

In [7]:
# Surface text of every token in the parsed sample, followed by the token count.
tok_text = [tok.text for tok in sample_doc]
print(tok_text, 'len -> ', len(tok_text))

['High', 'risk', 'problems', 'are', 'address', 'in', 'the', 'prototype', 'program', 'to', 'make', 'sure', 'that', 'the', 'program', 'is', 'feasible', '.', ' ', 'A', 'prototype', 'may', 'also', 'be', 'used', 'to', 'show', 'a', 'company', 'that', 'the', 'software', 'can', 'be', 'possibly', 'programmed', '.'] len ->  37


In [8]:
# Load the dependency-label vocabulary, one label per line. The position of a
# label in `dep_map` is later used as its integer edge-type id, so the list
# must contain clean label strings only.
#
# Iterating the file and stripping each line (instead of `read().split("\n")`)
# avoids a spurious '' entry when the file ends with a newline and removes a
# stray '\r' on CRLF files, either of which would break label lookups.
# The encoding is pinned so the result does not depend on the platform default.
with open('../assets/txt/dependency_tags.txt', 'r', encoding='utf-8') as f:
    dep_map = [line.strip() for line in f if line.strip()]
print(dep_map)

['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']


In [9]:
# Build the edge list and per-node data for the dependency graph of `sample_doc`.
#   * one node per token (node id == token.i)
#   * one edge head -> dependent for every token whose dependency label has a
#     spaCy glossary entry
src_nodes = []      # per edge: index of the syntactic head
dst_nodes = []      # per edge: index of the dependent token
edge_type = []      # per edge: dependency label string
edge_type_id = []   # per edge: position of that label in `dep_map`
node_token_id = []  # per node: vector-table lookup result, or -1 when the token has no vector
node_text = []      # per node: surface text

# Label -> id table built once, replacing a linear `dep_map.index(...)` scan
# for every token. Walking the enumeration in reverse makes the earliest
# position win for a repeated label, which is what `list.index` returns.
dep_to_id = {label: idx for idx, label in reversed(list(enumerate(dep_map)))}

for token in sample_doc:
    # Node data is recorded for every token, including those that get no edge.
    if token.has_vector:
        node_token_id.append(nlp.vocab.vectors.find(key=token.norm))
    else:
        node_token_id.append(-1)
    node_text.append(token.text)

    explanation = spacy.explain(token.dep_)
    if explanation is None:
        # No glossary entry for this label: the token stays an edge-less node.
        # In the recorded run this is why e.g. token 3 ('are') and the
        # whitespace token 18 do not appear in the printed edge list.
        continue

    if token.dep_ not in dep_to_id:
        # Same exception type as the former `dep_map.index` failure, but it
        # names the offending label and token.
        raise ValueError(
            f"dependency label {token.dep_!r} (token {token.i}: {token.text!r}) "
            f"is not listed in dep_map"
        )

    head = token.head
    src_nodes.append(head.i)
    dst_nodes.append(token.i)
    edge_type_id.append(dep_to_id[token.dep_])
    edge_type.append(token.dep_)
    print(f"{token.text:8} {token.i} {token.dep_ + ' =>':10}  {head.text:9} {head.i} {explanation} ")

High     0 amod =>     problems  2 adjectival modifier 
risk     1 compound =>  problems  2 compound 
problems 2 nsubj =>    are       3 nominal subject 
address  4 attr =>     are       3 attribute 
in       5 prep =>     address   4 prepositional modifier 
the      6 det =>      program   8 determiner 
prototype 7 compound =>  program   8 compound 
program  8 pobj =>     in        5 object of preposition 
to       9 aux =>      make      10 auxiliary 
make     10 advcl =>    are       3 adverbial clause modifier 
sure     11 ccomp =>    make      10 clausal complement 
that     12 mark =>     is        15 marker 
the      13 det =>      program   14 determiner 
program  14 nsubj =>    is        15 nominal subject 
is       15 ccomp =>    sure      11 clausal complement 
feasible 16 acomp =>    is        15 adjectival complement 
.        17 punct =>    are       3 punctuation 
A        19 det =>      prototype 20 determiner 
prototype 20 nsubjpass =>  used      24 nominal subject (pa

In [10]:
# Graph with one edge per (head index -> dependent index) pair collected above.
# num_nodes is passed explicitly so that tokens which contributed no edge still
# exist as nodes, keeping node ids aligned with token positions in sample_doc.
dgl_graph = dgl.graph((src_nodes, dst_nodes), num_nodes=len(sample_doc))

In [11]:
# Per-node feature: vector-table lookup result for each token (-1 = no vector).
# Stored as int64 rather than int16. A row index above 32767 (possible with a
# larger vector table than the one loaded here) does not fit in int16, and
# int64 is the index dtype embedding lookups accept.
# NOTE(review): the -1 sentinel must be masked or remapped before these ids are
# used to index an embedding matrix.
dgl_graph.ndata['tokens'] = torch.from_numpy(np.array(node_token_id, dtype=np.int64))

# Per-edge feature: position of the dependency label in `dep_map`. int8 is kept
# because the 45 labels printed above fit comfortably in its range.
dgl_graph.edata['type'] = torch.from_numpy(np.array(edge_type_id, dtype=np.int8))

In [12]:
# Inspect the per-edge dependency-label ids now stored on the graph.
dgl_graph.edata['type']

tensor([ 6, 14, 29,  8, 39, 20, 14, 35,  9,  3, 13, 24, 20, 29, 13,  2, 41, 20,
        30,  9,  4, 10,  9, 44, 20, 21, 21, 20, 30,  9, 10,  4, 43, 41],
       dtype=torch.int8)

In [13]:
# Attach the example's metadata to the graph object as ordinary Python
# attributes (these are not ndata/edata feature tensors).
# NOTE(review): presumably not written out by dgl.save_graphs, which would
# explain the separate save_info call in a later cell. Verify.
dgl_graph.score = 4               # hard-coded score for this sample
dgl_graph.edge_class = edge_type  # dependency label strings, one per edge
dgl_graph.node_text = node_text   # token surface text, one per node

In [14]:
# Confirm the ad-hoc attribute set above is readable from the graph object.
dgl_graph.score

4

In [15]:
# Write the graph (structure plus ndata/edata tensors) to a DGL graph file.
graph_file = "./data.bin"
dgl.save_graphs(graph_file, [dgl_graph])

# Write the non-tensor data to a side-car info file. Every value is a
# one-element list, i.e. one entry per saved graph.
info_file = './data_info.pkl'
graph_info = {
    'texts': [node_text],
    'edge_type': [edge_type],
    'score': [4],
}
dgl.data.utils.save_info(info_file, graph_info)

In [16]:
# Round-trip check: read back the two files written in the previous cell.
# `k` is indexed as k[0][0] in the next cell to reach the first saved graph.
k = dgl.load_graphs("./data.bin")
# NOTE(review): the .pkl extension suggests pickle-based storage. Only load
# info files that come from a trusted source.
k_info = dgl.data.utils.load_info('./data_info.pkl')

In [17]:
# Edge features of the reloaded graph; expected to match dgl_graph.edata['type'] shown earlier.
k[0][0].edata

{'type': tensor([ 6, 14, 29,  8, 39, 20, 14, 35,  9,  3, 13, 24, 20, 29, 13,  2, 41, 20,
        30,  9,  4, 10,  9, 44, 20, 21, 21, 20, 30,  9, 10,  4, 43, 41],
       dtype=torch.int8)}