In [1]:
%load_ext autoreload
%autoreload 2

# EDA

In [2]:
import json
import pprint
from collections import Counter

In [3]:
# Read the three raw splits; each list element is one line of the file,
# which is later parsed as a single JSON-encoded sentence.
with open("negation/cdd.epe", "r") as dev_file:
    sdev = list(dev_file)
with open("negation/cdt.epe", "r") as train_file:
    strain = list(train_file)
with open("negation/cde.epe", "r") as eval_file:
    seval = list(eval_file)

In [6]:
# Tally the development set: how often each id+source key occurs, and how
# many sentences there are per value of the `negations` field.
c_id = Counter()
c_negs = Counter()

for line in sdev:
    sentence = json.loads(line)
    c_id.update([sentence["id"] + sentence["source"]])
    c_negs.update([sentence["negations"]])


In [8]:
# Number of distinct id+source keys seen in the development set.
len(c_id)

787

In [15]:
# Total number of development sentences tallied across all `negations` values.
sum(c_negs.values())

787

In [4]:
# Parse and display a single development sentence to inspect the record layout.
json.loads(sdev[3])

{'id': '3',
 'source': 'wisteria01',
 'negations': 1,
 'nodes': [{'id': '0',
   'form': 'He',
   'properties': {'lemma': 'He', 'xpos': 'PRP'},
   'negation': [{'id': 0, 'scope': 'He'}]},
  {'id': '1',
   'form': 'made',
   'properties': {'lemma': 'make', 'xpos': 'VBD'},
   'negation': [{'id': 0, 'scope': 'made', 'event': 'made'}]},
  {'id': '2',
   'form': 'no',
   'properties': {'lemma': 'no', 'xpos': 'DT'},
   'negation': [{'id': 0, 'cue': 'no'}]},
  {'id': '3',
   'form': 'remark',
   'properties': {'lemma': 'remark', 'xpos': 'NN'},
   'negation': [{'id': 0, 'scope': 'remark', 'event': 'remark'}]},
  {'id': '4',
   'form': ',',
   'properties': {'lemma': ',', 'xpos': ','},
   'negation': [{'id': 0}]},
  {'id': '5',
   'form': 'but',
   'properties': {'lemma': 'but', 'xpos': 'CC'},
   'negation': [{'id': 0}]},
  {'id': '6',
   'form': 'the',
   'properties': {'lemma': 'the', 'xpos': 'DT'},
   'negation': [{'id': 0}]},
  {'id': '7',
   'form': 'matter',
   'properties': {'lemma': 'mat

### JSON structure:

In [5]:
def extractKeys(dic, s, parent=""):
    """Collect the dotted key paths found in a nested JSON-like structure.

    Dictionaries contribute one path per key (prefixed with ``parent`` when
    it is non-empty) and are descended into. Lists are transparent: their
    elements are visited under the same ``parent`` prefix. Any other value
    is a leaf and contributes nothing.

    Args:
        dic: The dict, list or leaf value to inspect.
        s: Set that the key paths are added to (mutated in place).
        parent: Dotted path of the enclosing key, ``""`` at the top level.

    Returns:
        The same set object ``s``.
    """
    if isinstance(dic, dict):
        for key, value in dic.items():
            keylabel = key if parent == "" else ".".join([parent, key])
            s.add(keylabel)
            extractKeys(value, s, keylabel)
        return s

    if isinstance(dic, list):
        for element in dic:
            extractKeys(element, s, parent)

    return s

Development set JSON structure:

In [6]:
# Gather every key path that occurs anywhere in the development split.
# extractKeys fills the set in place, so no rebinding is needed.
devkeys = set()
for line in sdev:
    extractKeys(json.loads(line), devkeys)

pprint.pprint(devkeys)

{'id',
 'negations',
 'nodes',
 'nodes.form',
 'nodes.id',
 'nodes.negation',
 'nodes.negation.cue',
 'nodes.negation.event',
 'nodes.negation.id',
 'nodes.negation.scope',
 'nodes.properties',
 'nodes.properties.lemma',
 'nodes.properties.xpos',
 'source'}


Training JSON structure:

In [7]:
# Gather every key path that occurs anywhere in the training split.
# extractKeys fills the set in place, so no rebinding is needed.
trainkeys = set()
for line in strain:
    extractKeys(json.loads(line), trainkeys)

pprint.pprint(trainkeys)

{'id',
 'negations',
 'nodes',
 'nodes.form',
 'nodes.id',
 'nodes.negation',
 'nodes.negation.cue',
 'nodes.negation.event',
 'nodes.negation.id',
 'nodes.negation.scope',
 'nodes.properties',
 'nodes.properties.lemma',
 'nodes.properties.xpos',
 'source'}


Eval set JSON structure:

In [8]:
# Gather every key path that occurs anywhere in the evaluation split.
# extractKeys fills the set in place, so no rebinding is needed.
evalkeys = set()
for line in seval:
    extractKeys(json.loads(line), evalkeys)

pprint.pprint(evalkeys)

{'id',
 'negations',
 'nodes',
 'nodes.form',
 'nodes.id',
 'nodes.negation',
 'nodes.negation.cue',
 'nodes.negation.event',
 'nodes.negation.id',
 'nodes.negation.scope',
 'nodes.properties',
 'nodes.properties.lemma',
 'nodes.properties.xpos',
 'source'}


## Data Processing

Each sentence contains a JSON object with the following structure:

* Sentence:
    - `id` = the sentence ID
    - `source` = source for the sentence
    - `negations` = number of negations. In our case, this is either 0 or 1
    - `nodes` -> List with a JSON object for each token of the sentence
        - For each word:
            - `id` = the index of the word
            - `form` = the token
            - `properties` -> JSON containing:
                - `lemma` = the lemma of the token
                - `xpos` = the POS-tag for the token
            - (`negation`) -> List of JSON(s) of the following format:
                - (`id`) = id of the negation cue the token belongs to
                - (`cue`) = the token making up the negation cue (either the same as `form`, 
                    or a sub-token of `form` in the case of affix cues)
                - (`scope`) = the token, same as `form` or a sub-token of `form`, indicated to be inside the scope
                - (`event`) = not of interest

The fields in parentheses are optional: some sentences lack them entirely, and where they are present a token may carry only some of them.
                
Some identified `negation` field occurrences are:
1. containing `id`
2. containing `id` and `cue`
3. containing `id` and `scope`
4. containing `id`, `cue` and `scope`
5. containing `id`, `scope` and `event`
6. containing `id`, `cue`, `scope` and `event`

In addition, sentences without a negation cue don't have the `negation` field defined for the tokens.

The `event` field is not of interest for this analysis, and will not be used for the labeling rules.

Since sentences with more than one negation cue have been 'multiplied out', the `id` field is not of interest. 
And in the case of affix-cues (for example *infrequent*), the splitting of the token into a sub-token cue and a sub-token belonging to the negation scope (*infrequent* -> *in* = `cue`, and *frequent* = `scope`) is not of immediate interest here.

Based on this, the following data preparation process is proposed:

**Labels:**
* `"T"` -> True, token is outside the negation scope
* `"F"` -> False, token is inside the negation scope
* `"C"` -> Cue, token is a negation cue

**Labelling process:**
* For each token in a sentence:
    - Has the `negation` field?
        - No -> Label as `"T"`
        - Yes:
            - Has the `cue` field?
                - Yes -> Label as `"C"`
                - No:
                    - Has the `scope` field?
                        - Yes -> Label as `"F"`
                        - No:
                            - Label as `"T"`

A brief explanation is: 
- If there is no `negation` field, which corresponds to no negation in the sentence, we label all tokens as `"T"`.
- If there is a `negation` field and it contains a `cue` value, we assume that this is a cue regardless of the other values.
- If there is no `cue` but there is a `scope`, we mark it as inside the scope, `"F"`
- If there is no `cue` or `scope`, we mark it as outside the scope, `"T"`


## Defining the Data Fields


In [9]:
from torchtext import data
from NSR.Data import StarSEM2012

The data cleaning pipeline:

In [10]:
def get_negations(x):
    """Return the first negation annotation of a token, or ``"T"`` if none.

    Args:
        x: Token (node) dictionary; may contain a ``"negation"`` list.

    Returns:
        The first element of ``x["negation"]`` when that list is present and
        non-empty, otherwise the string ``"T"`` (token outside any scope).
    """
    neg = x.get("negation")
    # A missing, None or empty annotation list all mean "no negation";
    # indexing [0] on an empty list would raise IndexError.
    if not neg:
        return "T"
    return neg[0]

def proc_negations(x):
    """Turn a negation annotation into one of the labels "T", "C" or "F".

    ``"T"`` is passed through unchanged. Otherwise the annotation is checked
    for ``"cue"`` first (label ``"C"``), then for ``"scope"`` (label ``"F"``);
    if neither key is present the label is ``"T"``.
    """
    if x == "T":
        return x

    # Ordered by priority: a cue wins over scope membership.
    for key, label in (("cue", "C"), ("scope", "F")):
        if key in x:
            return label
    return "T"

# Two-stage label pipeline: first pick the negation annotation of each
# token, then map that annotation to its "T"/"C"/"F" label.
labelpipeline = data.Pipeline(get_negations)
labelpipeline.add_after(proc_negations)

We extract the sentence ID, source, number of negations, tokens, lemmas, xpos tags, and labels.

In [11]:
# Sentence-level metadata: kept raw, with the numeric values cast to int.
ID = data.RawField(preprocessing=int)
SRC = data.RawField()
NEGS = data.RawField(preprocessing=int)

# Token-level inputs: lower-cased forms get a vocabulary; lemmas and POS
# tags are carried along unprocessed.
FORM = data.Field(lower=True)
LEMMA = data.RawField()
XPOS = data.RawField()

# Token-level targets produced by the label pipeline; no <unk> entry.
LABS = data.Field(unk_token=None, preprocessing=labelpipeline)

In [12]:
# Maps each (dotted) JSON key path to a pair of
# (attribute name on the loaded example, field that processes the value).
fields = {
    # Sentence-level metadata.
    "id": ("id", ID),
    "source": ("source", SRC),
    "negations": ("negations", NEGS),
    # Per-token values taken from the entries of `nodes`.
    "nodes.form": ("tokens", FORM),
    "nodes.properties.lemma": ("lemma", LEMMA),
    "nodes.properties.xpos": ("xpos", XPOS),
    # The whole node list is handed to LABS, whose preprocessing pipeline
    # derives the label of each token from its `negation` entry.
    "nodes": ("labels", LABS),
}

The data is located in the following files:
* train -> `cdt.epe`
* development -> `cdd.epe`
* evaluation -> `cde.epe`

In [13]:
# Load the three splits from `negation/` using the field mapping above:
# cdt.epe -> train, cdd.epe -> development, cde.epe -> evaluation.
train, dev, test = StarSEM2012.splits(
    path="negation/",
    train="cdt.epe",
    validation="cdd.epe",
    test="cde.epe",
    fields=fields,
)

In [14]:
# Report the number of sentences in each split.
for label, split in (("N_train: ", train), ("N_dev: ", dev), ("N_test: ", test)):
    print(label, len(split))

N_train:  3643
N_dev:  787
N_test:  1089


# Building the vocabularies

In [15]:
# Build both vocabularies from the training split only; the token
# vocabulary is built with max_size=3000.
FORM.build_vocab(train, max_size=3000)
LABS.build_vocab(train)

In [16]:
# Batch the training examples in groups of 10, ordered by sentence length.
its = data.BucketIterator(
    train,
    shuffle=True,
    batch_size=10,
    # The `fields` mapping exposes the token sequence as `tokens`
    # ("nodes.form" -> ("tokens", FORM)); examples have no `form` attribute,
    # so sorting on `x.form` fails on a fresh kernel.
    sort_key=lambda x: len(x.tokens),
    sort=True,
)

In [17]:
# Materialise every batch produced by the iterator and display them all.
list(its)

[
 [torchtext.data.batch.Batch of size 10]
 	[.id]:[95, 56, 39, 147, 211, 189, 186, 90, 107, 106]
 	[.source]:['baskervilles03', 'baskervilles03', 'baskervilles03', 'baskervilles14', 'baskervilles13', 'baskervilles05', 'baskervilles05', 'baskervilles05', 'baskervilles04', 'baskervilles04']
 	[.negations]:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 	[.form]:[torch.LongTensor of size 3x10]
 	[.lemma]:[['``', 'Yes', '.'], ['``', 'Excellent', '!'], ['Another', 'point', '.'], ['So', '!'], ['Lestrade', '.'], ['CARTWRIGHT', '.'], ['BASKERVILLE', '.'], ['Grimpen', '.'], ['Halloa', '!'], ['Halloa', '!']]
 	[.xpos]:[['``', 'UH', '.'], ['``', 'JJ', '.'], ['DT', 'NN', '.'], ['RB', '.'], ['VB', '.'], ['JJ', '.'], ['FW', '.'], ['UH', '.'], ['UH', '.'], ['UH', '.']]
 	[.neg]:[torch.LongTensor of size 3x10],
 
 [torchtext.data.batch.Batch of size 10]
 	[.id]:[205, 27, 22, 194, 184, 163, 150, 143, 121, 100]
 	[.source]:['baskervilles04', 'baskervilles04', 'baskervilles04', 'baskervilles03', 'baskervilles03', 'bask

In [42]:
# Keep only the first batch of the iterator; fall back to 0 when it yields nothing.
x = next(iter(its), 0)
        

AttributeError: 'tuple' object has no attribute 'is_target'

In [22]:
!conda list

# packages in environment at /Users/sigurd/anaconda3/envs/nlp:
#
# Name                    Version                   Build  Channel
_pytorch_select           0.1                       cpu_0  
appnope                   0.1.0                    py37_0  
asn1crypto                1.3.0                    py37_0  
astroid                   2.3.3                    py37_0  
backcall                  0.1.0                    py37_0  
blas                      1.0                         mkl  
boto                      2.49.0                   py37_0  
boto3                     1.12.0                     py_0  
botocore                  1.15.0                     py_0  
bz2file                   0.98                     py37_1  
ca-certificates           2020.1.1                      0  
certifi                   2020.4.5.1               py37_0  
cffi                      1.14.0           py37hb5b8e2f_0  
chardet                   3.0.4                 py37_1003  
cryptography              2.

In [18]:
import torchtext

In [None]:
# Display the Vectors class from torchtext.vocab (this cell has no recorded execution).
torchtext.vocab.Vectors