In [None]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite
import pandas as pd
import spacy
from spacy import displacy
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
#stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")
from IPython.display import display

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from itertools import chain
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.2.2.


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Helper functions

In [None]:
def get_root(phrase):
  """Return the first token in `phrase` whose `dep_` label is 'ROOT'.

  `phrase` is iterated once; call sites in this notebook pass the result of
  `nlp(...)`. Returns None when no token carries the 'ROOT' label (including
  an empty phrase).
  """
  return next((candidate for candidate in phrase if candidate.dep_ == 'ROOT'), None)

#takes a phrase as input in the tokenized format
#breadth-first search of the dependency tree of the given phrase
#returns a dictionary with the length of the path to root for each token in the sentence
def len_path_root(phrase):
  """Map every token of `phrase` to its depth below the dependency root.

  The tree is walked breadth-first starting from the token returned by
  `get_root`, following each token's `.children`. The root gets 0, its
  children 1, and so on.

  Returns:
    dict {token: depth}. Tokens that are not reachable from that root get -1
    (an arbitrary sentinel). If the phrase has no ROOT token at all (for
    example an empty phrase), every token is mapped to -1 instead of raising
    AttributeError on `None.children`.
  """
  root_token = get_root(phrase)
  if root_token is None:
    # Nothing to traverse: mark every token (if any) as unreachable.
    return {token: -1 for token in phrase}

  lengths = {root_token: 0}
  dist = 0
  frontier = list(root_token.children)
  while frontier:
    dist += 1
    next_frontier = []
    for token in frontier:
      lengths[token] = dist
      next_frontier.extend(token.children)
    frontier = next_frontier

  # assign the sentinel -1 to nodes not connected to the root
  for token in phrase:
    if token not in lengths:
      lengths[token] = -1

  return lengths

#takes the dataset, phrase_id and chapter_id
#returns the phrase as a string by merging the words
#used to iterate in the training and testing set for extracting phrases
def get_text(table, id,chapter):
  """Rebuild the text of one phrase from its rows in `table`.

  Rows are selected where `phrase_id == id` and `chapter_id == chapter`; their
  `word` values are passed through `join_punctuation` and joined with single
  spaces.
  """
  in_phrase = (table['phrase_id'] == id) & (table['chapter_id'] == chapter)
  words = table[in_phrase]['word'].values
  merged_words = join_punctuation(words)
  return ' '.join(merged_words)

def join_punctuation(seq, characters='.,;?!'):
    """Yield the items of `seq` with punctuation glued onto the preceding item.

    Args:
        seq: iterable of strings (word tokens).
        characters: every single character in this string counts as
            punctuation; an item equal to one of them is appended to the item
            before it instead of being yielded on its own.

    Yields:
        The merged strings, in order. An empty `seq` yields nothing; the
        original `next(seq)` raised StopIteration inside the generator, which
        Python turns into RuntimeError (PEP 479).
    """
    characters = set(characters)
    seq = iter(seq)
    try:
        current = next(seq)
    except StopIteration:
        # Nothing to join.
        return

    for nxt in seq:
        if nxt in characters:
            current += nxt
        else:
            yield current
            current = nxt

    yield current

# def get_text(table, id,chapter):
#   phrase_table = table[(table['phrase_id'] == id) & (table['chapter_id'] == chapter)]
#   return ' '.join(phrase_table['word'].values)

#returns a list with the labels of a phrase identified with the ch_id, and phr_id
def get_labels(table, id,chapter):
  """Return the `label` values of one phrase as an array.

  The phrase is the set of rows where `phrase_id == id` and
  `chapter_id == chapter`; the result keeps the row order of `table`.
  """
  selector = (table['phrase_id'] == id) & (table['chapter_id'] == chapter)
  return table.loc[selector, 'label'].values

#adjusted lemmatization for nltk library
#offers the POS as a parameter to lemmatization function to make it more precise
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts.

    `word` is tagged on its own with `nltk.pos_tag`, and the first letter of
    the tag decides the WordNet constant that is returned. Any tag that does
    not start with J, V or R (including N) is treated as a noun.
    """
    initial = nltk.pos_tag([word])[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

#used to match the length of the tokenization with the length of the filtered table
#initially implemented for the previous assignment of data preprocessing
#treats separately some exceptions found
def tokenize(arg, ch = 'baskervilles03', ph = 21):
  """Merge tokens of `arg` so its tokenization lines up with the dataset rows.

  Args:
    arg: object supporting iteration, slicing and `.retokenize()`; the call
      sites in this notebook pass the result of `nlp(...)`.
    ch: chapter id of the phrase; only used to select hard-coded special cases.
    ph: phrase id of the phrase; only used to select hard-coded special cases.

  Returns:
    The same `arg` object that was passed in, after the merges were applied.

  The merges are applied in this order:
    * two hard-coded phrases get fixed merges and return immediately;
    * (1) every '-' token is merged with the token before and after it;
    * (2) a '`' token directly preceded by another '`' is merged with it;
    * (3) the tokens '66', '86', 've', 'm' directly preceded by "'" are merged
      with that apostrophe, and '.' directly preceded by "No" is merged with
      it unless (ch, ph) is listed in `no_exc`;
    * if `ph` is 0 or '0', tokens 2 and 3 are merged.
  """

  # Hard-coded fix for one specific phrase; returns without the generic passes.
  if (ph, ch) == (436, 'wisteria02'):
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[0:2])
      retokenizer.merge(arg[4:7])
    return arg

  # Hard-coded fix for two phrases of 'cardboard'; also returns early.
  if (ph, ch) in [(450, 'cardboard'),(457, 'cardboard')]:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[2:4])
      retokenizer.merge(arg[0:2])
    return arg


  # (chapter, phrase) pairs for which "No" + "." must NOT be merged in pass 3.
  no_exc = [('baskervilles03', 16), ('baskervilles03', 20), ('baskervilles11', 45), ('baskervilles12', 283), ('baskervilles13', 271), ('baskervilles14', 55)]
  retok1_pos = []#for -
  retok2_pos = []#for `

  #1 - record the index of every '-' token
  shift = 0
  cr_pos = 0
  for token in arg:
    if token.text == '-':
      retok1_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text  # not read in this pass

  # Merge [token before, '-', token after] into one token. The recorded
  # positions refer to the original tokenization, so `shift` compensates for
  # the two tokens each earlier merge removed.
  for pos in retok1_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-1-shift:pos+2-shift])
      shift += 2

  #2 - record the index of every '`' that directly follows another '`'
  shift = 0
  cr_pos = 0
  prev_char = 0
  for token in arg:
    if token.text =='`' and prev_char == '`':
      retok2_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text

  # Merge each recorded token with its predecessor; one token disappears per merge.
  for pos in retok2_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-shift-1:pos+1-shift])
      shift += 1

  #3 - apostrophe suffixes and the abbreviation "No."
  retok2_pos = []
  suf = ['66', '86','ve','m']
  shift = 0
  cr_pos = 0
  prev_char = 0
  for token in arg:
    # `and` binds tighter than `or`: the `no_exc` exclusion only applies to
    # the "No" + "." alternative, not to the apostrophe-suffix alternative.
    if token.text in suf and prev_char == "'" or token.text == '.' and prev_char == "No" and (ch,ph) not in no_exc:
      retok2_pos.append(cr_pos)
    cr_pos+=1
    prev_char = token.text

  # Merge each recorded token with its predecessor, as in pass 2.
  for pos in retok2_pos:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[pos-shift-1:pos+1-shift])
      shift += 1

  # Phrase id 0 (int or str): merge tokens 2 and 3.
  # NOTE(review): the reason for this special case is not visible here - confirm.
  if ph in [0,'0']:
    with arg.retokenize() as retokenizer:
      retokenizer.merge(arg[2:4])


  return arg

## [Not needed anymore] Testing attributes on individual pre-set phrase before automatically adding to the table dataset

In [None]:
#testing if tokenize and data from table have the same length for each phrase
# NOTE: needs `train_data` from the "Importing datasets" section to be loaded first.
all_ch_ids = train_data['chapter_id'].unique()
mismatch=[]
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    # get_text takes the table as its first argument; the chapter slice is
    # enough because get_text filters on chapter and phrase again.
    phrase = get_text(filter_ch, ph, ch)
    phr_doc = nlp(phrase)
    tok = tokenize(phr_doc, ch, ph)
    # a phrase is misaligned when spaCy yields a different number of tokens
    # than the dataset has rows for it
    if len(filter_ph) != len(tok):
      mismatch.append((ch,ph))
print(len(mismatch))

NameError: ignored

In [None]:
#re-run of the token/row alignment check from the previous cell
#(the original header mentioned negation, but the code only compares lengths)
# NOTE: needs `train_data` from the "Importing datasets" section to be loaded first.
all_ch_ids = train_data['chapter_id'].unique()
mismatch=[]
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    # get_text takes the table as its first argument
    phrase = get_text(filter_ch, ph, ch)
    phr_doc = nlp(phrase)
    tok = tokenize(phr_doc, ch, ph)
    if len(filter_ph) != len(tok):
      mismatch.append((ch,ph))
print(len(mismatch))

In [None]:
#errors
# Print the raw spaCy tokens of one phrase that needed a hard-coded fix.
# get_text takes the table as its first argument.
phrase = get_text(train_data, 436, 'wisteria02')
phr_doc = nlp(phrase)
#toks = tokenize(phr_doc,436)
for tok in phr_doc:
  print(tok,'\n')

# flag = 0
# if(all(x in abc for x in mismatch)):
#     flag = 1
# print(flag)

In [None]:
#nbor(d) - neighbour in the initial sentence at distance d -/+ -> to left/right
phr1 = "He is interested in learning Natural Language Processing."
phr2 = "I stood upon the hearth-rug and picked up the stick which our visitor had left behind him the night before."
phr3 = "Gus Proto is a Python developer currently working for a London-based Fintech company"

phr_doc = nlp(phr2)
res = len_path_root(phr_doc)

for token in phr_doc:
  print(token.text, res[token], "\n")

displacy.render(phr_doc, style="dep", jupyter=True)

In [None]:
# Sample sentences for quick experiments; the last one is the one used
# (the original assigned all three to phr4 in a row, so only the last survived).
sample_phrases = [
    "Mr. Sherlock Holmes , who was usually very late in the mornings , save upon those not infrequent occasions when he was up all night , was seated at the breakfast table .",
    "guru99 is a totally new kind of learning experience.",
    "The striped bats are hanging on their feet for best",
]
phr4 = sample_phrases[-1]

#phr4 = sent_tokenize(phr4)
words_list = nltk.word_tokenize(phr4)
# tokenize() reads `.text` on every token and calls `.retokenize()`, so it has
# to receive the nlp(...) document; a list of plain strings fails on `.text`.
print([tok.text for tok in tokenize(nlp(phr4))])
print(words_list)
#adjusted lemma
#lemmatizer = WordNetLemmatizer()
#print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words_list])

#POS 1,2
# fine_tags = nltk.pos_tag(words_list)
# coarse_tags = nltk.pos_tag(words_list, tagset='universal')
# print(fine_tags)
# print(coarse_tags)

In [None]:
# `example_ph` instead of `id`, so the builtin id() is not shadowed for the
# rest of the notebook.
example_ph = 15
ch = 'baskervilles01'
# get_text takes the table as its first argument.
txt = get_text(train_data, example_ph, ch)
# Override with a hand-written sentence to look at how "n't" is tokenized.
txt = "I wouldn't do that"
phr_doc = nlp(txt)

#print(len(list(phr_doc)))

# with phr_doc.retokenize() as retokenizer:
#     retokenizer.merge(phr_doc[21:23])

#phr_doc = re_tokenize(phr_doc)
for token in phr_doc:
  print(token)

#displacy.render(phr_doc, style="dep", jupyter=True)

In [None]:
#backup
def sent2feature(sentence, ch = 'baskervilles03', ph = 21):
  """Backup variant: feature dicts for every token of `sentence`.

  The sentence is parsed with `nlp`, re-tokenized with `tokenize(doc, ch, ph)`,
  and each token's `word2feature` dict is extended with its `len_path_root`
  depth. No token is filtered out.

  NOTE: the "CRF functions" section defines another `sent2feature` with a
  different signature; once that cell runs it replaces this one.

  Returns:
    list of feature dicts, one per token, in token order.
  """
  tokens = tokenize(nlp(sentence), ch, ph)
  lengths = len_path_root(tokens)

  sent_feat = []
  for tok in tokens:
    features = word2feature(tok)
    features['len_path_root'] = lengths[tok]
    sent_feat.append(features)

  return sent_feat

def process_data(table):
  """Backup variant: build features and labels for every phrase of `table`.

  Phrases are visited chapter by chapter, in order of first appearance.

  NOTE: the "CRF functions" section defines another `process_data` with an
  extra `is_test` parameter; once that cell runs it replaces this one.

  Returns:
    (all_features, all_labels): two parallel lists with one entry per phrase -
    the `sent2feature` output and the `get_labels` array.
  """
  all_features = []
  all_labels = []
  for ch in table['chapter_id'].unique():
    filter_ch = table[table['chapter_id'] == ch]
    for ph in filter_ch['phrase_id'].unique():
      # Look the phrase up in the chapter slice instead of rescanning the
      # whole table twice per phrase; the selected rows are the same.
      phrase = get_text(filter_ch, ph, ch)
      labels = get_labels(filter_ch, ph, ch)
      all_features.append(sent2feature(phrase, ch, ph))
      all_labels.append(labels)

  return all_features, all_labels

## Importing datasets

In [None]:
#merging test datasets
test_card = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/ATM/SEM-2012-test-cardboard.txt', sep="\t", header = None)
test_circ = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/ATM/SEM-2012-test-circle.txt', sep="\t", header = None)

frames = [test_card, test_circ]
test_data = pd.concat(frames)
test_data.rename(columns={1: 'phrase_id', 0: 'chapter_id', 2:'word_id', 3:'word', 4:'label'}, inplace=True)
#print(test_data.head(40))

#train & dev
train_data = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/ATM/SEM-2012-training.txt', sep="\t", header = None)
dev = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/ATM/SEM-2012-dev.txt', sep="\t", header = None)

train_data.rename(columns={1: 'phrase_id', 0: 'chapter_id', 2:'word_id', 3:'word', 4:'label'}, inplace=True)
dev.rename(columns={1: 'phrase_id', 0: 'chapter_id', 2:'word_id', 3:'word', 4:'label'}, inplace=True)

#print(dev.head(10))
#counting B-Neg values
#print(train_data['label'].value_counts()['B-NEG'])

#merged datasets in test_data; train_data and dev separate

## Dataset exploration

In [None]:
all_ch_ids = train_data['chapter_id'].unique()
print(all_ch_ids)

['baskervilles01' 'baskervilles02' 'baskervilles03' 'baskervilles04'
 'baskervilles05' 'baskervilles06' 'baskervilles07' 'baskervilles08'
 'baskervilles09' 'baskervilles10' 'baskervilles11' 'baskervilles12'
 'baskervilles13' 'baskervilles14' 'wisteria01' 'wisteria02']


In [None]:
#number of phrases/chapter
all_ch_ids = dev['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = dev[dev['chapter_id'] == ch]
  #print(len(filter_ch))
  all_ph_ids = filter_ch['phrase_id'].unique()
  print(len(all_ph_ids))

347
440


In [None]:
#number of phrases/chapter
all_ch_ids = train_data['chapter_id'].unique()
all_features = []
all_labels = []
abc=[]
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  #print(len(filter_ch))
  all_ph_ids = filter_ch['phrase_id'].unique()
  abc.append(len(all_ph_ids))
print(sum(abc))

3644


In [None]:
all_ch_ids = test_data['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = test_data[test_data['chapter_id'] == ch]
  #print(len(filter_ch))
  all_ph_ids = filter_ch['phrase_id'].unique()
  print(len(all_ph_ids))

496
371
222


In [None]:
#num chapters training
phrase_lengths = []

all_ch_ids = train_data['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = train_data[train_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    phrase_lengths.append(len(filter_ph))

print(max(phrase_lengths))
print(min(phrase_lengths))
print(sum(phrase_lengths) / len(phrase_lengths))

83
2
17.961306256860592


In [None]:
phrase_lengths = []

all_ch_ids = test_data['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = test_data[test_data['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    phrase_lengths.append(len(filter_ph))

print(max(phrase_lengths))
print(min(phrase_lengths))
print(sum(phrase_lengths) / len(phrase_lengths))

68
2
17.6455463728191


In [None]:
phrase_lengths = []

all_ch_ids = dev['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = dev[dev['chapter_id'] == ch]
  all_ph_ids = filter_ch['phrase_id'].unique()
  for ph in all_ph_ids:
    filter_ph = filter_ch[filter_ch['phrase_id'] == ph]
    phrase_lengths.append(len(filter_ph))

print(max(phrase_lengths))
print(min(phrase_lengths))
print(sum(phrase_lengths) / len(phrase_lengths))

63
2
17.238881829733163


In [None]:
print(len(train_data))
print(len(dev))
print(len(test_data))

65451
13567
19216


In [None]:
all_ch_ids = test_data['chapter_id'].unique()
print(all_ch_ids)

['cardboard' 'circle01' 'circle02']


In [None]:
all_ch_ids = test_data['chapter_id'].unique()
all_features = []
all_labels = []
for ch in all_ch_ids:
  filter_ch = test_data[test_data['chapter_id'] == ch]
  #print(len(filter_ch))
  all_ph_ids = filter_ch['phrase_id'].unique()
  print(len(all_ph_ids))

496
371
222


In [None]:
# B-NEG / I-NEG / O counts for train, then dev, then test. The training split
# is printed without a header line, the other two with one.
for split_title, split_frame in ((None, train_data), ("dev", dev), ("test", test_data)):
  if split_title is not None:
    print('\n', split_title)
  split_labels = list(split_frame['label'].values)
  for tag in ('B-NEG', 'I-NEG', 'O'):
    print(split_labels.count(tag))

## CRF functions

In [None]:
#takes a token as input
#returns True if token should be kept, or False if it is filtered
#could be changed depending on performance
def keep(tok):
  """Decide whether `tok` stays in the training sequence.

  Tokens whose text is in the cue list are always kept. Otherwise a token is
  dropped when `tok.is_punct` is true or its text is "``", and kept in every
  other case.
  """
  always_kept = {'nor', 'Nor', 'neither', 'Neither', 'without', 'Without', 'nobody', 'Nobody', 'none', 'None', 'nothing', 'Nothing',
                 'never', 'not', 'no', 'Never', 'Not', 'No', 'nowhere', 'non', 'Nowhere', 'Non', "n't", "rather", "than", 'for', 'the'}
  # if tok.is_punct or tok.is_stop or tok.text == "``":
  return tok.text in always_kept or not (tok.is_punct or tok.text == "``")

def word2feature(token):
  """Build the feature dict of a single token.

  Reads `text`, `lemma_`, `pos_`, `tag_` and `dep_` from `token`.
  'prefix' is 1 when the text starts with one of the listed negative prefixes,
  'suffix' is 1 when the text contains 'less' anywhere (not only at the end);
  both are 0 otherwise.

  NOTE(review): in spaCy `pos_` is the coarse-grained tag and `tag_` the
  fine-grained one, so the 'fine_pos'/'coarse_pos' key names look swapped.
  They are left unchanged because they are the feature names the model sees.
  """
  negative_prefixes = ('un', 'in', 'im','il', 'dis', 'non', 'ir',
                       'Un', 'In', 'Im','Il', 'Dis', 'Non', 'Ir')
  surface = token.text

  #for feature selection the unwanted features can be commented
  return {
    'text': surface,
    'lemma':token.lemma_,
    'fine_pos': token.pos_,
    'coarse_pos': token.tag_,
    'dependency':token.dep_,
    #'head':token.head.text,
    'suffix': int('less' in surface),
    'prefix': int(surface.startswith(negative_prefixes)),
  }

#takes as input text of a sentence
#returns a list of dictionaries with the features of its tokens
def sent2feature(sentence, labels, is_test, ch = 'baskervilles03', ph = 21):
  """Extract per-token features of `sentence` and filter `labels` to match.

  Args:
    sentence: text of the phrase.
    labels: array-like with the phrase's labels, one per dataset row.
    is_test: when truthy no token is filtered out and `labels` is returned
      untouched.
    ch, ph: chapter and phrase id, forwarded to `tokenize` for its
      hard-coded special cases.

  Returns:
    (sent_feat, labels): the list of `word2feature` dicts of the kept tokens,
    and `labels` with the entries at the positions of the dropped tokens
    removed.

  Compared with the previous version, the unused WordNetLemmatizer instance
  and the discarded `len_path_root` call are gone, the builtin `ord` is no
  longer shadowed, and the dropped positions are deleted with a single
  `np.delete` instead of one call per token.
  """
  tokens = tokenize(nlp(sentence), ch, ph)

  sent_feat = []
  dropped = []  # positions (in the tokenized phrase) of filtered-out tokens
  for position, tok in enumerate(tokens):
    if keep(tok) or is_test:
      features = word2feature(tok)
      # To re-enable the depth feature, compute
      # `lengths = len_path_root(tokens)` before the loop and uncomment:
      #features['len_path_root'] = lengths[tok]
      sent_feat.append(features)
    else:
      dropped.append(position)

  if dropped:
    # Positions index the unfiltered label array, so no shifting is needed.
    labels = np.delete(labels, dropped)

  return sent_feat, labels


#takes the table as an input
#is_test makes the preprocessing function keep all entries in case of test data
#returns the list of lists of dictionaries with the features

#text->phrase->words->dict of features
#dict of feat->list of dicts->list of lists of dicts
def process_data(table, is_test):
  """Turn a dataset table into CRF-ready feature and label sequences.

  Args:
    table: DataFrame with at least `chapter_id`, `phrase_id`, `word` and
      `label` columns.
    is_test: forwarded to `sent2feature`; when truthy no token is filtered.

  Returns:
    (all_features, all_labels): parallel lists with one entry per phrase,
    visited chapter by chapter in order of first appearance. Each features
    entry is a list of per-token dicts, each labels entry the matching labels.
  """
  all_features = []
  all_labels = []
  for ch in table['chapter_id'].unique():
    filter_ch = table[table['chapter_id'] == ch]
    for ph in filter_ch['phrase_id'].unique():
      # Look the phrase up in the chapter slice rather than rescanning the
      # whole table twice per phrase; the selected rows are the same.
      phrase = get_text(filter_ch, ph, ch)
      labels = get_labels(filter_ch, ph, ch)
      #filtered
      #print(ch,ph)
      filt_features, filt_labels = sent2feature(phrase, labels, is_test, ch, ph)
      all_features.append(filt_features)
      all_labels.append(filt_labels)

  return all_features, all_labels

#solving exceptions in data
def filter(x,y):
  """Drop the last label of every sequence whose length differs from its features.

  `x` and `y` are walked in parallel. Where `len(sent) != len(lab)` the final
  element of `lab` is removed with `np.delete` (exactly one element, whatever
  the size of the difference); other label sequences are passed through
  unchanged. The number of adjusted sequences is printed.

  Returns:
    list with one label sequence per input pair.
  """
  mismatched = 0
  aligned = []
  for features, tags in zip(x, y):
    if len(features) == len(tags):
      aligned.append(tags)
      continue
    mismatched += 1
    aligned.append(np.delete(tags, len(tags)-1))
  print(mismatched)
  return aligned

## Testing code

In [None]:
print(y_train2[1])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

3644 3644
787 787


In [None]:
  # for word in sent:
  #   for feat in word:
  #     if :
  #       print(word, feat)
  #       break

In [None]:
#counting B-Neg from dataset
count = 0
for sent in y_train:
  for tok in sent:
    if tok == 'B-NEG':
      count+=1
print(count)

In [None]:
#iterate through tokens of sentence
doc = nlp("I don't like apples and pasta.")
for tok in doc:
  print(isinstance(tok, spacy.tokens.token.Token))

In [None]:
dictio = {'text': 'back',
          'lemma': 'back',
          'fine_pos': 'NOUN',
          'coarse_pos': 'NN',
          'dependency': 'pobj',
          'head': 'with',
          'suffix': 0,
          'prefix': 0,
          'len_path_root': 2}

for key in dictio:
  print(key,' : ' ,dictio[key])

text  :  back
lemma  :  back
fine_pos  :  NOUN
coarse_pos  :  NN
dependency  :  pobj
head  :  with
suffix  :  0
prefix  :  0
len_path_root  :  2


In [None]:
#extracting features for a single sentence
phr1 = get_text(train_data, 121, 'baskervilles08')
lab1 = get_labels(train_data, 121, 'baskervilles08')
doc = nlp(phr1)
doc = tokenize(doc)

#printing tokens
for tok in doc:
  print(tok.text, tok.pos_)

#printing extracted features
ld = sent2feature(phr1, lab1, 'baskervilles08', 121)
for word_dict in ld[0]:
  print(word_dict, '\n')


Now ADV
, PUNCT
all DET
these DET
rooms NOUN
are AUX
unfurnished VERB
and CCONJ
unoccupied ADJ
so SCONJ
that SCONJ
his PRON
expedition NOUN
became VERB
more ADV
mysterious ADJ
than ADP
ever ADV
. PUNCT
{'text': 'Now', 'lemma': 'now', 'fine_pos': 'ADV', 'coarse_pos': 'RB', 'dependency': 'advmod', 'head': 'unfurnished', 'suffix': 0, 'prefix': 0, 'len_path_root': 1} 

{'text': ',', 'lemma': ',', 'fine_pos': 'PUNCT', 'coarse_pos': ',', 'dependency': 'punct', 'head': 'unfurnished', 'suffix': 0, 'prefix': 0, 'len_path_root': 1} 

{'text': 'all', 'lemma': 'all', 'fine_pos': 'DET', 'coarse_pos': 'PDT', 'dependency': 'predet', 'head': 'rooms', 'suffix': 0, 'prefix': 0, 'len_path_root': 2} 

{'text': 'these', 'lemma': 'these', 'fine_pos': 'DET', 'coarse_pos': 'DT', 'dependency': 'det', 'head': 'rooms', 'suffix': 0, 'prefix': 0, 'len_path_root': 2} 

{'text': 'rooms', 'lemma': 'room', 'fine_pos': 'NOUN', 'coarse_pos': 'NNS', 'dependency': 'nsubjpass', 'head': 'unfurnished', 'suffix': 0, 'prefix':

## CRF tutorial with original code

In [None]:
def word2features(sent, i):
    """Feature dict for the i-th token of `sent` (tutorial feature set).

    `sent` is a sequence whose items hold the word at index 0 and the POS tag
    at index 1. Besides the token's own features, the dict carries the
    features of the previous token (keys prefixed '-1:') or 'BOS' for the
    first token, and those of the next token (keys prefixed '+1:') or 'EOS'
    for the last one.
    """
    def _neighbour(offset, j):
        # Features of the token at index j, keyed with the given offset prefix.
        n_word = sent[j][0]
        n_postag = sent[j][1]
        return {
            offset + ':word.lower()': n_word.lower(),
            offset + ':word.istitle()': n_word.istitle(),
            offset + ':word.isupper()': n_word.isupper(),
            offset + ':postag': n_postag,
            offset + ':postag[:2]': n_postag[:2],
        }

    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    has_previous = i > 0
    has_next = i < len(sent)-1

    if has_previous:
        features.update(_neighbour('-1', i-1))
    else:
        features['BOS'] = True

    if has_next:
        features.update(_neighbour('+1', i+1))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the `word2features` dict of every position in `sent`, in order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) triple in `sent`."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token (first field) of every (token, postag, label) triple in `sent`."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [None]:
nltk.download('conll2002')
nltk.corpus.conll2002.fileids()
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
X_train = [sent2features(s) for s in train_sents]
Y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
Y_test = [sent2labels(s) for s in test_sents]

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [None]:
print(Y_train[7])

In [None]:
print(x_train[7])

In [None]:
#print(X_train[7])
for feat in X_train[7]:
  print(feat)

In [None]:
for feat in x_train[1]:
  print(feat)

{'bias': 1.0, 'text': 'Mr.', 'lemma': 'Mr.', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'compound', 'head': Holmes, 'suffix': 0, 'prefix': 0, 'len_path_root': 2}
{'bias': 1.0, 'text': 'Sherlock', 'lemma': 'Sherlock', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'compound', 'head': Holmes, 'suffix': 0, 'prefix': 0, 'len_path_root': 2}
{'bias': 1.0, 'text': 'Holmes', 'lemma': 'Holmes', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'nsubj', 'head': save, 'suffix': 0, 'prefix': 0, 'len_path_root': 1}
{'bias': 1.0, 'text': 'usually', 'lemma': 'usually', 'fine_pos': 'ADV', 'coarse_pos': 'RB', 'dependency': 'advmod', 'head': was, 'suffix': 0, 'prefix': 0, 'len_path_root': 3}
{'bias': 1.0, 'text': 'late', 'lemma': 'late', 'fine_pos': 'ADV', 'coarse_pos': 'RB', 'dependency': 'acomp', 'head': was, 'suffix': 0, 'prefix': 0, 'len_path_root': 3}
{'bias': 1.0, 'text': 'the', 'lemma': 'the', 'fine_pos': 'DET', 'coarse_pos': 'DT', 'dependency': 'det', 'head': mornings

## Baseline model

In [None]:
#given dictionary of features of a token
#output label->B/
def get_pred(features_dict):
  """Rule-based baseline label for one token.

  Returns 'B-NEG' when the token's 'text' is in the cue list, or when its
  'suffix' or 'prefix' feature equals 1; returns 'O' otherwise. 'I-NEG' is
  never produced.
  """
  neg_cues = ('nor', 'Nor', 'neither', 'Neither', 'without', 'Without', 'nobody', 'Nobody', 'none', 'None', 'nothing', 'Nothing',
              'never', 'not', 'no', 'Never', 'Not', 'No', 'nowhere', 'non', 'Nowhere', 'Non', "n't", "rather", "than", 'for', 'the')

  # `or` short-circuits, so the affix features are only read for non-cue words.
  looks_negated = (
    features_dict['text'] in neg_cues
    or features_dict['suffix'] == 1
    or features_dict['prefix'] == 1
  )
  return 'B-NEG' if looks_negated else 'O'


## Testing CRF model

In [None]:
#data final form
x_train, y_trainin = process_data(train_data, False)
x_test, y_test = process_data(dev, True)

y_train = filter(x_train,y_trainin)
y_test = filter(x_test,y_test)


20
3


In [None]:
print(x_train[1])

[{'text': 'Mr.', 'lemma': 'Mr.', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'compound', 'suffix': 0, 'prefix': 0}, {'text': 'Sherlock', 'lemma': 'Sherlock', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'compound', 'suffix': 0, 'prefix': 0}, {'text': 'Holmes', 'lemma': 'Holmes', 'fine_pos': 'PROPN', 'coarse_pos': 'NNP', 'dependency': 'nsubj', 'suffix': 0, 'prefix': 0}, {'text': 'who', 'lemma': 'who', 'fine_pos': 'PRON', 'coarse_pos': 'WP', 'dependency': 'nsubj', 'suffix': 0, 'prefix': 0}, {'text': 'was', 'lemma': 'be', 'fine_pos': 'AUX', 'coarse_pos': 'VBD', 'dependency': 'relcl', 'suffix': 0, 'prefix': 0}, {'text': 'usually', 'lemma': 'usually', 'fine_pos': 'ADV', 'coarse_pos': 'RB', 'dependency': 'advmod', 'suffix': 0, 'prefix': 0}, {'text': 'very', 'lemma': 'very', 'fine_pos': 'ADV', 'coarse_pos': 'RB', 'dependency': 'advmod', 'suffix': 0, 'prefix': 0}, {'text': 'late', 'lemma': 'late', 'fine_pos': 'ADJ', 'coarse_pos': 'JJ', 'dependency': 'acomp', 'suffix': 0,

Baseline results

In [None]:
x_test, unf_ytest = process_data(test_data, True)
y_test = filter(x_test,unf_ytest)

27


In [None]:
y_hat_base=[]#final output -> list of lists of chars
for phrase in x_test:
  y_hat_phrase=[]
  for word in phrase:
    y_hat_phrase.append(get_pred(word))
  y_hat_base.append(y_hat_phrase)

In [None]:
print(len(y_hat_base))
print(len(y_test))

787
787


In [None]:
print(len(y_hat_base[1]))
print(len(y_test[1]))

25
25


In [None]:
for i in range(len(y_test)):
  if len(y_test[i])!=len(y_hat_base[i]):
    print("check again")


In [None]:
labels = ['O', 'B-NEG', 'I-NEG']
metrics.flat_f1_score(y_test, y_hat_base, average='weighted', labels=labels)

0.9527520340050186

In [None]:
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_hat_base, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

           O      0.999     0.929     0.963     18915
       B-NEG      0.161     0.967     0.276       269
       I-NEG      0.000     0.000     0.000         5

    accuracy                          0.929     19189
   macro avg      0.387     0.632     0.413     19189
weighted avg      0.987     0.929     0.953     19189



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model 1

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd',
    c2=0.1,
    max_iterations=1000,
    all_possible_transitions=True
)
crf.fit(x_train, y_train)

In [None]:
labels = ['O', 'B-NEG', 'I-NEG']
y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.9975257255186872

In [None]:
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

           O      0.998     0.999     0.999     13385
       B-NEG      0.934     0.881     0.906       176
       I-NEG      1.000     0.667     0.800         3

    accuracy                          0.998     13564
   macro avg      0.977     0.849     0.902     13564
weighted avg      0.998     0.998     0.998     13564



Model 2

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(x_train, y_train)

In [None]:
labels = ['O', 'B-NEG', 'I-NEG']
y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.9972982039350066

In [None]:
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

           O      0.999     0.999     0.999     13385
       B-NEG      0.908     0.892     0.900       176
       I-NEG      1.000     0.333     0.500         3

    accuracy                          0.997     13564
   macro avg      0.969     0.741     0.799     13564
weighted avg      0.997     0.997     0.997     13564



Hyperparameter tuning on model 2

In [None]:
labels = ['O', 'B-NEG', 'I-NEG']
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# c1 / c2 are drawn from exponential distributions with these scales
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

# search 100 iter
# random_state fixes the sampled (c1, c2) candidates so that the search, and
# therefore the selected model, is reproducible across runs.
rs = RandomizedSearchCV(crf, params_space,
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=100,
                        scoring=f1_scorer,
                        random_state=42)
rs.fit(x_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.116733206155079, 'c2': 0.025018142042647337}
best CV score: 0.9967958620097324
model size: 0.05M


Final testing set - We will use hypertuned model from above

In [None]:
x_test, unf_ytest = process_data(test_data, True)
y_test = filter(x_test,unf_ytest)

27


In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(x_test)
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

           O      0.998     0.998     0.998     18915
       B-NEG      0.865     0.907     0.886       269
       I-NEG      0.000     0.000     0.000         5

    accuracy                          0.997     19189
   macro avg      0.621     0.635     0.628     19189
weighted avg      0.996     0.997     0.996     19189



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from collections import Counter
def print_state_features(state_features):
    """Print one line per ((attribute, label), weight) entry.

    Each line shows the weight with six decimals, the label left-aligned in
    eight columns, then the attribute name.
    """
    for entry in state_features:
        (attr, label), weight = entry
        line = "%0.6f %-8s %s" % (weight, label, attr)
        print(line)

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(7))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-7:])

Top positive:
7.492896 B-NEG    lemma:nowhere
5.715529 B-NEG    lemma:nor
5.611259 O        fine_pos:VERB
5.003009 O        fine_pos:AUX
4.920772 O        fine_pos:NOUN
4.903639 B-NEG    suffix
4.858742 O        fine_pos:PRON

Top negative:
-3.260833 O        lemma:without
-3.309972 O        text:no
-3.491863 O        prefix
-3.680448 O        lemma:never
-4.100059 B-NEG    fine_pos:NOUN
-4.887205 O        suffix
-5.193294 B-NEG    fine_pos:ADV


## Error Analysis

In [None]:
#solving exceptions in data
#unf, Y_pred
def restore(x,y):
  """Pad label sequences with a trailing 'O' where their length differs from the reference.

  `x` (reference sequences) and `y` (label sequences) are walked in parallel.
  Where the two lengths differ, exactly one 'O' is appended to the label
  sequence via `np.insert` (the result is a numpy array); otherwise the label
  sequence is passed through unchanged.

  Returns:
    list with one label sequence per input pair.
  """
  return [
    np.insert(tags, len(tags),'O') if len(reference) != len(tags) else tags
    for reference, tags in zip(x, y)
  ]
def merged(my_list):
  """Flatten a list of sequences into a single numpy array, keeping the order."""
  flat = [element for sequence in my_list for element in sequence]
  return np.array(flat)

#lengths y_pred = x_test = y_test
# print(len(test_data))

# count = 0
# for prop in x_test:
#   for word in prop:
#     count+=1
# print(count)

In [None]:
y_pred = restore(unf_ytest, y_pred)
outputs = merged(y_pred)
test_data['prediction'] = outputs
non_match = test_data[test_data['prediction'] != test_data['label']]
false_neg = non_match[(non_match['prediction']=='O') & (non_match['label']=='B-NEG')]
false_pos = non_match[(non_match['label']=='O') & (non_match['prediction']=='B-NEG')]

In [None]:
print(len(false_neg))
print(len(false_pos))

25
37


In [None]:
display(false_pos)

Unnamed: 0,chapter_id,phrase_id,word_id,word,label,prediction
52,cardboard,1,5,unfortunately,O,B-NEG
2601,cardboard,127,12,intact,O,B-NEG
2789,cardboard,140,11,nothing,O,B-NEG
4518,cardboard,233,12,nothing,O,B-NEG
4985,cardboard,261,14,nothing,O,B-NEG
6967,cardboard,353,30,not,O,B-NEG
7476,cardboard,376,3,without,O,B-NEG
7613,cardboard,382,25,never,O,B-NEG
7961,cardboard,395,30,had,O,B-NEG
8028,cardboard,401,2,n't,O,B-NEG


In [None]:
display(false_neg)

Unnamed: 0,chapter_id,phrase_id,word_id,word,label,prediction
347,cardboard,12,32,unsolved,B-NEG,O
589,cardboard,23,17,incredulity,B-NEG,O
669,cardboard,27,4,far,B-NEG,O
751,cardboard,31,5,injustice,B-NEG,O
904,cardboard,41,6,unframed,B-NEG,O
3317,cardboard,167,9,discoloured,B-NEG,O
6238,cardboard,322,29,undoubtedly,B-NEG,O
6480,cardboard,331,1,unsuccessful,B-NEG,O
7962,cardboard,395,31,never,B-NEG,O
8353,cardboard,413,13,ceaseless,B-NEG,O
