In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import json

In [2]:
# Folder holding the UD English-EWT treebank files.
Directory = "UD_English-EWT/"
# Only the dev split is listed here; the test and train splits are read by
# explicit path when the features are built.
Filename = ["en_ewt-ud-dev.conllu"]
Jsons = []  # declared but never filled in this cell

# Smoke test: make sure every listed file opens and each line tokenizes.
for f in Filename:
    Path = "".join((Directory, f))

    # The with-statement closes the file on exit, so no explicit close() follows.
    with open(Path, 'r', encoding='UTF-8') as j:
        for line in j:
            conllu = line.split()


In [3]:
def create_features(filepath):
    """Build per-token feature lists and UPOS labels from a CoNLL-U file.

    Only lines whose first column is a plain integer are used; comment lines,
    multiword-token ranges such as ``1-2`` and empty nodes such as ``1.1``
    are skipped because their ID is not purely numeric.

    Parameters
    ----------
    filepath : str
        Path to a UTF-8 encoded CoNLL-U file.

    Returns
    -------
    (x_final, y_final) : tuple of lists
        ``x_final[s][t]`` is the list of feature strings for token ``t`` of
        sentence ``s``; ``y_final[s][t]`` is that token's UPOS tag (column 4).
    """
    x_final = []
    y_final = []
    x_toadd = []
    y_toadd = []

    # The with-statement closes the file; no explicit close() is required.
    with open(filepath, 'r', encoding='UTF-8') as f:
        for line in f:
            # A blank line marks the end of a sentence.
            if not line.strip():
                # Skip empty sentences caused by leading/repeated blank lines.
                if x_toadd:
                    x_final.append(x_toadd)
                    y_final.append(y_toadd)
                    x_toadd = []
                    y_toadd = []
                continue

            # CoNLL-U columns are tab-separated and a field may itself hold a
            # space, so split on tabs when present. Fall back to whitespace
            # splitting for inputs that are not tab-delimited.
            if '\t' in line:
                conllu = line.rstrip('\r\n').split('\t')
            else:
                conllu = line.split()

            if not conllu[0].isnumeric():
                continue

            word = conllu[1]
            wordtype = conllu[3]
            extra_tags = conllu[5].split('|')

            features = ['bias',
                        'word.lower=' + word.lower(),
                        'word.iscapitalized=%s' % word.istitle(),
                        'word.isalphabetic=%s' % word.isalpha(),
                        'word.isalnum=%s' % word.isalnum(),
                        'word.prefix=' + word[:3],
                        'word.suffix=' + word[-3:]]

            # Expose the pronoun type from the morphological features, if any.
            for tag in extra_tags:
                if tag.startswith('PronType='):
                    features.append('word.prontype=' + tag[len('PronType='):])

            x_toadd.append(features)
            y_toadd.append(wordtype)

    # Flush the last sentence when the file does not end with a blank line.
    if x_toadd:
        x_final.append(x_toadd)
        y_final.append(y_toadd)

    return x_final, y_final
                

In [4]:
# Featurize the three treebank splits, in the order dev, test, train.
(dev_x_unneighbored, dev_y), (test_x_unneighbored, test_y), (train_x_unneighbored, train_y) = (
    create_features(split_path)
    for split_path in (
        "UD_English-EWT/en_ewt-ud-dev.conllu",
        "UD_English-EWT/en_ewt-ud-test.conllu",
        "UD_English-EWT/en_ewt-ud-train.conllu",
    )
)

In [5]:
def add_neighbors(x_data):
    """Return a copy of ``x_data`` where each token also carries context.

    For every token the result holds its own features, followed by the
    previous token's features (minus the first one) prefixed with ``-1:`` or
    the marker ``BOS`` at the start of a sentence, followed by the next
    token's features (minus the first one) prefixed with ``+1:`` or the
    marker ``EOS`` at the end of a sentence. The input is not modified.
    """

    def contextualize(sentence, position):
        # Start from a copy so the caller's feature lists stay untouched.
        own = list(sentence[position])

        if position == 0:
            left = ['BOS']
        else:
            left = ['-1:' + feat for feat in sentence[position - 1][1:]]

        if position == len(sentence) - 1:
            right = ['EOS']
        else:
            right = ['+1:' + feat for feat in sentence[position + 1][1:]]

        return own + left + right

    return [
        [contextualize(sentence, position) for position in range(len(sentence))]
        for sentence in x_data
    ]

In [6]:
# Attach previous/next-token context features to every split (dev, test, train).
dev_x, test_x, train_x = map(
    add_neighbors,
    (dev_x_unneighbored, test_x_unneighbored, train_x_unneighbored),
)

In [7]:
trainer = pycrfsuite.Trainer(verbose=True)

# zip() stops at the shorter input, so a mismatch would silently drop
# sentences from training; fail loudly instead.
assert len(train_x) == len(train_y), "train_x and train_y must hold the same number of sentences"

# Register every training sentence (feature sequence + tag sequence).
for xseq, yseq in zip(train_x, train_y):
    trainer.append(xseq, yseq)

In [8]:
# Training hyper-parameters; these values are echoed back in the
# "L-BFGS optimization" section of the training log.
# 'feature.possible_transitions' is not set here.
trainer.set_params(dict(
    c1=1.0,
    c2=1e-3,
    max_iterations=50,
))

In [9]:
# Run training; 'test' is the model file name that the Tagger cells open later
# (it is a file name, not a reference to the test split).
trainer.train('test')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 180973
Seconds required: 1.006

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 494379.656840
Feature norm: 1.000000
Error norm: 75066.795051
Active features: 89808
Line search trials: 1
Line search step: 0.000008
Seconds required for this iteration: 0.897

***** Iteration #2 *****
Loss: 435045.781791
Feature norm: 2.044834
Error norm: 61066.151824
Active features: 90860
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.403

***** Iteration #3 *****
Loss: 366063.364580
Feature norm: 4.285430
Error norm: 82590.437831
Active features: 91586
Line search trials: 1
Line search step: 1.000000
Seconds required 

***** Iteration #40 *****
Loss: 37476.063688
Feature norm: 148.082019
Error norm: 680.698046
Active features: 28193
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.413

***** Iteration #41 *****
Loss: 37329.700315
Feature norm: 150.140631
Error norm: 2403.566327
Active features: 26849
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.405

***** Iteration #42 *****
Loss: 37111.234877
Feature norm: 151.583081
Error norm: 1206.059644
Active features: 26753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.418

***** Iteration #43 *****
Loss: 37000.556796
Feature norm: 152.438288
Error norm: 353.699068
Active features: 26599
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.416

***** Iteration #44 *****
Loss: 36778.352030
Feature norm: 154.876342
Error norm: 574.569090
Active features: 25887
Line search trials: 1
Line search step: 1.0000

In [10]:
# Load the model file named 'test' (the name passed to trainer.train) for tagging.
tagger = pycrfsuite.Tagger()
tagger.open('test')

<contextlib.closing at 0x1791205a780>

In [11]:
# Spot check: tag the second sentence of the test split and show it next to
# the gold tags from test_y.
example_sent = test_x[1]

print("Predicted:", ' '.join(tagger.tag(example_sent)))
print("Correct:  ", ' '.join(test_y[1]))

Predicted: PRON SCONJ PROPN VERB ADP PRON NOUN PUNCT NOUN PUNCT CCONJ ADV NOUN PUNCT NOUN ADP DET ADJ PUNCT VERB NOUN NOUN PUNCT
Correct:   PRON SCONJ PROPN VERB ADP PRON NOUN PUNCT NOUN PUNCT CCONJ ADV NOUN PUNCT NOUN ADP DET ADV PUNCT ADJ NOUN NOUN PUNCT


In [12]:
def evaluate(data_x, data_y, tagger=None):
    """Count per-tag true positives, false positives and false negatives.

    Parameters
    ----------
    data_x : sequence
        Feature sequences, one per sentence, as accepted by ``tagger.tag``.
    data_y : sequence
        Gold tag sequences aligned with ``data_x``.
    tagger : object, optional
        Any object with a ``tag(xseq)`` method returning one tag per token.
        When omitted, the model file ``'test'`` is opened with
        ``pycrfsuite.Tagger`` and closed again before returning.

    Returns
    -------
    dict
        ``{tag: {'TP': int, 'FP': int, 'FN': int}}`` for every tag that was
        predicted or appears in the gold data.
    """
    # Only close a tagger this function opened; a caller-supplied one is
    # left open for further use.
    owns_tagger = tagger is None
    if owns_tagger:
        tagger = pycrfsuite.Tagger()
        tagger.open('test')

    scores = {}
    try:
        for i in range(len(data_x)):
            predicted = tagger.tag(data_x[i])
            actual = data_y[i]

            for j, guess in enumerate(predicted):
                gold = actual[j]
                # Register both tags (predicted first) so each gets an entry
                # even if it never scores a hit.
                guess_counts = scores.setdefault(guess, {'TP': 0, 'FP': 0, 'FN': 0})
                gold_counts = scores.setdefault(gold, {'TP': 0, 'FP': 0, 'FN': 0})

                if guess == gold:
                    guess_counts['TP'] += 1
                else:
                    guess_counts['FP'] += 1
                    gold_counts['FN'] += 1
    finally:
        if owns_tagger:
            tagger.close()

    return scores
        
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def print_evaluate(data):
    """Print precision, recall and F1 for every tag in ``data``.

    Parameters
    ----------
    data : dict
        Mapping ``{tag: {'TP': int, 'FP': int, 'FN': int}}`` as returned by
        ``evaluate``.

    A metric whose denominator is zero (a tag that was never predicted, never
    present in the gold data, or never matched) is reported as 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    for x, metrics in data.items():
        precision = _safe_ratio(metrics['TP'], metrics['TP'] + metrics['FP'])
        recall = _safe_ratio(metrics['TP'], metrics['TP'] + metrics['FN'])
        F1 = _safe_ratio(2 * (precision * recall), precision + recall)

        print("Stats for %s:" % x)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1: ", F1)
        

In [14]:
# X, SYM and INTJ are less frequent tags that were evaluated poorly.
# Count how often each occurs in the dev split to help pinpoint the cause.
x = sum(1 for tag_seq in dev_y for tag in tag_seq if tag == 'X')
SYM = sum(1 for tag_seq in dev_y for tag in tag_seq if tag == 'SYM')
INTJ = sum(1 for tag_seq in dev_y for tag in tag_seq if tag == 'INTJ')

print('X: ', x)
print('SYM: ', SYM)
print('INTJ: ', INTJ)

# Collect every dev sentence containing at least one INTJ, as a pair
# [gold tags, lower-cased words]. Feature 1 of each token is
# 'word.lower=<word>', and the slice [11:] strips that 11-character prefix.
looking_at = [
    [dev_y[idx], [token_feats[1][11:] for token_feats in dev_x[idx]]]
    for idx in range(len(dev_x))
    if 'INTJ' in dev_y[idx]
]


X:  155
SYM:  70
INTJ:  115


In [15]:
# For each collected sentence, show only the words whose gold tag is INTJ.
for tags, words in looking_at:
    print([words[pos] for pos, tag in enumerate(tags) if tag == 'INTJ'])

['welcome']
['right']
['no', 'no']
['please']
['whatever']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['lu']
['please']
['hey']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['please']
['plz']
['hi']
['please']
['please']
['please']
['please']
['hahaha']
['he', 'he']
['please']
['lol', 'sry']
['btw']
['please', 'please']
['hello']
['please']
['heh', 'yep']
['greetings']
['please']
['please']
['no']
['lmao']
['lol']
['nope']
['yes']
['like']
['no']
['er', 'no']
['no']
['hell', 'no']
['okay', 'like']
['like']
['like']
['please']
['please']
['please']
['well']
['yes']
['erm', 'ya', 'lols']
['no']
['please']
['uh']
['please']
['please']
['aye']
['yes']
['like']
['plllz']
['pls']
['pls']
['ok', 'well']
['well']
['ty']
['hi']
['yes']
['say']
['please']
['huh']
['yuck']
['ewww']
['yes']
['hi']
['yes']
['plz']
['like']
['okay']
['well']
['please']
['please']
['yes']
['eh', 'eh']
['ok']
['yay']
['p

In [80]:
# Scratch check: slicing from index 11 drops the 'word.lower=' prefix and
# leaves just the word.
'word.lower=a'[11:]

'a'

In [62]:
# Per-tag precision / recall / F1 on the dev split.
# Note: the name `results` is also assigned by the test-split evaluation cell.
print("Now evaluating dev data: ")
print()
results = evaluate(dev_x, dev_y)
print_evaluate(results)

Now evaluating dev data: 

Stats for ADP:
Precision:  0.9303317535545024
Recall:  0.971301335972291
F1:  0.9503752118131202
Stats for DET:
Precision:  0.9821709491347667
Recall:  0.9889123548046462
F1:  0.9855301236516706
Stats for NOUN:
Precision:  0.8889142857142858
Recall:  0.9266142482725757
F1:  0.907372841810546
Stats for PROPN:
Precision:  0.8613807982740022
Recall:  0.8508257858284497
F1:  0.856070758509783
Stats for VERB:
Precision:  0.93547197640118
Recall:  0.918204849800941
F1:  0.9267579908675799
Stats for PUNCT:
Precision:  0.9906572164948454
Recall:  0.9974051248783652
F1:  0.9940197187651527
Stats for NUM:
Precision:  0.9107142857142857
Recall:  0.9444444444444444
F1:  0.9272727272727271
Stats for PART:
Precision:  0.9487577639751553
Recall:  0.9698412698412698
F1:  0.9591836734693877
Stats for ADJ:
Precision:  0.9013426736719206
Recall:  0.8635346756152126
F1:  0.8820337046558125
Stats for ADV:
Precision:  0.9230769230769231
Recall:  0.8530805687203792
F1:  0.886699507

In [61]:
# Per-tag precision / recall / F1 on the test split.
# Note: the name `results` is also assigned by the dev-split evaluation cell.
print("Now evaluating test data: ")
print()
results = evaluate(test_x, test_y)
print_evaluate(results)

Now evaluating test data: 

Stats for PRON:
Precision:  0.9934823091247672
Recall:  0.9888785912882299
F1:  0.9911751045053415
Stats for SCONJ:
Precision:  0.9279538904899135
Recall:  0.8341968911917098
F1:  0.8785811732605728
Stats for PROPN:
Precision:  0.8279362010633156
Recall:  0.8251445086705202
F1:  0.8265379975874547
Stats for VERB:
Precision:  0.9435178165276725
Recall:  0.9388909845341381
F1:  0.9411987143127246
Stats for ADP:
Precision:  0.9412614347616755
Recall:  0.9683011391778108
F1:  0.9545898437499999
Stats for NOUN:
Precision:  0.8773496240601504
Recall:  0.9034599564481006
F1:  0.8902133746572893
Stats for PUNCT:
Precision:  0.9875914731148584
Recall:  0.9993560849967804
F1:  0.9934389502320371
Stats for CCONJ:
Precision:  0.9931972789115646
Recall:  0.989159891598916
F1:  0.991174473862865
Stats for ADV:
Precision:  0.9215686274509803
Recall:  0.8817292006525286
F1:  0.901208837015423
Stats for DET:
Precision:  0.9858416360776088
Recall:  0.9915611814345991
F1:  0.9