In [237]:
import nltk
from nltk.corpus import masc_tagged, treebank
from nltk.tag import hmm
%autoreload 2
from ass3utils import train_unsupervised 
import random

In [287]:
def insert_word(words_by_tag, word):
    """Increment the occurrence count stored for ``word`` in place.

    ``words_by_tag`` is a dict mapping a word to its running count. A word
    that is absent, or whose stored count is falsy, ends up with a count
    of 1; otherwise the stored count grows by one. Nothing is returned.
    """
    seen_so_far = words_by_tag.get(word) or 0
    words_by_tag[word] = seen_so_far + 1

def get_counts(tagged_sents=None, source_tag='VB'):
    """Count word emissions per tag and the tags that follow ``source_tag``.

    Args:
        tagged_sents: iterable of sentences, each an indexable sequence of
            ``(word, tag)`` pairs. When ``None`` (the default) the sentences
            come from ``masc_tagged.tagged_sents()``.
        source_tag: the tag whose outgoing transitions are counted.
            Defaults to ``'VB'``.

    Returns:
        A ``(words, transitions)`` tuple of plain dicts:
        ``words[tag][word]`` is how often ``word`` carried ``tag``, and
        ``transitions[next_tag]`` is how often ``next_tag`` immediately
        followed ``source_tag`` inside a sentence.
    """
    if tagged_sents is None:
        tagged_sents = masc_tagged.tagged_sents()

    words = {}
    vb_transitions = {}

    for sent in tagged_sents:
        last_idx = len(sent) - 1
        for idx, (word, tag) in enumerate(sent):
            # Emission count: one nested {word: count} dict per tag.
            tag_words = words.setdefault(tag, {})
            tag_words[word] = tag_words.get(word, 0) + 1

            # A sentence-final token has no successor, so it contributes
            # no transition.
            if tag == source_tag and idx < last_idx:
                next_tag = sent[idx + 1][1]
                vb_transitions[next_tag] = vb_transitions.get(next_tag, 0) + 1

    return words, vb_transitions

def get_probabilities(words, transitions, target, word, word_tag):
    """Print a transition and an emission probability as percentages.

    Both values are relative frequencies computed from raw counts.

    Args:
        words: dict mapping tag -> {word: count}.
        transitions: dict mapping next-tag -> count. The printed message
            labels the source tag as VB.
        target: the next-tag whose transition probability is reported.
        word: the word whose emission probability is reported.
        word_tag: the tag under which ``word`` is looked up.

    An unseen ``target``, ``word`` or ``word_tag`` (or an empty count table)
    is reported as 0 % instead of raising ``KeyError`` /
    ``ZeroDivisionError``. Returns ``None``; the results are only printed.
    """
    total_trans = sum(transitions.values())
    # Guard the division so an empty transition table yields 0 %.
    trans_prob = transitions.get(target, 0) / total_trans if total_trans else 0.0

    tag_counts = words.get(word_tag, {})
    total_words = sum(tag_counts.values())
    word_prob = tag_counts.get(word, 0) / total_words if total_words else 0.0

    print(f'Probability of VB being followed by {target} is {trans_prob * 100} %')
    print(f'Probability of {word} within the tag {word_tag} is {word_prob * 100} %')
    
def tag_sents(model):
    """Tag four fixed groups of sentences with ``model`` and print the results.

    Every sentence is split on whitespace and passed to ``model.tag``; the
    returned value is printed followed by a blank line. A line of 100
    dashes is printed between consecutive groups (not after the last one).
    """
    sentence_groups = (
        # Group 1: ordinary English sentences with space-separated punctuation.
        [
            'Once we have finished , we will go out .',
            'There is always room for more understanding between warring peoples .',
            "Evidently , this was one of Jud 's choicest tapestries , for the noble emitted a howl of grief and rage and leaped from his divan .",
        ],
        # Group 2: sentences containing made-up words.
        # NOTE(review): the last sentence has no final ' .', unlike the others.
        [
            'Misjoggle in a gripty hifnipork .',
            'One fretigy kriptog is always better than several intersplicks .',
            'Hello my friend can you tag some words ineptly',
        ],
        # Group 3.
        # NOTE(review): 'time,' keeps its comma because only whitespace is
        # split on -- confirm this is intended.
        [
            'Yesterday these fiends operated upon Doggo .',
            'For a time, his own soul and this brain - maggot struggled for supremacy .',
        ],
        # Group 4: untokenised text; commas stay attached to the preceding word.
        [
            'System that prevents problems with little nephews',
            'Trump, Angered by Investigations, Blows Up Meeting With Democrats',
            'She Had Stage 4 Lung Cancer, and a Mountain to Climb',
            'Business partnership agreements are written agreements which states the rights, responsibility, and accountability of the parties involved in the agreement',
        ],
    )

    divider = '-' * 100
    for position, group in enumerate(sentence_groups):
        if position:
            # Separate this group from the previous one.
            print(divider)
        for text in group:
            print(model.tag(text.split()), '\n')

def log_prob(model):
    """Print ``model.log_probability`` for a fixed set of untagged sentences.

    Each sentence is split on whitespace and turned into a list of
    ``(word, None)`` pairs. That list is passed to
    ``model.log_probability`` and printed next to the returned value.
    """
    texts = (
        'Hi I am dog',
        'Try using your models as LMs',
        'Submit your answers',
        'Is you are we you they us them porridge',
        'Live computer eat slightly manic bag',
        'I am outputting a rather probable sentence but this one is still quite long one',
        'The the the the',
    )

    # Pair every token with a None tag, one list per sentence.
    test_sents = [[(token, None) for token in text.split()] for text in texts]

    for untagged in test_sents:
        print(untagged, 'Probability: ', model.log_probability(untagged))
        
def sample_model(model, length=15, rng=None):
    """Print a random sample drawn from ``model``.

    Args:
        model: object exposing ``random_sample(rng, length)``.
        length: value passed as the sample length. Defaults to 15.
        rng: random source handed to ``model.random_sample``. Defaults to
            the ``random`` module; pass e.g. ``random.Random(seed)`` to get
            a reproducible sample.

    Returns ``None``; the sample is only printed.
    """
    if rng is None:
        rng = random
    print(model.random_sample(rng, length))
    

In [190]:
# Build the count tables in this cell so it also runs on a fresh kernel;
# `words` and `transitions` are not assigned by any other visible cell.
words, transitions = get_counts()
get_probabilities(words, transitions, 'DT', 'feel', 'VB')

Probability of VB being followed by DT is 19.507320241842802 %
Probability of feel within the tag VB is 0.21406880071688159 %


In [191]:
# `train` is bound to the HiddenMarkovModelTagger class itself (not an
# instance), so `train.train(...)` below calls the class's `train` method.
train = hmm.HiddenMarkovModelTagger

# Supervised model fitted on the tagged MASC sentences.
model = train.train(masc_tagged.tagged_sents())

In [156]:
# Three ordinary sentences, pre-tokenised with spaces around punctuation,
# tagged with the supervised `model`.
sents = ['Once we have finished , we will go out .',
         'There is always room for more understanding between warring peoples .',
         'Evidently , this was one of Jud \'s choicest tapestries , for the noble emitted a howl of grief and rage and leaped from his divan .']

for sent in sents:
    print(model.tag(sent.split()), '\n')
    # Author's note: 'understanding' (second sentence) was expected to be
    # tagged as an adjective.

[('Once', 'RB'), ('we', 'PRP'), ('have', 'VBP'), ('finished', 'VBN'), (',', ','), ('we', 'PRP'), ('will', 'MD'), ('go', 'VB'), ('out', 'RP'), ('.', '.')] 

[('There', 'EX'), ('is', 'VBZ'), ('always', 'RB'), ('room', 'NN'), ('for', 'IN'), ('more', 'JJR'), ('understanding', 'NN'), ('between', 'IN'), ('warring', 'VBG'), ('peoples', 'NNS'), ('.', '.')] 

[('Evidently', 'UH'), (',', ','), ('this', 'DT'), ('was', 'VBD'), ('one', 'CD'), ('of', 'IN'), ('Jud', 'PRP'), ("'s", 'VBZ'), ('choicest', 'JJ'), ('tapestries', 'NNS'), (',', ','), ('for', 'IN'), ('the', 'DT'), ('noble', 'JJ'), ('emitted', 'IN'), ('a', 'DT'), ('howl', 'NN'), ('of', 'IN'), ('grief', 'NN'), ('and', 'CC'), ('rage', 'NN'), ('and', 'CC'), ('leaped', 'VBD'), ('from', 'IN'), ('his', 'PRP$'), ('divan', 'NNS'), ('.', '.')] 



In [280]:
# Sentences containing made-up words, tagged with the supervised `model`
# (presumably to see how tokens absent from the training data are handled).
sents2 = [
    'Misjoggle in a gripty hifnipork .',
    'One fretigy kriptog is always better than several intersplicks .',
    'I need my stormhammer',
    'Hello my friend can you tag some words ineptly .'
]
for sent in sents2:
    print(model.tag(sent.split()), '\n')

[('Misjoggle', 'RB'), ('in', 'IN'), ('a', 'DT'), ('gripty', 'JJ'), ('hifnipork', 'NNS'), ('.', '.')] 

[('One', 'CD'), ('fretigy', 'NNS'), ('kriptog', 'WDT'), ('is', 'VBZ'), ('always', 'RB'), ('better', 'JJR'), ('than', 'IN'), ('several', 'JJ'), ('intersplicks', 'NNS'), ('.', '.')] 

[('I', 'PRP'), ('need', 'VBP'), ('my', 'PRP$'), ('stormhammer', 'NNS')] 

[('Hello', 'UH'), ('my', 'PRP$'), ('friend', 'NN'), ('can', 'MD'), ('you', 'PRP'), ('tag', 'VBP'), ('some', 'DT'), ('words', 'NNS'), ('ineptly', '.'), ('.', '.')] 



In [236]:
# Read the token file: one sentence per line, tokens separated by whitespace.
with open('radio_planet_tokens.txt') as radio:
    lines = [raw.rstrip('\n').split() for raw in radio]

# Initialise from the tagged MASC sentences, then continue training on the
# untagged sentences for up to 10 iterations (the file is already closed).
u_model = train_unsupervised(masc_tagged.tagged_sents(), lines, 10)


Supervised training for initialization (34534 sentences)
Unsupervised training (999 sentences) for up to 10 iterations
iteration 0 logprob -143817.97376630787
iteration 1 logprob -113237.1566587721
iteration 2 logprob -109969.0380902845
iteration 3 logprob -107323.932597381
iteration 4 logprob -105083.54271992383
iteration 5 logprob -103256.08729363863
iteration 6 logprob -101949.46522053852
iteration 7 logprob -100943.05838591327
iteration 8 logprob -100162.11057696443
iteration 9 logprob -99518.63027287401


In [239]:
# Tag the fixed sentence groups defined in tag_sents with `u_model`.
tag_sents(u_model)

[('Once', 'RB'), ('we', 'PRP'), ('have', 'VBD'), ('finished', 'VBD'), (',', ','), ('we', 'PRP'), ('will', 'MD'), ('go', 'VBP'), ('out', 'RB'), ('.', '.')] 

[('There', 'NNP'), ('is', 'VBZ'), ('always', 'VBN'), ('room', 'NN'), ('for', 'IN'), ('more', 'PRP$'), ('understanding', 'NN'), ('between', 'NN'), ('warring', 'NN'), ('peoples', 'NN'), ('.', 'NN')] 

[('Evidently', "''"), (',', 'WRB'), ('this', 'PRP'), ('was', 'VBD'), ('one', 'CD'), ('of', 'IN'), ('Jud', 'NNP'), ("'s", 'NN'), ('choicest', 'NN'), ('tapestries', 'NN'), (',', 'NN'), ('for', 'NN'), ('the', 'NN'), ('noble', 'NN'), ('emitted', 'NN'), ('a', 'NN'), ('howl', 'NN'), ('of', 'NN'), ('grief', 'NN'), ('and', 'NN'), ('rage', 'NN'), ('and', 'NN'), ('leaped', 'NN'), ('from', 'NN'), ('his', 'NN'), ('divan', 'NN'), ('.', 'NN')] 

----------------------------------------------------------------------------------------------------
[('Misjoggle', 'NN'), ('in', 'NN'), ('a', 'NN'), ('gripty', 'NN'), ('hifnipork', 'NN'), ('.', 'NN')] 

[('O

In [288]:
# Compare the two models: `model` first, then `u_model`.
models = [model, u_model]

for m in models:
    # Only random sampling is run here; tag_sents(m) and log_prob(m) can be
    # called in this loop for the other comparisons.
    sample_model(m)
    # Visual separator between the two models' output.
    print('W-W' * 100)

[('she', 'PRP'), ('want', 'VBP'), ('like', 'IN'), ('it', 'PRP'), ('stabilize', 'VB'), ('of', 'IN'), ('lines', 'NNS'), ('attend', 'VBP'), ('to', 'TO'), ('enter', 'VB'), ('separates', 'JJ'), ('teabagging', 'VBG'), ('Wind', 'NNP'), ('America', 'NNP'), ('Uncle', 'NNP')]
W-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-W
[('Jud', 'NNP'), ('Arkilu', '...'), ('Formis', ')'), ('but', 'CC'), ('Doggo', 'EX'), ('slowly', 'VBD'), ('the', 'DT'), ('last', 'RBS'), ('kept', 'VBN'), ('gave', 'IN'), ('too', 'RB'), ('an', 'DT'), ('supporters', 'JJS'), ('in', 'IN'), ('her', 'PRP$')]
W-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-WW-W