In [1]:
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from sklearn.model_selection import train_test_split

# Make the project's helper module in ../utils importable from this notebook.
import sys
sys.path.insert(1, '../utils')
# NOTE(review): star import — names used below but never defined in this
# notebook (e.g. `pt`, `corpus`, `pickle`, `pd`, `json`, `clean_data`,
# `predict_tags`) presumably all come from here; confirm against the module.
from NLP_little_helpers import *
# The saved output lists the available "potions" (modules) and "spells"
# (helper functions) right after this call — presumably printed by it.
little_helpers()

Using TensorFlow backend.
[nltk_data] Downloading package brown to /home/felipe/nltk_data...
[nltk_data]   Package brown is already up-to-date!


potions: re, numpy as np, pandas as pd, pickle, json, nltk, keras, collections
spells: clean_data, predict_tags, tagged_n_grams, unknown_words_X, check_and_predict


### Training HMM for POS-Tagging

In [2]:

# Split every tagged sentence of the corpus into two parallel sequences:
# zip(*sentence) turns [(word, tag), ...] into [(word, ...), (tag, ...)].

sentence_columns = [list(zip(*sentence)) for sentence in corpus]

words = [columns[0] for columns in sentence_columns]
tags = [columns[1] for columns in sentence_columns]

# One word sequence per tag sequence.
assert len(words) == len(tags)

In [3]:
# Hold out 20% of the sentences for testing; the fixed random_state keeps the
# split reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(
    words,
    tags,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Distinct words and distinct tags seen in the training split.
# The lists come from sets, so their element order is arbitrary.

train_words = list({token for sentence in X_train for token in sentence})
train_tags = list({label for sentence in y_train for label in sentence})

In [5]:
# Function calls: build the count tables the HMM is estimated from, using the
# training split only.
# NOTE(review): `pt` is not defined in this notebook — presumably it is
# provided by the NLP_little_helpers star import; confirm.

# Consumed below as {tag: {word: count}} when the emission distributions are built.
emission_counts = pt.pair_counts(X_train, y_train)
# Indexed by tag below; used as the denominator of emission and transition probabilities.
tag_unigrams = pt.unigram_counts(y_train)
# Consumed below as {(tag1, tag2): count} for the tag-to-tag transitions.
tag_bigrams = pt.bigram_counts(y_train)
# Consumed below as {tag: count} for transitions out of the model's start state.
tag_starts = pt.starting_counts(y_train)
# Consumed below as {tag: count} for transitions into the model's end state.
tag_ends = pt.ending_counts(y_train)

In [7]:
# Model training

model = HiddenMarkovModel(name="brown-hmm-tagger")


# Create one state per tag with the emission distribution P(word | tag) and
# add it to the model.
# The loop variables are deliberately NOT called `tags` / `words`: those names
# hold the corpus-level lists built earlier in the notebook and must survive
# this cell so the train/test split cell can be re-run.

states = {}

for tag, word_counts in emission_counts.items():
    tag_total = tag_unigrams[tag]
    emissions = DiscreteDistribution(
        {word: count / tag_total for word, count in word_counts.items()}
    )
    state = State(emissions, name=tag)
    states[tag] = state
    model.add_states(state)


# Add edges between states for the observed transition frequencies
# P(tag_i | tag_i-1).

# start -> tag: share of training sentences that begin with `tag`
total_starts = sum(tag_starts.values())
for tag, count in tag_starts.items():
    model.add_transition(model.start, states[tag], count / total_starts)

# tag1 -> tag2: bigram count relative to how often tag1 occurs
for (tag1, tag2), count in tag_bigrams.items():
    model.add_transition(states[tag1], states[tag2], count / tag_unigrams[tag1])

# tag -> end: how often a sentence ends with `tag`, relative to its occurrences
for tag, count in tag_ends.items():
    model.add_transition(states[tag], model.end, count / tag_unigrams[tag])


# Laplace-style smoothing: give tag bigrams that never occurred in training a
# small non-zero transition weight 1 / (number of training tags + count(tag1)).
# NOTE(review): the candidate bigrams are taken from y_test, so held-out labels
# decide which transitions exist in the model. Consider enumerating all
# (tag1, tag2) pairs over the training tags instead; left unchanged here.

tag_bigrams_test = pt.bigram_counts(y_test)
n_train_tags = len(train_tags)

for tag1, tag2 in tag_bigrams_test:
    if (tag1, tag2) in tag_bigrams:
        continue  # already has an observed transition
    if tag1 not in states or tag2 not in states:
        continue  # tag never seen in training, so there is no state to connect
    # The denominator is computed per bigram. A single running total that is
    # incremented inside the loop would make every later bigram's weight
    # smaller and dependent on iteration order.
    tag1_count = tag_unigrams[tag1] if tag1 in tag_unigrams else 0
    denominator = n_train_tags + tag1_count
    model.add_transition(states[tag1], states[tag2], 1 / denominator)


# Finalize the model once all states and transitions have been added.
model.bake()

In [11]:
# Save the trained tagger to the same path the prediction section loads it
# from, so a fresh top-to-bottom run tags with the model trained above.
# Assumes the ../Trained_Weights directory already exists.
with open('../Trained_Weights/hmm_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### Clean Data and Predict POS tags

In [3]:
# Restore the trained HMM tagger from disk.
# pickle.load can execute arbitrary code embedded in the file, so only load
# model files you produced yourself or otherwise trust.
with open('../Trained_Weights/hmm_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
# Read the '|'-separated movie lines without a header row. Rows whose field
# count does not match are skipped (error_bad_lines=False); the skipped line
# numbers are reported in the cell output.
# NOTE(review): error_bad_lines is deprecated in newer pandas releases in
# favour of on_bad_lines='skip' — switch when the environment is upgraded.
to_tag = pd.read_csv(
    'movie_lines.txt',
    sep='|',
    header=None,
    encoding='unicode_escape',
    error_bad_lines=False,
)

b'Skipping line 144001: expected 5 fields, saw 6\nSkipping line 144113: expected 5 fields, saw 6\nSkipping line 144283: expected 5 fields, saw 6\nSkipping line 144328: expected 5 fields, saw 6\nSkipping line 144337: expected 5 fields, saw 6\nSkipping line 144400: expected 5 fields, saw 6\nSkipping line 144438: expected 5 fields, saw 6\nSkipping line 225183: expected 5 fields, saw 9\nSkipping line 225288: expected 5 fields, saw 41\nSkipping line 225302: expected 5 fields, saw 6\nSkipping line 225394: expected 5 fields, saw 6\nSkipping line 225625: expected 5 fields, saw 6\n'


In [4]:
# Column 4 is the last of the five '|'-separated fields read above —
# presumably the spoken line text; confirm against the data file.
# NOTE(review): the saved output of this cell is
# "NameError: name 'movie_conversation' is not defined". That name does not
# appear in this cell, so it is presumably raised inside clean_data — confirm
# the helper runs on a fresh kernel.
raw_lines = to_tag[4]
data = clean_data(raw_lines)

NameError: name 'movie_conversation' is not defined

In [None]:
# Tag the cleaned lines with the trained HMM. The result is written out as
# JSON in the next cell, so it has to be JSON-serialisable.
# NOTE(review): the name suggests a dict of Viterbi decodings — confirm the
# exact structure against predict_tags in NLP_little_helpers.
viterbis_dict = predict_tags(data, model)

In [None]:
# Write the tagged lines to disk as JSON.
# json.dump (not json.dumps) is the call that serialises into a file object;
# json.dumps only returns a string and accepts no file argument.
with open('clean_tagged_data.json', 'w') as outfile:
    json.dump(viterbis_dict, outfile)