Markov Model Classifier / Poetry Generator
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import string

In [2]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.

File ‘robert_frost.txt’ already there; not retrieving.



In [3]:
input_files =  ['edgar_allan_poe.txt','robert_frost.txt',]

In [4]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [5]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [6]:
# collect data into lists
input_texts = []
labels = []

for label, f in enumerate(input_files):
  print(f'{f} corresponds to label {label}')

  for line in open(f):
    line = line.rstrip().lower()
    if line:
    # remove punctuation
      line = line.translate(str.maketrans('','', string.punctuation))

    input_texts.append(line)
    labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [7]:
train_text, test_text, y_train,y_test = train_test_split(input_texts,labels)

In [8]:
len(y_train) , len(y_test)

(1783, 595)

In [9]:
train_text[:5]

['of those unusual strings',
 'in the ghoulhaunted woodland of weir',
 'that smiled and died in this parterre enchanted',
 'struck the hard cellar bottom and then someone',
 'all all expired save thee save less than thou']

In [10]:
y_train[:10]

[0, 0, 0, 1, 0, 0, 0, 0, 1, 1]

In [11]:
idx = 1
word2idx = {'<unk>':0}

In [12]:
# Populate word2idx
for text in train_text:
  tokens = text.split()
  for token in tokens:
    if token not in word2idx:
      word2idx[token] = idx
      idx += 1


In [13]:
word2idx

{'<unk>': 0,
 'of': 1,
 'those': 2,
 'unusual': 3,
 'strings': 4,
 'in': 5,
 'the': 6,
 'ghoulhaunted': 7,
 'woodland': 8,
 'weir': 9,
 'that': 10,
 'smiled': 11,
 'and': 12,
 'died': 13,
 'this': 14,
 'parterre': 15,
 'enchanted': 16,
 'struck': 17,
 'hard': 18,
 'cellar': 19,
 'bottom': 20,
 'then': 21,
 'someone': 22,
 'all': 23,
 'expired': 24,
 'save': 25,
 'thee': 26,
 'less': 27,
 'than': 28,
 'thou': 29,
 'astartes': 30,
 'bediamonded': 31,
 'crescent': 32,
 'other': 33,
 'listening': 34,
 'things': 35,
 'a': 36,
 'broken': 37,
 'drinking': 38,
 'goblet': 39,
 'like': 40,
 'grail': 41,
 'how': 42,
 'could': 43,
 'be': 44,
 'i': 45,
 'thought': 46,
 'dead': 47,
 'were': 48,
 'souls': 49,
 'some': 50,
 'shattered': 51,
 'dishes': 52,
 'underneath': 53,
 'pine': 54,
 'as': 55,
 'if': 56,
 'towers': 57,
 'had': 58,
 'thrown': 59,
 'aside': 60,
 'oh': 61,
 'you': 62,
 'ask': 63,
 'me': 64,
 'what': 65,
 'will': 66,
 'he': 67,
 'do': 68,
 'who': 69,
 'are': 70,
 'they': 71,
 'for': 7

In [14]:
len(word2idx)

2485

In [15]:
# convert data into integer format
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)

In [16]:
train_text_int[100:105]

[[357, 358, 12, 359, 12, 360, 70, 170, 275],
 [361, 62, 268, 362, 363, 62, 48],
 [68, 364, 365],
 [285, 36, 366, 367, 368, 324, 76, 369, 370],
 [371, 372, 230, 6, 373, 374, 324]]

In [17]:
#initialize A and pi Matrics - for both classes
V = len(word2idx) # vocabulary size

A0 = np.ones((V,V)) # 2-D Matrix
pi0 = np.ones(V) # 1-D vectors

A1 = np.ones((V,V)) # 2-D Matrix
pi1 = np.ones(V) # 1-D vectors

In [18]:
# compute counts for A and Pi
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # it's the first word in sentence
        pi[idx] += 1
      else:
        # the last word exits, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx

In [19]:
compute_counts([t for t, y in zip(train_text_int,y_train) if y==0],A0,pi0)
compute_counts([t for t, y in zip(train_text_int,y_train) if y==1],A1,pi1)


In [20]:
# Normalize  A and Pi so they are valid probability matrics
# convince yourself that this is equivalent to the formulas shown before

A0 /= A0.sum(axis=1,keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1,keepdims=True)
pi1 /= pi1.sum()

In [21]:
# log A and Pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)


In [22]:
# Compute priors
count0= sum(y==0 for y in y_train)
count1 = sum(y==1 for y in y_train)
total = len(y_train)

p0 = count0 / total
p1 = count1 / total

logp0 = np.log(p0)
logp1= np.log(p1)

p0,p1

(0.33763320246775097, 0.662366797532249)

In [23]:
# Build a Classifier

class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0

    for idx in input_:
      if last_idx is None:
        # it's a first token
        logprob += logpi[idx]

      else:
        logprob += logA[last_idx, idx]

      # upadate last_idx
      lst_idx = idx

    return logprob

  def predict(self, inputs):
     predictions = np.zeros(len(inputs))

     for i, input_ in enumerate(inputs):
      posteriors = [self.compute_log_likelihood(input_, c) + self.logpriors[c] for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred

     return predictions



In [24]:
# each array must be in orde since are assumed to index these lists
clf = Classifier([logA0, logA1],[logpi0,logpi1],[logp0,logp1])

In [25]:
P_train = clf.predict(train_text_int)
print(f'Train acc:{np.mean(P_train == y_train)}')

Train acc:0.7408861469433539


In [28]:
P_test = clf.predict(test_text_int)
print(f'Test acc:{np.mean(P_test == y_test)}')

Test acc:0.7142857142857143


In [26]:
from sklearn.metrics import confusion_matrix, f1_score

In [27]:
cm = confusion_matrix(y_train, P_train)
cm

array([[ 195,  407],
       [  55, 1126]])

In [29]:
cm_test = confusion_matrix(y_test,P_test)
cm_test

array([[ 56, 139],
       [ 31, 369]])

In [30]:
f1_score(y_train, P_train)

0.8297715549005158

In [31]:
f1_score(y_test,P_test)

0.8127753303964759