In [None]:
import pandas as pd
import numpy as np
import requests
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
url1 = 'https://ufal.mff.cuni.cz/~pecina/courses/npfl068/data/TEXTEN2.ptg'
url2 = 'https://ufal.mff.cuni.cz/~pecina/courses/npfl068/data/TEXTCZ2.ptg'

In [None]:
def get_data(url):
  """Download a ``word/tag``-per-line corpus and return it as a list of pairs.

  Args:
    url: address of the text file; it is decoded as ISO-8859-2.

  Returns:
    list of ``(word, tag)`` tuples in file order.

  Raises:
    requests.HTTPError: when the server answers with an error status.
    ValueError: when a non-empty line contains no ``/`` separator.
  """
  response = requests.get(url)
  # fail loudly instead of trying to parse an HTML error page as corpus lines
  response.raise_for_status()
  response.encoding = 'iso-8859-2'

  data = []
  for line in response.text.split('\n'):
    line = line.rstrip('\r')
    # skip the empty string after the final newline (and any blank line)
    if not line:
      continue
    # split on the LAST slash so tokens that contain '/' themselves still parse
    word, tag = line.rsplit('/', 1)
    data.append((word, tag))
  return data

In [None]:
def split_to_sentences(data):
  """Cut a flat (word, tag) stream into sentences.

  A ('###', '###') pair marks a sentence boundary; the marker itself is never
  stored and empty sentences are not emitted.

  Returns:
    list of sentences, each a list of (word, tag) tuples.
  """
  boundary = ('###', '###')
  sentences = []
  current = []

  for word, tag in data:
    if (word, tag) != boundary:
      current.append((word, tag))
      continue
    # boundary reached: close the running sentence unless nothing was collected
    if current:
      sentences.append(current)
      current = []

  # the stream may end without a closing boundary marker
  if current:
    sentences.append(current)
  return sentences


In [None]:
# Download both corpora once; the later cells reuse these flat (word, tag) lists.
# url1 points at TEXTEN2.ptg and url2 at TEXTCZ2.ptg (see the URL cell above).
dataEN= get_data(url1)
dataCZ= get_data(url2)

In [None]:
# Assignment split: last 40k tokens = test (S), the 20k before them = held-out (H),
# everything else = training (T).
S,H,T = dataEN[-40000:], dataEN[-60000: -40000], dataEN[:-60000]
# Cross-validation split: first 40k tokens = test, last 20k = held-out, the middle =
# training. H_cx and T_cx were swapped here, which disagreed with the same split
# inside evaluate_with_brill_tagger.
S_cx, H_cx, T_cx = dataEN[:40000], dataEN[-20000:], dataEN[40000:-20000]

# **Brill's Tagger**


In [None]:
# install required packages
!pip install nltk



In [None]:
import nltk
from nltk.tbl import demo as brill_demo
from nltk.tag import UnigramTagger, BigramTagger, brill, brill_trainer, DefaultTagger

In [None]:
def train_brill_tagger(train_data, smooth_data, test_data, init_tagger = 'default', max_rules = 250):
  """Train a Brill tagger and return its accuracy on ``test_data``.

  Args:
    train_data: tagged sentences the transformation rules are learned from.
    smooth_data: tagged sentences used to train the unigram baseline; ignored
      when ``init_tagger == 'default'``.
    test_data: tagged sentences the trained tagger is scored on.
    init_tagger: ``'default'`` (every token gets ``'Z:-------------'``) or
      ``'unigram'`` (unigram tagger backing off to that default tag).
    max_rules: upper bound on the number of learned rules.

  Returns:
    accuracy of the trained tagger on ``test_data``.

  Raises:
    ValueError: for an unknown ``init_tagger`` (previously the baseline stayed
      ``None`` and the failure surfaced somewhere inside NLTK).
  """
  # validate first so that a typo is reported before any training work starts
  if init_tagger not in ('default', 'unigram'):
    raise ValueError(f"unknown init_tagger {init_tagger!r}; expected 'default' or 'unigram'")

  # initialize baseline tagger
  fallback_tagger = DefaultTagger('Z:-------------')
  if init_tagger == 'unigram':
    baseline_tagger = UnigramTagger(train=smooth_data, backoff=fallback_tagger)
  else:
    baseline_tagger = fallback_tagger

  # choose rule templates
  templates = brill.fntbl37()

  # train brill tagger
  trainer = brill_trainer.BrillTaggerTrainer(baseline_tagger, templates)
  brill_tagger = trainer.train(train_data, max_rules)

  # evaluate on test set
  return brill_tagger.accuracy(test_data)

In [None]:
def evaluate_with_brill_tagger(data, init_taggers=('unigram',), rules_num=(50, 100, 250), split_ids=(1, 2, 3, 4)):
  """Train and score Brill taggers over several data splits and settings.

  The defaults reproduce what the function did before it was parameterised
  (unigram baseline only, splits 1-4), so existing calls behave the same.
  Pass e.g. ``init_taggers=('default', 'unigram')`` and
  ``split_ids=(0, 1, 2, 3, 4)`` for the full grid instead of editing the body.

  Args:
    data: flat list of (word, tag) pairs.
    init_taggers: baseline taggers to try (see ``train_brill_tagger``).
    rules_num: values of ``max_rules`` to try.
    split_ids: which of the five predefined splits (0-4) to evaluate.

  Returns:
    DataFrame with one row per (split, baseline, rule count) and the columns
    accuracy, init_tagger, rules_num, split, smooth_data.

  Raises:
    ValueError: if ``split_ids`` contains an id outside 0-4.
  """
  # each entry is (test S, held-out H, training T)
  splits = {
      # original split
      0: (data[-40000:], data[-60000: -40000], data[:-60000]),
      # crossvalidation split
      1: (data[:40000], data[-20000:], data[40000:-20000]),
      # another 3 splits
      2: (data[20000:60000], data[60000:80000], data[:20000] + data[80000:]),
      3: (data[10000:50000], data[50000:70000], data[:10000] + data[70000:]),
      4: (data[-60000:-20000], data[-20000:], data[:-60000]),
  }

  unknown_ids = [i for i in split_ids if i not in splits]
  if unknown_ids:
    raise ValueError(f'unknown split ids: {unknown_ids}')

  columns = ['accuracy', 'init_tagger', 'rules_num', 'split', 'smooth_data']
  records = []
  for i in split_ids:
    test_part, heldout_part, train_part = splits[i]
    T_sentences = split_to_sentences(train_part)
    S_sentences = split_to_sentences(test_part)
    H_sentences = split_to_sentences(heldout_part)

    for init_tagger in init_taggers:
      for num in rules_num:
        test_acc = train_brill_tagger(train_data= T_sentences, smooth_data= H_sentences, test_data= S_sentences, init_tagger=init_tagger, max_rules=num)
        print(f'Accuracy on test set with baseline tagger: {init_tagger} and number of rules: {num} and split {i} is : {test_acc}')
        records.append({
            'accuracy': test_acc,
            'init_tagger': init_tagger,
            'rules_num': num,
            'split': i,
            # only the unigram baseline is trained on the smoothing data; the flag
            # used to be inverted (True for the 'default' baseline)
            'smooth_data': init_tagger == 'unigram',
        })

  # explicit columns keep the frame's layout even when no run was requested
  return pd.DataFrame(records, columns=columns)



In [None]:
# Train and evaluate the Brill tagger on the English data.
# NOTE(review): the output shown under this cell lists the 'default' baseline and
# split 0, which the current evaluate_with_brill_tagger body does not run; it looks
# like it was produced by an earlier version of that function.
results = evaluate_with_brill_tagger(dataEN)
results

Accuracy on test set with baseline tagger: default and number of rules: 50 and split 0 is : 0.5564884102938492
Accuracy on test set with baseline tagger: default and number of rules: 100 and split 0 is : 0.604933121268219
Accuracy on test set with baseline tagger: default and number of rules: 250 and split 0 is : 0.6694652308815477
Accuracy on test set with baseline tagger: unigram and number of rules: 50 and split 0 is : 0.8496858133653169
Accuracy on test set with baseline tagger: unigram and number of rules: 100 and split 0 is : 0.8551351915104425
Accuracy on test set with baseline tagger: unigram and number of rules: 250 and split 0 is : 0.8638176935311449
Accuracy on test set with baseline tagger: default and number of rules: 50 and split 1 is : 0.5533922381213021
Accuracy on test set with baseline tagger: default and number of rules: 100 and split 1 is : 0.6019756561628483
Accuracy on test set with baseline tagger: default and number of rules: 250 and split 1 is : 0.6614017254411

Unnamed: 0,accuracy,init_tagger,rules_num,split,smooth_data
0,0.556488,default,50,0,True
1,0.604933,default,100,0,True
2,0.669465,default,250,0,True
3,0.849686,unigram,50,0,False
4,0.855135,unigram,100,0,False
5,0.863818,unigram,250,0,False
6,0.553392,default,50,1,True
7,0.601976,default,100,1,True
8,0.661402,default,250,1,True
9,0.837986,unigram,50,1,False


In [None]:
# Train and evaluate the Brill tagger on the Czech data.
# This overwrites the `results` frame produced by the English cell.
results = evaluate_with_brill_tagger(dataCZ)
results

Accuracy on test set with baseline tagger: unigram and number of rules: 50 and split 1 is : 0.6014666029013231
Accuracy on test set with baseline tagger: unigram and number of rules: 100 and split 1 is : 0.6089855996599182
Accuracy on test set with baseline tagger: unigram and number of rules: 250 and split 1 is : 0.6253786067272438
Accuracy on test set with baseline tagger: unigram and number of rules: 50 and split 2 is : 0.6177516992353441
Accuracy on test set with baseline tagger: unigram and number of rules: 100 and split 2 is : 0.6238052251486831
Accuracy on test set with baseline tagger: unigram and number of rules: 250 and split 2 is : 0.6382752761257434
Accuracy on test set with baseline tagger: unigram and number of rules: 50 and split 3 is : 0.6293587254302205
Accuracy on test set with baseline tagger: unigram and number of rules: 100 and split 3 is : 0.6321780993164349
Accuracy on test set with baseline tagger: unigram and number of rules: 250 and split 3 is : 0.645290847674

Unnamed: 0,accuracy,init_tagger,rules_num,split,smooth_data
0,0.601467,unigram,50,1,False
1,0.608986,unigram,100,1,False
2,0.625379,unigram,250,1,False
3,0.617752,unigram,50,2,False
4,0.623805,unigram,100,2,False
5,0.638275,unigram,250,2,False
6,0.629359,unigram,50,3,False
7,0.632178,unigram,100,3,False
8,0.645291,unigram,250,3,False
9,0.586176,unigram,50,4,False


In [None]:
# Load the stored English Brill-tagger results for the summary tables.
# NOTE(review): no cell visible in this notebook writes this pickle, so a fresh
# "Restart & Run All" depends on the file already being present.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('sample_data/brill_tagger_EN_results.pkl', 'rb') as f:
    en_results = pickle.load(f)

In [None]:
# Overall table EN: accuracy for every (baseline tagger, rule count, split) combination.
en_results.groupby(['init_tagger','rules_num', 'split'])['accuracy'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy
init_tagger,rules_num,split,Unnamed: 3_level_1
default,50,0,0.556488
default,50,1,0.553392
default,50,2,0.565038
default,50,3,0.556711
default,50,4,0.547499
default,100,0,0.604933
default,100,1,0.601976
default,100,2,0.61733
default,100,3,0.604598
default,100,4,0.598084


In [None]:
# Mean accuracy and standard deviation across splits for EN, per baseline and rule count.
en_results.groupby([ 'init_tagger','rules_num'])['accuracy'].agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
init_tagger,rules_num,Unnamed: 2_level_1,Unnamed: 3_level_1
default,50,0.555826,0.006351
default,100,0.605384,0.007219
default,250,0.665612,0.006368
unigram,50,0.844106,0.005358
unigram,100,0.850418,0.005186
unigram,250,0.860227,0.005322


In [None]:
# Load the stored Czech Brill-tagger results and keep only the unigram-baseline rows.
# NOTE(review): no cell visible in this notebook writes this pickle; the file must
# already exist. pickle.load can execute arbitrary code, so only load trusted files.
with open('sample_data/brill_tagger_CZ_results (2).pkl', 'rb') as f:
    cz_results = pickle.load(f)
# a bare expression in the middle of a cell is not displayed; only the last one is
cz_results
cz_results = cz_results[cz_results['init_tagger'] == 'unigram']
cz_results

Unnamed: 0,accuracy,init_tagger,rules_num,split,smooth_data
0,0.601467,unigram,50,1,False
1,0.608986,unigram,100,1,False
2,0.625379,unigram,250,1,False
3,0.617752,unigram,50,2,False
4,0.623805,unigram,100,2,False
5,0.638275,unigram,250,2,False
6,0.629359,unigram,50,3,False
7,0.632178,unigram,100,3,False
8,0.645291,unigram,250,3,False
9,0.586176,unigram,50,4,False


In [None]:
# Overall table CZ: accuracy for every (baseline tagger, rule count, split) combination.
cz_results.groupby(['init_tagger','rules_num', 'split'])['accuracy'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy
init_tagger,rules_num,split,Unnamed: 3_level_1
unigram,50,0,0.584064
unigram,50,1,0.601467
unigram,50,2,0.617752
unigram,50,3,0.629359
unigram,50,4,0.586176
unigram,100,0,0.591025
unigram,100,1,0.608986
unigram,100,2,0.623805
unigram,100,3,0.632178
unigram,100,4,0.593321


In [None]:
# Mean accuracy and standard deviation across splits for CZ, per baseline and rule count.
cz_results.groupby([ 'init_tagger','rules_num'])['accuracy'].agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
init_tagger,rules_num,Unnamed: 2_level_1,Unnamed: 3_level_1
unigram,50,0.603763,0.019707
unigram,100,0.609863,0.018177
unigram,250,0.624211,0.01806


# **HMM tagging**

In [None]:
def compute_counts_n_grams(text):
  """Count tag trigrams, their bigram histories and the leading unigrams.

  Each sentence is padded with two '<S>' tags on both sides before counting.
  Bigram counts are obtained by summing trigram counts over the last tag, and
  unigram counts by summing those bigram counts over their second tag.

  Args:
    text: list of sentences, each a list of (word, tag) pairs.

  Returns:
    (c3, c2, c1): dicts keyed by tag trigram, tag bigram and single tag.
  """
  padding = [(None,'<S>'), (None,'<S>')]

  # c3 - trigram counts over the padded tag sequence
  c3 = {}
  for sentence in text:
    tag_seq = [pair[1] for pair in padding + sentence + padding]
    for trigram in zip(tag_seq, tag_seq[1:], tag_seq[2:]):
      c3[trigram] = c3.get(trigram, 0) + 1

  # c2 - how often each two-tag history opens a trigram
  c2 = {}
  for trigram, count in c3.items():
    history = trigram[:2]
    c2[history] = c2.get(history, 0) + count

  # c1 - how often each tag opens one of those bigrams
  c1 = {}
  for (first_tag, _), count in c2.items():
    c1[first_tag] = c1.get(first_tag, 0) + count

  return c3, c2, c1

In [None]:
def compute_probs_n_grams(text):
  """Estimate tag transition distributions of order 0 to 3 from tagged sentences.

  Args:
    text: list of sentences, each a list of (word, tag) pairs.

  Returns:
    (p_0, p_1, p_2, p_3) where p_0 is the uniform probability over the tags
    counted in c1, p_1 maps tag -> relative frequency, p_2 maps
    (tag, previous_tag) -> probability and p_3 maps
    (tag, tag_two_back, previous_tag) -> probability.
  """
  c3, c2, c1 = compute_counts_n_grams(text)

  # uniform distribution over the distinct tags that were counted
  p_0 = 1/len(c1)

  # unigram relative frequencies (the values are numpy floats)
  text_len = sum(c1.values())
  unigram_freqs = np.array(list(c1.values())) / text_len
  p_1 = {tag: freq for tag, freq in zip(c1, unigram_freqs)}

  # keys put the predicted tag first, followed by its history
  p_2 = {(cur, prev): count/c1[prev] for (prev, cur), count in c2.items()}
  p_3 = {
      (cur, prev2, prev1): count/c2[(prev2, prev1)]
      for (prev2, prev1, cur), count in c3.items()
  }

  return p_0, p_1, p_2, p_3

In [None]:
def em_for_tags(probs, held_data):
  """Fit the four interpolation weights of the tag transition model with EM.

  Args:
    probs: ``(p0, p1, p2, p3)`` as returned by ``compute_probs_n_grams``.
    held_data: held-out sentences, each a list of (word, tag) pairs. Only tag
      trigrams fully inside a sentence are used (sentences are not padded).

  Returns:
    numpy array of four lambdas for the uniform, unigram, bigram and trigram
    distributions. Iteration stops once no lambda changes by 1e-6 or more. If
    the held-out data contains no trigram, the initial uniform weights are
    returned (previously this gave NaN weights after 1,000,000 idle epochs).
  """
  p0, p1, p2, p3 = probs
  lamb = np.array([0.25,0.25,0.25,0.25])

  # The component probabilities of each held-out trigram do not depend on the
  # lambdas, so look them up once instead of in every epoch.
  rows = []
  for sentence in held_data:
    for i in range(len(sentence)-2):
      hi2, hi1, wi = sentence[i][1], sentence[i+1][1], sentence[i+2][1]
      rows.append((p0, p1.get(wi, 0), p2.get((wi,hi1),0), p3.get((wi,hi2,hi1),0)))

  if not rows:
    print('no held-out trigrams, keeping initial lambda coefs: ', lamb)
    return lamb

  component_probs = np.array(rows, dtype=float)

  for epoch in range(1000000):
    # E-step: share of every component in the interpolated probability of each trigram
    weighted = component_probs * lamb
    expected_counts = (weighted / weighted.sum(axis=1, keepdims=True)).sum(axis=0)

    # M-step and termination check
    lamb_next = expected_counts / expected_counts.sum()
    if ((epoch + 1) % 100) == 0 :print('number of epochs : ', epoch + 1, 'lambda coefs: ', lamb)

    if (np.abs(lamb - lamb_next) < 10**(-6)).all():
      break
    lamb = lamb_next

  print('number of epochs : ', epoch + 1, 'lambda coefs: ', lamb)
  return lamb

In [None]:
def smooth_prob(prob, lambdas):
  """Scale each component distribution by its interpolation weight.

  Args:
    prob: sequence whose first item is the uniform probability (a number) and
      whose remaining items are dicts mapping an event key to a probability.
    lambdas: one weight per item of ``prob``.

  Returns:
    new list ``[prob[0] * lambdas[0], {key: p * lambdas[1]}, ...]``. The input
    dicts are left untouched; the previous version rescaled them in place, so
    smoothing the same tables twice silently compounded the weights.
  """
  smoothed = [prob[0] * lambdas[0]]
  for i in range(1,len(lambdas)):
    weight = lambdas[i]
    # build a fresh dict so the caller's distribution is not modified
    smoothed.append({key: value * weight for key, value in prob[i].items()})
  return smoothed

In [None]:
def em_for_emmission(probs, held_data):
  """Fit the three interpolation weights of the emission model with EM.

  Args:
    probs: ``(p0, p1, p2)`` as returned by ``get_out_prob``.
    held_data: held-out sentences, each a list of (word, tag) pairs. Every pair
      of neighbouring tokens contributes one event (word, previous tag, tag);
      the first token of a sentence has no predecessor here and is not used.

  Returns:
    numpy array of three lambdas for the uniform, word-unigram and
    tag-conditioned distributions. Iteration stops once no lambda changes by
    1e-6 or more. Without any usable token pair the initial weights are
    returned unchanged.
  """
  p0, p1, p2 = probs
  lamb = np.array([0.3,0.3,0.4])

  # Look the component probabilities up once; they are constant across epochs.
  rows = []
  for sentence in held_data:
    # only positions i and i+1 are read, so the last pair is (len-2, len-1);
    # the former range(len(sentence)-2) skipped that final pair
    for i in range(len(sentence)-1):
      hi2, hi1, wi = sentence[i][1], sentence[i+1][1], sentence[i+1][0]
      rows.append((p0, p1.get(wi, 0), p2.get((wi,hi2,hi1),0)))

  if not rows:
    print('no held-out token pairs, keeping initial lambda coefs: ', lamb)
    return lamb

  component_probs = np.array(rows, dtype=float)

  for epoch in range(1000000):
    # E-step: share of every component in the interpolated probability of each event
    weighted = component_probs * lamb
    expected_counts = (weighted / weighted.sum(axis=1, keepdims=True)).sum(axis=0)

    # M-step and termination check
    lamb_next = expected_counts / expected_counts.sum()
    if ((epoch + 1) % 100) == 0 :print('number of epochs : ', epoch + 1, 'lambda coefs: ', lamb)

    if (np.abs(lamb - lamb_next) < 10**(-6)).all():
      break
    lamb = lamb_next

  print('number of epochs : ', epoch + 1, 'lambda coefs: ', lamb)
  return lamb

In [None]:
def get_out_prob(data):
  """Estimate emission distributions from tagged sentences.

  Args:
    data: list of sentences, each a list of (word, tag) pairs. One '<S>' tag
      is prepended to every sentence so the first word also has a previous tag.

  Returns:
    (p0, p1, p2): p0 is the uniform probability over the vocabulary, p1 maps
    word -> relative frequency, p2 maps (word, previous_tag, tag) ->
    P(word | previous_tag, tag).

  Raises:
    ZeroDivisionError: if ``data`` contains no tokens.
  """
  triple_counts = dict()
  history_counts = dict()
  word_counts = dict()

  for sentence in data:
    sent =  [(None,'<S>')] + sentence
    # compute counts of tag_prev, tag_curr, word_curr
    for i in range(len(sent) - 1):
      tp, tc, wc = sent[i][1], sent[i + 1][1], sent[i + 1][0]
      triple_counts[(wc, tp, tc)] = triple_counts.get((wc, tp, tc), 0) + 1
      history_counts[(tp, tc)] = history_counts.get((tp, tc), 0) + 1
      word_counts[wc] = word_counts.get(wc, 0) + 1

  data_length = sum(word_counts.values())

  p2 = {
      (wc, tp, tc): count / history_counts[(tp, tc)]
      for (wc, tp, tc), count in triple_counts.items()
  }
  # Normalise each word exactly once by the token count. The earlier version
  # divided inside the loop over (word, tag, tag) keys, so a word seen in k tag
  # contexts was divided k times, and it used data_length - 1 as the divisor.
  p1 = {wc: count / data_length for wc, count in word_counts.items()}

  # Computing uniform probability
  p0 = 1 / len(p1)

  return p0, p1, p2

In [None]:
def prunning_for_viterbi(current_states, beam_width=10):
  """Keep only the ``beam_width`` best-scoring states of one Viterbi stage.

  Args:
    current_states: dict mapping a state to a dict that has an "alpha" score.
    beam_width: number of states to keep; the default of 10 matches the
      previously hard-coded value.

  Returns:
    a new dict with the surviving states in their original order; the input
    dict is not modified. Among equal scores, earlier entries are dropped first.
  """
  pruned_viterbi = current_states.copy()
  excess = len(pruned_viterbi) - beam_width

  if excess > 0:
    # ascending by score, so the first `excess` entries are the weakest ones
    weakest_first = sorted(current_states, key=lambda state: current_states[state]["alpha"])
    for state in weakest_first[:excess]:
      del pruned_viterbi[state]

  return pruned_viterbi


In [None]:
def viterbi(trans_p, out_p, test_data, taglist):
  """Tag the test sentences with a pruned trigram Viterbi search and score them.

  Args:
    trans_p: ``(p0, p1, p2, p3)`` smoothed transition tables; p3 is keyed
      (tag, tag_two_back, previous_tag), p2 (tag, previous_tag), p1 tag.
    out_p: ``(p0, p1, p2)`` smoothed emission tables; p2 is keyed
      (word, previous_tag, tag), p1 word.
    test_data: list of sentences, each a list of (word, gold_tag) pairs.
    taglist: candidate tags tried at every position.

  Returns:
    tagging accuracy over all test tokens; 0.0 when there are no tokens
    (previously a ZeroDivisionError).
  """
  p0_t,p1_t,p2_t,p3_t = trans_p
  p0_e,p1_e,p2_e = out_p

  # Must equal the padding tag used when the transition/emission tables are
  # built ('<S>'); with the former 'S' no sentence-initial n-gram was ever found.
  start_tag = '<S>'

  correct = 0
  total = 0

  wh = 1
  for test_sentence in test_data:
    if wh % 1000 == 0:
      print(f'processed sentences: {wh} out of {len(test_data)}')
      if total > 0:
        print(f'accumulated accuracy so far: {correct / total}')
    wh += 1

    seq = [w for w, _ in test_sentence]
    tags = [t for _, t in test_sentence]

    # scores are log-probabilities; the start state has log(1) = 0
    vit = dict()
    vit[0] = dict()
    vit[0][start_tag, start_tag] = {'alpha': 0.0, 'backpointer': None}

    for i in range(len(seq)):

      vit[i + 1] = dict()

      # group surviving states (w, u) by their most recent tag u
      prev_tags = dict()
      for w,u in vit[i]:
        prev_tags.setdefault(u, []).append(w)

      # forward pass
      for new_tag in taglist:
        for u, older_tags in prev_tags.items():
          # the emission does not depend on w, so compute its log once per (u, new_tag)
          emission_prob = p2_e.get((seq[i], u, new_tag), 0) + p1_e.get(seq[i], 0) + p0_e
          emm_log = np.log(emission_prob)

          max_score = float('-Inf')
          max_tag = None

          for w in older_tags:
            transition_prob = p3_t.get((new_tag, w, u), 0) + p2_t.get((new_tag, u), 0) + p1_t.get(new_tag, 0) + p0_t
            trans_log = np.log(transition_prob)

            score = vit[i][w,u]['alpha'] + emm_log + trans_log
            if score > max_score:
              max_score = score
              max_tag = w,u

          vit[i + 1][u,new_tag] = {'alpha': max_score, 'backpointer': max_tag}
      vit[i + 1] = prunning_for_viterbi(vit[i + 1])

    # backward pass: start from the best final state and follow the backpointers
    max_prob = float('-inf')
    best_state = None
    for st, scores in vit[len(seq) ].items():
        if scores["alpha"] > max_prob:
            max_prob = scores["alpha"]
            best_state = st

    path = [best_state[1]]

    for i in range(len(seq), 1, -1):
        best_state = vit[i][best_state]['backpointer']
        path.append(best_state[1])

    path.reverse()

    total += len(tags)
    correct +=  sum(1 for pred_tag, true_tag in zip(path, tags) if pred_tag == true_tag)

  accuracy = correct / total if total > 0 else 0.0
  print('total accuracy: ', accuracy)
  return accuracy


In [None]:
def supervided_HMM(data):
  """Train a supervised trigram HMM tagger and return its test accuracy.

  The last 40k tokens of ``data`` are the test set, the 20k tokens before
  them the held-out set and the rest the training set. Transition and
  emission tables are estimated on the training sentences, their
  interpolation weights are fitted with EM on the held-out sentences, and the
  test sentences are decoded with ``viterbi``.

  Args:
    data: flat list of (word, tag) pairs.

  Returns:
    tagging accuracy on the test sentences.
  """
  # split data according to assignment
  S,H,T = data[-40000:], data[-60000: -40000], data[:-60000]

  S_sentences = split_to_sentences(S)
  H_sentences = split_to_sentences(H)
  T_sentences = split_to_sentences(T)

  # get transition prob from train sentences smooth on H sentences
  trans_prob = compute_probs_n_grams(T_sentences)
  print(f'EM for transition probabilities -----------')
  lambdas_trans = em_for_tags(trans_prob, H_sentences)
  trans_smoothed_prob = smooth_prob(trans_prob, lambdas_trans)

  # get emission probabilities
  out_prob = get_out_prob(T_sentences)
  print(f'EM for emission probabilities ------------')
  lamdas_out = em_for_emmission(out_prob, H_sentences)
  out_smoothed_prob = smooth_prob(out_prob, lamdas_out)

  # evaluate with viterbi
  print('Running viterbi -------------')
  # Take the tags from the training sentences (this leaves out the '###'
  # separator, which never occurs inside a test sentence) and sort them:
  # list(set(...)) yields a different order on every run, which made the
  # tie-breaking in viterbi and in its pruning non-reproducible.
  taglist = sorted({tag for sentence in T_sentences for _, tag in sentence})
  total_acc = viterbi(trans_smoothed_prob, out_smoothed_prob, S_sentences, taglist)
  return total_acc


In [None]:
# Run the supervised HMM tagger on the English data.
# NOTE(review): the stored output shows a (tag list, accuracy) tuple, whereas
# supervided_HMM as defined above returns only the accuracy; the output appears to
# come from an earlier version of the code.
supervided_HMM(dataEN)

EM for transition probabilities -----------
number of epochs :  100 lambda coefs:  [5.10452848e-04 1.18780700e-02 2.22067074e-01 7.65544403e-01]
number of epochs :  107 lambda coefs:  [5.10283353e-04 1.18798309e-02 2.22057837e-01 7.65552049e-01]
EM for emission probabilities ------------
number of epochs :  5 lambda coefs:  [1.97219606e-01 2.94435764e-08 8.02780364e-01]
Running viterbi -------------
processed sentences: 1000 out of 1647
accumulated accuracy so far: 0.8573455167522009
total accuracy:  0.8572992986207076


(['JJ', 'NN', 'VBZ', 'DT', 'NNS', 'IN', 'JJ', 'NNS', '.'], 0.8572992986207076)

In [None]:
# Run the supervised HMM tagger on the Czech data.
# NOTE(review): as with the English run, the stored tuple output does not match the
# single accuracy value that supervided_HMM returns now.
supervided_HMM(dataCZ)

EM for transition probabilities -----------
number of epochs :  45 lambda coefs:  [0.01457698 0.06348909 0.60791964 0.31401429]
EM for emission probabilities ------------
number of epochs :  5 lambda coefs:  [3.99825128e-01 2.74846076e-07 6.00174597e-01]
Running viterbi -------------
processed sentences: 1000 out of 2363
accumulated accuracy so far: 0.6080609514947074
processed sentences: 2000 out of 2363
accumulated accuracy so far: 0.6154898654023611
total accuracy:  0.6149370317232584


(['Db-------------',
  'RR--7----------',
  'NNFS7-----A----',
  'J^-------------',
  'AAIS2----1A----',
  'NNIS2-----A----',
  'RR--4----------',
  'Ca--4----------',
  'NNIP2-----A----',
  'Z:-------------'],
 0.6149370317232584)

# **Baum-Welch unsupervised training**

In [None]:
def compute_alphas(sentence, trans_p, out_p, taglist):
  """Forward pass of the trigram HMM with per-position normalisation.

  A state at position t is the tag pair (tag at t-1, tag at t).

  Args:
    sentence: list of words.
    trans_p: ``(p0, p1, p2, p3)`` smoothed transition tables.
    out_p: ``(p0, p1, p2)`` smoothed emission tables.
    taglist: candidate tags.

  Returns:
    (alphas, norm_factors): ``alphas[t]`` maps a state to its normalised
    forward probability at position t (states with probability 0 are not
    stored); ``norm_factors[t]`` is the sum the values at position t were
    divided by (1 for position 0).
  """
  # Same padding tag as used when the probability tables are built; with the
  # former 'S' the sentence-initial n-grams were never found in the tables.
  start_tag = '<S>'
  # placeholder in front so that the word at position t is sent[t]
  sent = ['S'] + sentence

  p0_t,p1_t,p2_t,p3_t = trans_p
  p0_e,p1_e,p2_e = out_p

  alphas = dict()
  alphas[0] = dict()
  alphas[0][start_tag, start_tag] = 1

  norm_factors = [None for i in range(len(sent))]
  norm_factors[0] = 1

  for i in range(len(sent) - 1):
    alphas[i + 1] = dict()
    # at the first positions the history still consists of start tags
    if i == 0:
      prevprev_tags = [start_tag]
      prev_tags = [start_tag]
    elif i == 1:
      prevprev_tags = [start_tag]
      prev_tags = taglist
    else:
      prevprev_tags = prev_tags = taglist

    whole_stage_alpha = 0
    for u in prev_tags:
      for new_tag in taglist:
        current_alpha = 0

        # emission p(w | (previous_tag, current_tag))
        emm_prob = p2_e.get((sent[i + 1], u,new_tag), 0) + p1_e.get(sent[i + 1], 0) + p0_e

        for w in prevprev_tags:
          # transition p
          trans_prob = p3_t.get((new_tag, w, u), 0) + p2_t.get((new_tag, u), 0) + p1_t.get(new_tag,0) + p0_t
          # zero-probability states are not stored, so default to 0 instead of
          # multiplying None (a TypeError before)
          current_alpha += alphas[i].get((w,u), 0) * emm_prob * trans_prob

        if current_alpha != 0:
          alphas[i + 1][u,new_tag] = current_alpha
          whole_stage_alpha += current_alpha

    # normalisation
    for key in alphas[i + 1].keys():
      alphas[i + 1][key] = alphas[i + 1][key] / whole_stage_alpha

    norm_factors[i+1] = whole_stage_alpha

  return alphas, norm_factors

In [None]:
def compute_betas(sentence, trans_p, out_p, taglist, norm_factors):
  """Backward pass of the trigram HMM, scaled with the forward norm factors.

  A state at position t is the tag pair (tag at t-1, tag at t).

  Args:
    sentence: list of words.
    trans_p: ``(p0, p1, p2, p3)`` smoothed transition tables.
    out_p: ``(p0, p1, p2)`` smoothed emission tables.
    taglist: candidate tags.
    norm_factors: per-position normalisers returned by ``compute_alphas``.

  Returns:
    dict ``beta[t]`` mapping a state to its scaled backward value at position
    t; states whose value is 0 are not stored. Every state of the last
    position has the value 1.
  """
  p0_t,p1_t,p2_t,p3_t = trans_p
  p0_e,p1_e,p2_e = out_p

  # Same padding tag as used when the probability tables are built.
  start_tag = '<S>'
  # placeholder in front so that the word at position t is sent[t]
  sent = ['S'] + sentence
  last = len(sent) - 1

  def history_tags(position):
    # tags possible at position - 1: only the start tag for positions 0 and 1
    return [start_tag] if position <= 1 else taglist

  def current_tags(position):
    # tags possible at the position itself: only the start tag at position 0
    return [start_tag] if position == 0 else taglist

  beta = dict()
  for t in range(len(sent)):
      beta[t] = dict()

  # Initialise the last position with its real states; for a one-word sentence
  # these are (start, tag) pairs, which the plain tag-by-tag grid did not contain.
  for prev_t in history_tags(last):
    for current_t in current_tags(last):
      beta[last][prev_t, current_t] = 1

  for i in range(last - 1, -1, -1):
    for prev_t in history_tags(i):
      for current_t in current_tags(i):
        current_beta = 0
        for next_tag in taglist:
          # emm_prob
          emm_prob = p2_e.get((sent[i+1], current_t, next_tag), 0) + p1_e.get(sent[i+1], 0) + p0_e

          # trans_prob
          trans_prob = p3_t.get((next_tag, prev_t, current_t), 0) + p2_t.get((next_tag, current_t),0) + p1_t.get(next_tag,0) + p0_t

          current_beta += beta[i+1].get((current_t, next_tag), 0) * emm_prob * trans_prob

        # store once per state, after the sum over next tags is complete;
        # scaling with norm_factors[i] is kept as it was
        if current_beta != 0:
            beta[i][prev_t, current_t] = current_beta / norm_factors[i]
  return beta



In [None]:
def compute_expected_counts(sentence,alphas, betas, trans_p, out_p):
  """Accumulate the expected transition and emission counts of one sentence.

  Only tag triples that occur as keys of the trigram table ``trans_p[3]`` are
  considered, and only where both the forward value of (prev, cur) and the
  backward value of (cur, next) are present and non-zero.

  Args:
    sentence: list of words.
    alphas: forward values from ``compute_alphas``.
    betas: backward values from ``compute_betas``.
    trans_p: ``(p0, p1, p2, p3)`` smoothed transition tables.
    out_p: ``(p0, p1, p2)`` smoothed emission tables.

  Returns:
    (c_yss, c_ss, c_s): counts keyed by (prev_tag, cur_tag, next_tag, word),
    by (prev_tag, cur_tag, next_tag) and by (prev_tag, cur_tag).
  """
  # placeholder in front so that the word at position t is sent[t]
  sent = ['S'] + sentence

  p0_t,p1_t,p2_t,p3_t = trans_p
  p0_e,p1_e,p2_e = out_p

  c_yss = dict()
  c_ss = dict()
  c_s = dict()

  for i in range(len(sentence)):
    # The word emitted when moving to next_t. The emission probability below is
    # taken for this word, so the count has to be stored under it as well
    # (it used to be stored under sent[i], the word one position earlier).
    word = sent[i + 1]
    for next_t, prev_t, cur_t in p3_t.keys():
      alpha = alphas[i].get((prev_t, cur_t))
      if not alpha:
        continue
      beta = betas[i + 1].get((cur_t, next_t))
      if not beta:
        continue

      trans_prob = p3_t.get((next_t, prev_t, cur_t), 0) + p2_t.get((next_t, cur_t), 0) + p1_t.get(next_t,0) + p0_t

      emm_prob = p2_e.get((word, cur_t, next_t), 0) + p1_e.get(word, 0) + p0_e

      inc = alpha * trans_prob * emm_prob * beta

      c_yss[(prev_t, cur_t, next_t, word)] = c_yss.get((prev_t, cur_t, next_t, word), 0) + inc
      c_ss[(prev_t, cur_t, next_t)] = c_ss.get((prev_t, cur_t, next_t), 0) + inc
      c_s[(prev_t, cur_t)] = c_s.get((prev_t, cur_t), 0) + inc

  return c_yss, c_ss, c_s


In [None]:
# Baum-Welch algorithm for unsupervised probability estimation
def baum_welch(trans_p, out_p, unsupervised_data, taglist, max_epochs=1):
  """Re-estimate the trigram transition and emission tables on untagged text.

  Args:
    trans_p: ``(p0, p1, p2, p3)`` transition tables; ``trans_p[3]`` is updated
      in place.
    out_p: ``(p0, p1, p2)`` emission tables; ``out_p[2]`` is updated in place.
    unsupervised_data: list of sentences, each a list of words.
    taglist: candidate tags.
    max_epochs: maximum number of passes over the data; the default of 1
      matches the previously hard-coded single pass.

  Returns:
    the same ``trans_p`` and ``out_p`` objects that were passed in.
  """
  for epoch in range(max_epochs):

    # Initialize accumulated counts
    total_c_yss = dict()
    total_c_ss = dict()
    total_c_s = dict()

    log_likelihood = 0
    for i in range(len(unsupervised_data)):
      if i % 100 == 0:
        print(f'processed sentences: {i} out of {len(unsupervised_data)}')

      sentence = unsupervised_data[i]

      # compute alphas, betas
      alphas, norm_factors = compute_alphas(sentence, trans_p, out_p, taglist)
      # The alphas are normalised at every position, so their sum carries no
      # information; the sentence likelihood is the product of the normalisers.
      log_likelihood += np.sum(np.log(norm_factors))

      betas = compute_betas(sentence, trans_p, out_p, taglist, norm_factors)

      # compute probability updates
      c_yss, c_ss, c_s = compute_expected_counts(sentence, alphas, betas, trans_p, out_p)

      for k in c_yss:
          total_c_yss[k] = total_c_yss.get(k, 0) + c_yss[k]
      for k in c_ss:
          total_c_ss[k] = total_c_ss.get(k, 0) + c_ss[k]
      for k in c_s:
          total_c_s[k] = total_c_s.get(k, 0) + c_s[k]

    biggest_change = 0
    # update transition prob
    for (prev_t, cur_t, next_t) in total_c_ss:
      new_trans_prob = total_c_ss[(prev_t, cur_t, next_t)] / total_c_s[(prev_t, cur_t)]

      trans_diff = abs(trans_p[3].get((next_t, prev_t, cur_t), 0) - new_trans_prob)
      trans_p[3][(next_t, prev_t, cur_t)] = new_trans_prob

      if trans_diff > biggest_change:
        biggest_change = trans_diff

    # update emission prob according to slide 189
    for (prev_t, cur_t, next_t, word) in total_c_yss:
      new_emm_prob = total_c_yss[(prev_t, cur_t, next_t, word)] / total_c_ss[(prev_t, cur_t, next_t)]
      emm_diff = abs(out_p[2].get((word, cur_t, next_t), 0) - new_emm_prob)
      out_p[2][(word, cur_t, next_t)] = new_emm_prob

      if emm_diff > biggest_change:
        biggest_change = emm_diff

    # report the epoch number (the sentence index `i` was printed here before)
    print(f'epoch: {epoch + 1}, biggest_change:{biggest_change}')
    print(f'log likelihood: {log_likelihood}')

    if biggest_change < 10**(-6):
      print('Convergence')
      break
  return trans_p, out_p


In [None]:
def unsupervised_baum_welch(data):
  """Bootstrap an HMM tagger on 10k tagged tokens, refine it with Baum-Welch.

  The last 40k tokens are the test set, the 20k tokens before them the
  held-out set. Of the remaining tokens the first 10k are used with their
  tags for the initial estimate; the rest is used as untagged text for
  ``baum_welch``. The re-estimated tables are smoothed again, pickled and
  evaluated with ``viterbi``.

  Args:
    data: flat list of (word, tag) pairs.

  Returns:
    tagging accuracy on the test sentences (nothing was returned before).
  """
  # split data according to assignment
  S,H,T = data[-40000:], data[-60000: -40000], data[:-60000]

  S_sentences = split_to_sentences(S)
  H_sentences = split_to_sentences(H)

  supervised_T = split_to_sentences(T[:10000])

  # remove tags
  unsupervised_T = [
      [word for word, tag in sentence]
      for sentence in split_to_sentences(T[10000:])
  ]

  print(f'supervised training: ----------------- \n')
  # transition probs
  trans_prob = compute_probs_n_grams(supervised_T)
  print(f'EM for transition probabilities -----------')
  lambdas_trans = em_for_tags(trans_prob, H_sentences)
  trans_smoothed_prob = smooth_prob(trans_prob, lambdas_trans)

  # emission probs
  out_prob = get_out_prob(supervised_T)
  print(f'EM for emission probabilities ------------\n')
  lamdas_out = em_for_emmission(out_prob, H_sentences)
  out_smoothed_prob = smooth_prob(out_prob, lamdas_out)

  print(f'running baum-welch: -----------------')
  # sorted so that the tag order (and with it all tie-breaking) is reproducible
  taglist = sorted(set([x[1] for x in T]))

  print('Running baum welch \n')
  new_trans_p, new_out_p = baum_welch(trans_smoothed_prob, out_smoothed_prob, unsupervised_T, taglist)

  # smooth the resulting distributions
  print(f'smoothing obtained distributions \n')
  lambdas_trans = em_for_tags(new_trans_p, H_sentences)
  new_trans_smoothed_prob = smooth_prob(new_trans_p, lambdas_trans)

  lamdas_out = em_for_emmission(new_out_p, H_sentences)
  new_out_smoothed_prob = smooth_prob(new_out_p, lamdas_out)

  # NOTE: the file names say "EN" regardless of which corpus was passed in
  with open('sample_data/baum_welch_EN_trans.pkl', 'wb') as f:
    print('save')
    pickle.dump(new_trans_smoothed_prob, f)

  with open('sample_data/baum_welch_EN_out.pkl', 'wb') as f:
    pickle.dump(new_out_smoothed_prob,f)

  print('Running viterbi ------------- \n')
  # evaluate the re-estimated and re-smoothed tables; the pre-Baum-Welch
  # variables were passed here before
  total_acc = viterbi(new_trans_smoothed_prob, new_out_smoothed_prob, S_sentences, taglist)
  return total_acc


In [None]:
# This cell ran for about 9 hours; only about half of the sentences had been
# processed when the session crashed, so the output below is incomplete.
unsupervised_baum_welch(dataEN)

supervised training: ----------------- 

EM for transition probabilities -----------
number of epochs :  77 lambda coefs:  [0.04462865 0.03952366 0.62552398 0.29032371]
EM for emission probabilities ------------

number of epochs :  6 lambda coefs:  [4.94386421e-01 3.08858097e-07 5.05613271e-01]
running baum-welch: -----------------
Running baum welch 

processed sentences: 0 out of 7166
processed sentences: 100 out of 7166
processed sentences: 200 out of 7166
processed sentences: 300 out of 7166
processed sentences: 400 out of 7166
processed sentences: 500 out of 7166
processed sentences: 600 out of 7166
processed sentences: 700 out of 7166
processed sentences: 800 out of 7166
processed sentences: 900 out of 7166
processed sentences: 1000 out of 7166
processed sentences: 1100 out of 7166
processed sentences: 1200 out of 7166
processed sentences: 1300 out of 7166
processed sentences: 1400 out of 7166
processed sentences: 1500 out of 7166
processed sentences: 1600 out of 7166
processed 