Preprocessing
- Decapitalise everything. Either:
    - lowercase every token after a '.' token or
    - lowercase everything
    - I go with 2nd option because there are other punctuation marks (some of which are quotes) and I don't want to do SBD
    - I need to distinguish between UNKNOWNs and NAMEs which have tag Propernoun

The plan:

- Write script to list out all tokens with less than 0.0001N counts
- Write script to replace all tokens with less than 0.0001N counts with <UNKNOWN>
- Separate the sentences
- Get the counts (according to slide 14/39). Sum over each sentence
    - Transition count of each state pair. Emission count of each state-token pair.
- Estimate original values using slide 14/39
** Separate the sentences into 90-10 split for training and evaluation
- Online EM learn
    - Stepwise EM or whatever the fuck it's called
    - μ=(1−ηk​)μ+ηk​μ′
    - ηk is simply a step statistic: 1/(k+1)^a for iteration k
    - I guess we set alpha to 0.7
** Viterbi EM may not even be necessary? But we can implement it afterwards if it takes too long

In [None]:
import pandas as pd

# Path to your CoNLL-U file
file_path = 'ptb-train.conllu'

# Initialize list to store rows, sentence counter, and token counter
data = []
sentence_id = 0
token_id = 0

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:  # Empty line indicates new sentence
            sentence_id += 1
            token_id = 0  # Reset token counter for the new sentence
        else:
            token_id += 1  # Increment token id for each token in a sentence
            parts = line.split('\t')
            if len(parts) == 10:
                data.append([sentence_id] + parts)

# Create a DataFrame
df = pd.DataFrame(data, columns=['sentence_id', 'id', 'form', 'blank', 'upos', 'xpos', 'blank', 'head', 'deprel', 'deps', 'blank'])

# Save the DataFrame to a CSV file
csv_path = 'ptb-train.csv'
df.to_csv(csv_path, index=False)

print(f"Data saved to {csv_path}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('ptb-train.csv')  
column_name = 'form'
forms = df[column_name]

# Calculate frequency distribution of words
frequency = forms.value_counts()

# Plot the frequency distribution
plt.figure(figsize=(10, 6))
frequency.plot(kind='line', logy=True)  # log scale for better visibility
plt.title('Frequency Distribution of Words')
plt.xlabel('Words')
plt.ylabel('Frequency (log scale)')
plt.grid(True)
plt.show()

# Output the frequency distribution
print(frequency)

# Check the result or save the updated DataFrame
print(df.head())  # Prints the first few rows of the updated DataFrame
# df.to_csv('updated_file.csv', index=False)  # Uncomment to save the updated DataFrame


In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('ptb-train.csv', keep_default_na=False, na_values=[''])
column_name = 'form'
upos_column = 'upos'

# Lowercase everything
df[column_name] = df[column_name].str.lower()

# Calculate frequency distribution of words
counts = df[column_name].value_counts()

# Apply 'UNKNOWN' or 'NAME' based on frequency and whether the word is a proper noun
threshold = 4
df[column_name] = df.apply(
    lambda row: 'NUM' if row[upos_column] == 'NUM' else
                ('NAME' if row[upos_column] == 'PROPN' else 
                ('UNKNOWN' if counts.get(row[column_name], 0) < threshold else row[column_name])),
    axis=1
)


output = f'ptb-train-{threshold}-all-lower.csv'
# Save to new csv
df.to_csv(output, index=False)
print(f"Updated DataFrame (all lowercase) saved to {output}")


In [1]:
import pandas as pd

def process_sentences(df: pd.DataFrame, id_col = 'id', forms_col = 'form', pos_tags_col = 'upos'):
    output = []
    current_sentence = []
    current_pos = []

    for index, row in df.iterrows():
        if row[id_col] == 1 and current_sentence:
            output.append((current_sentence, current_pos))
            current_sentence, current_pos = [row[forms_col]], [row[pos_tags_col]]
        else:
            current_sentence.append(row[forms_col])
            current_pos.append(row[pos_tags_col])

    # Append the last sentence if it's not empty
    if current_sentence:
        output.append((current_sentence, current_pos))

    return output


def write_sentences_to_file(sentences, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for sentence, pos_tags in sentences:
            # Writing words and POS tags on separate lines
            file.write('Words: ' + ' '.join(sentence) + '\n')
            file.write('Tags: ' + ' '.join(pos_tags) + '\n')
            file.write('\n')  # Adding a blank line between sentences for clarity

df = pd.read_csv('ptb-train-4-all-lower.csv')
processed = process_sentences(df)
write_sentences_to_file(processed, 'output_sentences.txt')



In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('ptb-train-4-all-lower.csv')
processed_sentences, processed_tags = [x[0] for x in processed], [x[1] for x in processed]

unique_words = sorted(df['form'].unique())
unique_words_dict = {w: i for (i, w) in enumerate(unique_words)}
unique_upos = sorted(df['upos'].unique())
unique_upos_dict = {t: i for (i, t) in enumerate(unique_upos)}

M = len(unique_words)
N = len(unique_upos)

print(unique_words)
print(unique_upos)


def make_transition_matrix(epsilon=1):

    transition_matrix = np.zeros((N, N))
    transition_counts = {key: {k: 0 for k in unique_upos_dict.keys()} for key in unique_upos_dict.keys()}

    for sequence in processed_tags:
        for i in range(len(sequence) - 1):
            current_tag = sequence[i]
            next_tag = sequence[i + 1]
            if current_tag in unique_upos_dict and next_tag in unique_upos_dict:
                transition_counts[current_tag][next_tag] += 1

    for a in unique_upos_dict:
        total_transitions_from_a = sum(transition_counts[a].values()) + epsilon * N

        for b in unique_upos_dict:
            transition_matrix[unique_upos_dict[a], unique_upos_dict[b]] = (transition_counts[a][b] + epsilon) / total_transitions_from_a

    print(transition_matrix)
    return transition_matrix

def make_emission_matrix(epsilon = 1):

    emission_matrix = np.zeros((N, M))
    emission_counts = {key: {tag: 0 for tag in unique_words_dict.keys()} for key in unique_upos_dict.keys()}

    for j in range(len(processed_tags)):
        sentence = processed_sentences[j]
        sequence = processed_tags[j]
        for i in range(len(sequence)):
            current_tag = sequence[i]
            current_emission = sentence[i]
            if current_tag in unique_upos_dict and current_emission in unique_words_dict:
                emission_counts[current_tag][current_emission] += 1

    for a in unique_upos_dict:
        total_emissions_from_a = sum(emission_counts[a].values()) + epsilon * M
        for b in unique_words_dict:
            emission_matrix[unique_upos_dict[a], unique_words_dict[b]] = (emission_counts[a][b] + epsilon) / total_emissions_from_a

    print(emission_matrix)
    return emission_matrix

def make_initial(epsilon=0):
    initial_probabilities = np.zeros(len(unique_upos_dict))
    initial_counts = {tag: 0 for tag in unique_upos_dict.keys()}
    total_sentences = len(processed_tags)

    for tags in processed_tags:
        if tags:  # Check it exists
            initial_tag = tags[0]
            if initial_tag in initial_counts:
                initial_counts[initial_tag] += 1

    # Convert counts to probabilities
    for tag, index in unique_upos_dict.items():
        initial_probabilities[index] = (initial_counts[tag] + epsilon) / total_sentences

    return initial_probabilities

t_matrix, e_matrix, initial = make_transition_matrix(), make_emission_matrix(), make_initial()


In [None]:
print(t_matrix[unique_upos_dict['NOUN']][unique_upos_dict['ADV']])
# NOUN -> ADV 0.02721837801995658

In [46]:
print(t_matrix.shape)
print(e_matrix.shape)

(17, 17)
(17, 9962)


In [7]:
# OUTDATED IMPLEMENTATIONS, NOT IN LOG SPACE (and not vectorised)

# # No end probabilities, so I just don't calculate them
# def forward(tags, word_sequence, initial, words_dict, transition, emission):
#     # node values stored during forward algorithm
#     node_values_fwd = np.zeros((len(tags), len(word_sequence)))

#     # i is index of observed sequence, k is across hidden states
#     for i, word in enumerate(word_sequence):
#         for j in range(len(tags)):
#             # if first sequence value
#             if (i == 0):
#                 node_values_fwd[j, i] = initial[j] * emission[j, words_dict[word]]
#             else:
#                 values = [node_values_fwd[k, i - 1] * emission[j, words_dict[word]] 
#                           * transition[k, j] for k in range(len(tags))]
#                 node_values_fwd[j, i] = sum(values)

#     forward_val = sum(node_values_fwd[:, -1])
#     return node_values_fwd, forward_val


# # No end probabilities, so I assume the prob of landing on any hidden state last is 1
# def backward(tags, word_sequence, initial, words_dict, transition, emission):
#     # node values stored during forward algorithm
#     backward_vals = np.zeros((len(tags), len(word_sequence)))

#     #for i, sequence_val in enumerate(test_sequence):
#     for i in range(1,len(word_sequence)+1):
#         for j in range(len(tags)):
#             # if first sequence value then do this
#             if (-i == -1):
#                 backward_vals[j, -i] = 1
#             else:
#                 values = [backward_vals[k, -i+1] * emission[k, words_dict[word_sequence[-i+1]]] * transition[j, k] for k in range(len(tags))]
#                 backward_vals[j, -i] = sum(values)

#     start_state = [backward_vals[m,0] * emission[m, words_dict[word_sequence[0]]] for m in range(len(tags))]
#     start_state = np.multiply(start_state, initial)
#     backward_val = sum(start_state)
#     return backward_vals, backward_val


# #function to find si probabilities
# def si_probs(states, word_sequence, forward, backward, forward_val, words_dict, transition, emission):

#     si_probabilities = np.zeros((len(states), len(word_sequence)-1, len(states)))

#     # i is observed state index
#     # We are going from hidden state indexes j to k, at times i to i+1
#     for i in range(len(word_sequence)-1):
#         for j in range(len(states)):
#             for k in range(len(states)):
#                 si_probabilities[j,i,k] = ( forward[j,i] * backward[k,i+1] * transition[j,k] * emission[k,words_dict[word_sequence[i+1]]] ) / forward_val
#     return si_probabilities

# #function to find gamma probabilities
# # forward_val 
# def gamma_probs(tags, test_sequence, forward, backward, forward_val):

#     gamma_probabilities = np.zeros((len(tags), len(test_sequence)))

#     for i in range(len(test_sequence)):
#         for j in range(len(tags)):
#             gamma_probabilities[j, i] = (forward[j, i] * backward[j, i]) / forward_val

#     return gamma_probabilities


# def baum(target_sequence, transition, emission, initial, tags, tags_dict, words, words_dict):

#     fwd_probs, fwd_val = forward(tags, target_sequence, initial, words_dict, transition, emission)
#     bwd_probs, bwd_val = backward(tags, target_sequence, initial, words_dict, transition, emission)
#     si_probabilities = si_probs(tags, target_sequence, fwd_probs, bwd_probs, fwd_val, words_dict, transition, emission)
#     gamma_probabilities = gamma_probs(tags, target_sequence, fwd_probs, bwd_probs, fwd_val)
#     word_indices = np.array([words_dict[word] for word in target_sequence])

#     #caclculating 'a' and 'b' matrices
#     a = np.zeros((len(tags), len(tags)))
#     b = np.zeros((len(tags), len(words_dict)))

#     #'a' matrix
#     for j in range(len(tags)):
#         for i in range(len(tags)):
#             for t in range(len(target_sequence)-1):
#                 a[j,i] = a[j,i] + si_probabilities[j,t,i]

#             denom_a = [si_probabilities[j, t_x, i_x] for t_x in range(len(target_sequence) - 1) for i_x in range(len(tags))]
#             denom_a = sum(denom_a)

#             if (denom_a == 0):
#                 a[j,i] = 0
#             else:
#                 a[j,i] = a[j,i]/denom_a


#     #'b' matrix
#     for j in range(len(tags)):
#         for i in range(len(words)): 
#             indices = [idx for idx, val in enumerate(target_sequence) if val == words[i]]
#             numerator_b = sum( gamma_probabilities[j,indices] )
#             denomenator_b = sum( gamma_probabilities[j,:] )

#             if (denomenator_b == 0):
#                 b[j,i] = 0
#             else:
#                 b[j, i] = numerator_b / denomenator_b


#     print('\nMatrix a:\n')
#     print(np.matrix(a.round(decimals=4)))
#     print('\nMatrix b:\n')
#     print(np.matrix(b.round(decimals=4)))

#     return a, b

In [42]:
from scipy.special import logsumexp
import random

import cProfile

# Transition and Emission are already log
# def forward_log_vec(num_tags, num_words, log_initial, log_transition, log_emission_sentence):
#     # num_words = len(word_indices)
#     log_probs = np.full((num_tags, num_words), -np.inf)  # log(0) = -inf
#     scaling_factors = np.zeros(num_words)
    
#     log_probs[:, 0] = log_initial + log_emission_sentence[:, 0]
#     # NORMALISE
#     # factor0 = logsumexp(log_probs[:, 0])
#     # log_probs[:, 0] -= factor0
#     # scaling_factors[0] = factor0

#     for t in range(1, num_words):
#         log_probs[:, t] = logsumexp(log_probs[:, t-1].reshape(-1, 1) + log_transition, axis=0) + log_emission_sentence[:, t]
#         print(f"{log_probs[:,t]=}")
#         # NORMALISE
#         # factor = logsumexp(log_probs[:, t])
#         # log_probs[:, t] -= factor
#         # scaling_factors[t] = factor
#         # print(f"{np.exp(factor)=}")


#     forward_val = logsumexp(log_probs[:, -1])
#     # forward_val = np.sum(scaling_factors)
#     print(f"{np.exp(forward_val)=}")

#     # print(f"{log_probs=}")
#     return log_probs, forward_val

def forward_log_vec(num_tags, num_words, log_initial, log_transition, log_emission_sentence):
    forward_vals = np.full((num_tags, num_words), -np.inf)
    scaling_factors = np.zeros(num_words)

    forward_vals[:, 0] = log_initial + log_emission_sentence[:, 0]
    # NORMALISE
    factor0 = logsumexp(forward_vals[:, 0])
    forward_vals[:, 0] -= factor0
    scaling_factors[0] = factor0
    
    for t in range(1, num_words):
        # Compute log probabilities
        forward_vals[:, t] = logsumexp(forward_vals[:, t-1][:, np.newaxis] + log_transition, axis=0) + log_emission_sentence[:, t]
        # NORMALISE
        factor = logsumexp(forward_vals[:, t])
        forward_vals[:, t] -= factor
        scaling_factors[t] = factor

        # print(f"{np.sum(np.exp(forward_vals[:, t]))=}")
    # print(f"{forward_vals}")
    
    return forward_vals, -np.sum(scaling_factors)

# Implementation using scaling factors from forward algorithm
# def backward_log_vec(num_tags, num_words, log_transition, log_emission_sentence, scaling_factors):
#     # num_words = len(word_indices)
#     backward_vals = np.zeros(num_tags, num_words)
    
#     # Last column, deduct scaling factor
#     backward_vals[:, -1] -= scaling_factors[-1]
    
#     for t in range(num_words-2, -1, -1):
#         backward_vals[:, t] = logsumexp(backward_vals[:, t + 1][:, np.newaxis] + log_transition, axis=1) + log_emission_sentence[:, t + 1]
#         # NORMALISE
#         backward_vals[:, t] -= scaling_factors[:, t]
#     # print(scaling_factors)

#     return backward_vals

def backward_log_vec(num_tags, num_words, log_transition, log_emission_sentence):
    # num_words = len(word_indices)
    backward_vals = np.full((num_tags, num_words), -np.inf)
    # Last column, deduct scaling factor
    backward_vals[:, -1] = 0
    backward_vals[:, -1] -= logsumexp(backward_vals[:, -1])
    for t in range(num_words-2, -1, -1):
        backward_vals[:, t] = logsumexp(backward_vals[:, t + 1][:, np.newaxis] + log_transition, axis=1) + log_emission_sentence[:, t + 1]
        # NORMALISE
        backward_vals[:, t] -= logsumexp(backward_vals[:, t])
    # print(f"{backward_vals=}")

    return backward_vals


def log_si_probs_vec(log_forward, log_backward, log_forward_val, log_transition, log_emission_sentences):

    return (log_forward[:, :-1, np.newaxis] +
        log_backward[:, 1:].T[np.newaxis, :] +
        log_transition[:, np.newaxis, :] + 
        log_emission_sentences[:, 1:].T[np.newaxis, :]
        - log_forward_val
    )

    # if np.any(np.isnan(f)):
    #     print("log_si_probs NAAN detected")
    #     print("Forward:", log_forward)
    #     print("Forward shape:", log_forward.shape)
    #     print("Backward:", log_backward)
    #     print("Transition:", log_transition)
    #     print("Emission Sentences:", log_emission_sentences)
    #     print("log_forward_val:", log_forward_val)

    # if np.any(np.isneginf(f)):
    #     print("log_si_probs -inf detected")
    #     print("Forward:", log_forward)
    #     print("Backward:", log_backward)
    #     print("Transition:", log_transition)
    #     print("Emission Sentences:", log_emission_sentences)
    #     print("log_forward_val:", log_forward_val)

    # print(f"log_si_probs = {f}")
    
    # return f

# def log_si_probs_vec(num_tags, word_indices, log_forward, log_backward, log_forward_val, transition, log_emission_sentences):
#     assert np.all(log_forward < 0)
#     assert np.all(log_backward < 0)
#     assert np.all(transition < 0)
#     assert np.all(log_emission_sentences < 0)
#     print(f"{log_forward_val=}")
    
    
#     si_probabilities = np.full((num_tags, len(word_indices)-1, num_tags), -np.inf)

#     for i in range(len(word_indices)-1):
#         for j in range(num_tags):
#             for k in range(num_tags):
#                 si_probabilities[j, i, k] = (
#                     log_forward[j, i] +
#                     log_backward[k, i+1] +
#                     transition[j, k] +
#                     log_emission_sentences[k, i+1] -
#                     log_forward_val
#                 )
#     print(f"{si_probabilities=}")
#     return si_probabilities



# def baum_log(target_sentence, log_transition, log_emission, log_initial, tags, words_dict):

#     word_indices = np.array([words_dict[word] for word in target_sentence])
#     num_tags = len(tags)
#     num_target_words = len(word_indices)
#     num_words = len(words_dict)
#     # print(f'{log_emission.shape=}')
#     # print(f'{type(log_emission)=}')
#     # print(word_indices)
#     # print(f'{log_emission[:, np.array(word_indices)]=}')
#     log_emission_sentence = log_emission[:, word_indices]
#     # print(f'{log_emission_sentence[:, 0].shape=}')
#     # print(f'{log_initial.shape=}')

#     # print(target_sentence)
#     log_fwd_probs, log_fwd_val = forward_log_vec(num_tags, num_target_words, log_initial, log_transition, log_emission_sentence)
#     log_bwd_probs = backward_log_vec(num_tags, num_target_words, log_transition, log_emission_sentence)

#     # log_si_probabilities = log_si_probs_vec(num_tags, word_indices, log_fwd_probs, log_bwd_probs, log_fwd_val, log_transition, log_emission_sentence)
#     log_si_probabilities = log_si_probs_vec(log_fwd_probs, log_bwd_probs, log_fwd_val, log_transition, log_emission_sentence)

#     # No more function for gamma, too simple
#     log_gamma_probabilities = log_fwd_probs + log_bwd_probs - log_fwd_val

#     # a matrix
#     # print(f'{log_si_probabilities.reshape(num_tags, -1).shape=}')
#     # print(f'{log_si_probabilities.shape=}')
#     a1 = logsumexp(log_si_probabilities, axis=1)
#     a2 = logsumexp(log_si_probabilities, axis=(1, 2))[:, np.newaxis]
#     log_a = a1 - a2
#     if np.any(np.isnan(log_a)):
#         print("log_a NAAN detected")
#         print("Expected transitions from state i to j:", a1)
#         print("Expected total transitions from state i:", a2)
#         # print("FORWARD:", copy_fwd)
#         # print("FORWARD SHAPE:", copy_fwd.shape)
#         # print("BACKWARD:", copy_bwd)
#         print("TARGET SENTENCE", target_sentence)
#         print("SI:", log_si_probabilities)
#         print("Old Transition Matrix:", log_transition)

#     # b matrix
#     b_denominator_logs = logsumexp(log_gamma_probabilities, axis=1, keepdims=True)  # Shape: (num_states, 1)
#     # Pray we don't need to handle cases where denominator is -inf
#     assert np.all(~np.isneginf(b_denominator_logs)), "Error: denominator contains -inf values."
#     unique_word_indices, inverse_indices = np.unique(word_indices, return_inverse=True)
#     num_unique_words = len(unique_word_indices)
#     b_numerator_logs = np.full((num_tags, num_unique_words), -np.inf)
#     # np.add.at(numerator_logs, (slice(None), inverse_indices), log_gamma_probabilities)
    # for idx in range(num_unique_words):
    #     mask = (inverse_indices == idx)
    #     b_numerator_logs[:, idx] = logsumexp(log_gamma_probabilities[:, mask], axis=1)

    # log_b = np.full((num_tags, num_words), -np.inf)
    # log_b[:, unique_word_indices] = b_numerator_logs - b_denominator_logs

    # cem[:, unique_ind] = logsumexp([cem[:, unique_ind], current_b_num_logs], axis=)
    

    # print(f"{log_a=}")
    # print(f"{log_b=}")
    # print(f"{log_b[:, unique_word_indices]=}")
    
    # return log_a, log_b

# def baum_welch_stepwise_loggers(sentences, initial_transition, initial_emission, initial_prob, tags, words_dict, alpha=0.7, max_iterations=1000, log_convergence_threshold=np.log(0.01), batch_size=100):
#     log_transition = np.log(initial_transition)
#     log_emission = np.log(initial_emission)
#     log_initial = np.log(initial_prob)

#     assert np.all(~np.isneginf(log_transition)), "Error: transition contains -inf values."
#     assert np.all(~np.isneginf(log_emission)), "Error: emission contains -inf values."
#     assert np.all(~np.isneginf(log_initial)), "Error: initial contains -inf values."

#     iteration = 0
#     converged = False
#     num_sentences = len(sentences)

#     while iteration < max_iterations and not converged:
#         np.random.shuffle(sentences)
#         print(f'Iteration: {iteration}')

#         for i in range(0, num_sentences, batch_size):
#             batch = sentences[i:i + batch_size]
#             batch_log_a = []
#             batch_log_b = []

#             for sentence in batch:
#                 log_a, log_b = baum_log(sentence, log_transition, log_emission, log_initial, tags, words_dict)
#                 batch_log_a.append(log_a)
#                 batch_log_b.append(log_b)

#             # Aggregate updates for the batch
#             log_af = logsumexp(batch_log_a, axis=0) - np.log(len(batch_log_a))
#             log_bf = logsumexp(batch_log_b, axis=0) - np.log(len(batch_log_b))

#             # print(f"{len(batch_log_a)=}")
#             # print(f"{len(batch_log_b)=}")

#             # iteration + 2 because (1 - learning_rate) is 0 when iteration is 0
#             learning_rate = 1 / ((iteration + 2) ** alpha)
#             lr0, lr1 = np.log(learning_rate), np.log(1 - learning_rate)

#             # Immediate EM update per batch using weighted average in log space
#             log_transition = logsumexp([log_transition + lr1, log_af + lr0], axis=0)
#             log_emission = logsumexp([log_emission + lr1, log_bf + lr0], axis=0)

#         prev_transition = np.copy(log_transition) if iteration == 0 else prev_transition
#         prev_emission = np.copy(log_emission) if iteration == 0 else prev_emission

#         # Check convergence
#         if np.max(np.abs(log_transition - prev_transition)) < log_convergence_threshold and \
#            np.max(np.abs(log_emission - prev_emission)) < log_convergence_threshold:
#             converged = True

#         # DEBUGGING PURPOSES: STOP AT 1 ITERATION
#         # if iteration == 1:
#         #     converged = True

#         prev_transition = np.copy(log_transition)
#         prev_emission = np.copy(log_emission)
#         # if iteration % 10 == 0:
#         if iteration in range(10):
#             print(f"{prev_emission=}")
#             print(f"{prev_transition=}")
#         iteration += 1

#     # Convert back to probabilities if needed for interpretation
#     transition = np.exp(log_transition)
#     emission = np.exp(log_emission)

#     return transition, emission

def baum_log(target_sentence, log_transition, log_emission, log_initial, tags, words_dict):

    word_indices = np.array([words_dict[word] for word in target_sentence])
    num_tags = len(tags)
    num_target_words = len(word_indices)
    num_words = len(words_dict)
    # print(f'{log_emission.shape=}')
    # print(f'{type(log_emission)=}')
    # print(word_indices)
    # print(f'{log_emission[:, np.array(word_indices)]=}')
    log_emission_sentence = log_emission[:, word_indices]
    # print(f'{log_emission_sentence[:, 0].shape=}')
    # print(f'{log_initial.shape=}')

    # print(target_sentence)
    log_fwd_probs, log_fwd_val = forward_log_vec(num_tags, num_target_words, log_initial, log_transition, log_emission_sentence)
    log_bwd_probs = backward_log_vec(num_tags, num_target_words, log_transition, log_emission_sentence)
    
    # log_si_probabilities = log_si_probs_vec(num_tags, word_indices, log_fwd_probs, log_bwd_probs, log_fwd_val, log_transition, log_emission_sentence)
    log_si_probabilities = log_si_probs_vec(log_fwd_probs, log_bwd_probs, log_fwd_val, log_transition, log_emission_sentence)

    # No more function for gamma, too simple
    log_gamma_probabilities = log_fwd_probs + log_bwd_probs - log_fwd_val

    a1 = logsumexp(log_si_probabilities, axis=1)
    a2 = logsumexp(log_si_probabilities, axis=(1, 2))[:, np.newaxis]
    b2 = logsumexp(log_gamma_probabilities, axis=1, keepdims=True)
    # assert np.all(~np.isneginf(b2)), "Error: denominator contains -inf values."

    unique_word_indices, inverse_indices = np.unique(word_indices, return_inverse=True)
    num_unique_words = len(unique_word_indices)
    btemp = np.full((num_tags, num_unique_words), -np.inf)
    # np.add.at(numerator_logs, (slice(None), inverse_indices), log_gamma_probabilities)
    for idx in range(num_unique_words):
        mask = (inverse_indices == idx)
        btemp[:, idx] = logsumexp(log_gamma_probabilities[:, mask], axis=1)
    b1 = np.full((num_tags, num_words), -np.inf)
    b1[:, unique_word_indices] = btemp

    return a1, a2, b1, b2


def baum_welch_stepwise_loggers(sentences, initial_transition, initial_emission, initial_prob, tags, words_dict, alpha=0.9, max_iterations=1000, log_convergence_threshold=np.log(0.01), batch_size=400):
    log_transition = np.log(initial_transition)
    log_emission = np.log(initial_emission)
    log_initial = np.log(initial_prob)

    # assert np.all(~np.isneginf(log_transition)), "Error: transition contains -inf values."
    # assert np.all(~np.isneginf(log_emission)), "Error: emission contains -inf values."
    # assert np.all(~np.isneginf(log_initial)), "Error: initial contains -inf values."

    iteration = 0
    converged = False
    num_sentences = len(sentences)

    while iteration < max_iterations and not converged:
        np.random.shuffle(sentences)
        print(f'Iteration: {iteration}')

        for i in range(0, num_sentences, batch_size):
            batch = sentences[i:i + batch_size]
            batch_log_a1 = []
            batch_log_a2 = []
            batch_log_b1 = []
            batch_log_b2 = []

            for sentence in batch:
                a1, a2, b1, b2 = baum_log(sentence, log_transition, log_emission, log_initial, tags, words_dict)
                batch_log_a1.append(a1)
                batch_log_a2.append(a2)
                batch_log_b1.append(b1)
                batch_log_b2.append(b2)

            batch_log_a1 = np.array(batch_log_a1)
            batch_log_a2 = np.array(batch_log_a2)
            batch_log_b1 = np.array(batch_log_b1)
            batch_log_b2 = np.array(batch_log_b2)

            # Aggregate updates for the batch
            log_af = logsumexp(batch_log_a1, axis=0) - logsumexp(batch_log_a2, axis=0)
            log_bf = logsumexp(batch_log_b1, axis=0) - logsumexp(batch_log_b2, axis=0)

            # print(f"{len(batch_log_a)=}")
            # print(f"{len(batch_log_b)=}")

            # iteration + 2 because (1 - learning_rate) is 0 when iteration is 0
            learning_rate = 1 / ((iteration + 2) ** alpha)
            lr0, lr1 = np.log(learning_rate), np.log(1 - learning_rate)

            # Immediate EM update per batch using weighted average in log space
            log_transition = logsumexp([log_transition + lr1, log_af + lr0], axis=0)
            log_emission = logsumexp([log_emission + lr1, log_bf + lr0], axis=0)

        prev_transition = np.copy(log_transition) if iteration == 0 else prev_transition
        prev_emission = np.copy(log_emission) if iteration == 0 else prev_emission

        # Check convergence
        if np.max(np.abs(log_transition - prev_transition)) < log_convergence_threshold and \
           np.max(np.abs(log_emission - prev_emission)) < log_convergence_threshold:
            converged = True

        # DEBUGGING PURPOSES: STOP AT 1 ITERATION
        # if iteration == 1:
        #     converged = True

        if iteration == 10:
            converged = True

        prev_transition = np.copy(log_transition)
        prev_emission = np.copy(log_emission)
        if iteration % 5 == 0:
        # if iteration in range(10):
            print(f"{prev_emission=}")
            print(f"{prev_transition=}")
        iteration += 1

    # Convert back to probabilities if needed for interpretation
    transition = np.exp(log_transition)
    emission = np.exp(log_emission)

    return transition, emission




In [43]:
# Filter out sentences of length 1
# Note: This is AFTER using them to make emission and initial matrices
bad_list = [i for i, p in enumerate(processed_sentences) if len(p) < 4]
[p for i, p in enumerate(processed_sentences) if i not in bad_list]
print(bad_list)
processed_sentences_cleaned = [p for i, p in enumerate(processed_sentences) if i not in bad_list]

transition, emission = baum_welch_stepwise_loggers(processed_sentences_cleaned, t_matrix, e_matrix, initial, unique_upos, unique_words_dict)
# cProfile.run("baum_welch_stepwise_loggers(random.sample(processed_sentences_cleaned, 4000), t_matrix, e_matrix, initial, unique_upos, unique_words_dict, batch_size=400)")

[200, 538, 665, 666, 667, 668, 901, 902, 1007, 1013, 1020, 1321, 1471, 1501, 1544, 1674, 1758, 1851, 1859, 2207, 2255, 2259, 3004, 3041, 3048, 3062, 3072, 3083, 3494, 3516, 3534, 3545, 3550, 3572, 3575, 3578, 3908, 4006, 4227, 4323, 4331, 4659, 4874, 4877, 4880, 4883, 4905, 4916, 5448, 5689, 6925, 7728, 7732, 7735, 7771, 8233, 8392, 8393, 8394, 8395, 8511, 8512, 8574, 8932, 8942, 8953, 9165, 9232, 9253, 9415, 9416, 9417, 9418, 9428, 9444, 9478, 9489, 9501, 9665, 9666, 9978, 10128, 10176, 10444, 10543, 10616, 10622, 10626, 10642, 10645, 12023, 12348, 12401, 12406, 12415, 12430, 12453, 12472, 12479, 12519, 13240, 13271, 13289, 13560, 13753, 13921, 13922, 13923, 14034, 14042, 14045, 14140, 14141, 14142, 14281, 14401, 14478, 14737, 15052, 15064, 15377, 15546, 15548, 15563, 15577, 15580, 15687, 16144, 16427, 16485, 16486, 16489, 16753, 16765, 17375, 17376, 17651, 17772, 17779, 17970, 17976, 17979, 18147, 18499, 18528, 18533, 18536, 18906, 18909, 18912, 18967, 18975, 18995, 19078, 19127, 192

In [None]:

print(list(zip(processed_sentences[:50], processed_tags[:50])))

In [44]:
# NUMERICAL SPACE
print(transition)
print(emission)


[[2.90460004e-20 4.16150215e-57 5.15169403e-53 1.23443056e-42
  1.27287751e-39 2.70207164e-61 1.09153523e-46 2.63703537e-52
  3.24845731e-01 1.87036759e-09 4.39104738e-51 1.45898360e-20
  6.75154267e-01 9.86844931e-58 7.21310322e-49 1.61121775e-51
  8.74150245e-41]
 [1.34993578e-03 1.72339443e-76 1.81058088e-70 8.08699417e-58
  3.07973444e-52 5.38083570e-76 2.30989349e-23 1.10144295e-71
  3.88082349e-01 1.84762681e-05 4.01063378e-65 1.63117878e-22
  6.10549239e-01 5.22326088e-76 6.01021939e-48 5.87799150e-60
  6.01079224e-13]
 [1.30827347e-03 9.46038972e-79 1.27184629e-72 1.86048767e-55
  1.37852035e-50 5.37299965e-80 3.95213422e-26 2.46389099e-74
  3.87890239e-01 2.28485029e-05 4.80801766e-67 1.31030759e-22
  6.10778639e-01 5.17275755e-77 2.11872512e-50 3.71441408e-61
  3.16845865e-14]
 [1.41518658e-03 4.31329252e-75 1.05907756e-68 1.80695129e-52
  6.72218106e-49 3.37425396e-77 6.03728979e-33 3.35194513e-69
  3.79162982e-01 1.03371888e-10 6.27677928e-63 2.42495284e-22
  6.19421832e-01

In [45]:
import numpy as np

def viterbi(observations, num_states, transition_prob, emission_prob):
    V = np.zeros((num_states, len(observations)))
    path = {}
    # first column of V is the transition prob from state 0
    V[:, 0] = transition_prob[0, :]

    for t in range(1, len(observations)):
        for s in range(1, num_states):
            prob = V[:, t - 1] * transition_prob[:, s] * emission_prob[s - 1, unique_words_dict[observations[t]]]
            V[s, t] = np.max(prob)
            path[s, t] = np.argmax(prob)

    optimal_path = []
    last_state = np.argmax(V[:, -1])
    optimal_path.append(last_state)

    print(f"{path=}")

    for t in range(len(observations) - 1, 1, -1):
        last_state = path[last_state, t]
        optimal_path.insert(0, last_state)

    optimal_path.insert(0, 0)

    return optimal_path

# Example usage
# unique_upos_dict
# unique_words_dict
log_transition = np.log(transition)
log_emission = np.log(emission)
sentence = processed_sentences[20]

# Generate tags
tagss = viterbi(sentence, len(unique_upos), transition, emission)
print(list(zip(sentence, tagss)))
# print(tags)
# print(len(tags))
# print(unique_upos_dict)

path={(1, 1): np.int64(8), (2, 1): np.int64(12), (3, 1): np.int64(12), (4, 1): np.int64(12), (5, 1): np.int64(8), (6, 1): np.int64(12), (7, 1): np.int64(8), (8, 1): np.int64(12), (9, 1): np.int64(8), (10, 1): np.int64(12), (11, 1): np.int64(12), (12, 1): np.int64(12), (13, 1): np.int64(12), (14, 1): np.int64(12), (15, 1): np.int64(12), (16, 1): np.int64(12), (1, 2): np.int64(8), (2, 2): np.int64(12), (3, 2): np.int64(12), (4, 2): np.int64(12), (5, 2): np.int64(12), (6, 2): np.int64(12), (7, 2): np.int64(8), (8, 2): np.int64(12), (9, 2): np.int64(8), (10, 2): np.int64(12), (11, 2): np.int64(12), (12, 2): np.int64(12), (13, 2): np.int64(12), (14, 2): np.int64(12), (15, 2): np.int64(12), (16, 2): np.int64(12), (1, 3): np.int64(8), (2, 3): np.int64(12), (3, 3): np.int64(12), (4, 3): np.int64(12), (5, 3): np.int64(8), (6, 3): np.int64(12), (7, 3): np.int64(8), (8, 3): np.int64(12), (9, 3): np.int64(8), (10, 3): np.int64(12), (11, 3): np.int64(12), (12, 3): np.int64(8), (13, 3): np.int64(8),

KeyError: (np.int64(0), 14)

In [None]:
#transition probabilities
t = np.array([[0.8,0.1],
                       [0.1,0.8]])
#Emission probabilities
e = np.array([[0.1,0.2,0.7],
                     [0.7,0.2,0.1]])

#defining states and sequence symbols
states = ['H','C']
states_dic = {'H':0, 'C':1}
sequence_syms = {'1':0,'2':1,'3':2}
sequence = ['1','2','3']

#test sequence
test_sequence = '331122313'
test_sequence = [x for x in test_sequence]

#probabilities of going to end state
end_probs = [0.1, 0.1]
#probabilities of going from start state
start_probs = [0.5, 0.5]

newt, newe = baum_welch_stepwise_loggers([test_sequence], t, e, start_probs, states, sequence_syms, batch_size=50)


In [None]:
print(newt)
print(newe)