# Import and init

In [2]:
import string
import re
import math

from sklearn.model_selection import train_test_split

In [None]:
# Characters a line may contain to be kept by process_txt:
# ASCII letters, digits, ASCII punctuation and the space character
ACCEPTABLE_CHARS = string.ascii_letters + string.digits + string.punctuation + ' '
ACCEPTABLE_CHARS

In [None]:
# All ASCII punctuation characters (same string as string.punctuation)
PUNCT_CHARS = string.punctuation
PUNCT_CHARS

In [None]:
# Characters treated as sentence terminators; process_txt pads each with spaces
SENTENCE_END_CHARS = '!.?'

# Data

## Download data

In [None]:
# %%bash
# wget https://raw.githubusercontent.com/csawtelle/udemy-machine-learning-examples/refs/heads/master/hmm_class/edgar_allan_poe.txt
# wget https://raw.githubusercontent.com/csawtelle/udemy-machine-learning-examples/refs/heads/master/hmm_class/robert_frost.txt


## Read

### Edgar Allan Poe

In [None]:
# Load the raw Edgar Allan Poe corpus as a list of lines (line endings kept)
with open('edgar_allan_poe.txt', 'r') as poe_file:
    c1_0 = list(poe_file)

c1_0


In [None]:
def process_txt(txt: list[str]
                ) -> str:
    """Normalise raw text lines into a single space-separated token string.

    Each line is stripped and lower-cased, and lines containing any character
    outside ``ACCEPTABLE_CHARS`` are dropped. The remaining lines are joined
    with single spaces, the characters ``"``, ``(`` and ``)`` are removed, and
    every character of ``SENTENCE_END_CHARS`` plus the comma is padded with
    spaces so that it becomes a token of its own.

    Args:
        txt: Raw lines, e.g. as returned by ``file.readlines()``.

    Returns:
        The cleaned text with tokens separated by exactly one space and no
        leading or trailing whitespace, so ``result.split(' ')`` does not
        produce empty tokens for non-empty text.
    """
    # Build the lookup set once instead of once per line
    acceptable = set(ACCEPTABLE_CHARS)
    # Strip surrounding whitespace (including newlines) and lower-case
    lines = [line.strip().lower() for line in txt]
    # Remove nonsensical lines, i.e. lines with characters outside the allowed set
    lines = [line for line in lines if acceptable.issuperset(line)]
    # Join with whitespace
    txt_proc = ' '.join(lines)
    # Remove some illegal characters
    for char in ('"', '(', ')'):
        txt_proc = txt_proc.replace(char, '')
    # Add whitespace padding to sentence-end characters and commas
    for char in (SENTENCE_END_CHARS + ','):
        txt_proc = txt_proc.replace(char, f" {char} ")
    # Collapse whitespace runs to one space; strip the ends because the padding
    # above leaves a trailing space after a final '.', which would otherwise
    # turn into an empty-string token when the result is split on ' '
    return re.sub(r'\s+', ' ', txt_proc).strip()

c1_1 = process_txt(c1_0)
c1_1

In [None]:
# Show the processed Poe corpus (the bare name `c1` is never defined)
c1_1

### Robert Frost

In [None]:
# Load the raw Robert Frost corpus as a list of lines (line endings kept)
with open('robert_frost.txt', 'r') as frost_file:
    c2_0 = list(frost_file)

c2_0

In [None]:
c2_1 = process_txt(c2_0)
c2_1

In [None]:
# Shared vocabulary: every distinct token that occurs in either corpus.
# Sorted so the order (and therefore vocab.index(...)) is identical on every
# run; iterating a set of strings can give a different order per interpreter run.
vocab = sorted(set((c1_1 + ' ' + c2_1).split(' ')))
vocab

# Markov model (MM)

## Create initial state distribution, ISD (Pi)

In [None]:
def get_unique_tokens(txt: str
                      ) -> set[str]:
    """Return the set of distinct tokens in ``txt``, splitting on single spaces."""
    return set(txt.split(' '))

get_unique_tokens(c1_1)

In [None]:
def return_dict_ISD(txt: str,
                    vocabulary=None
                    ) -> dict:
    """Estimate the initial state distribution (first-word log-probabilities).

    A word counts as "initial" when it is the first word of ``txt`` or directly
    follows a sentence-end character (``.``, ``!`` or ``?``), optionally
    separated by one space. Counts are add-one smoothed over the vocabulary and
    normalised by the total smoothed count, so the probabilities sum to 1.

    Args:
        txt: Processed text with space-separated tokens.
        vocabulary: Iterable of tokens to smooth over. When ``None`` (the
            default) the notebook-level ``vocab`` is used.

    Returns:
        Dict mapping token -> log10 probability of starting a sentence,
        ordered from most to least probable.
    """
    # Fall back to the notebook-level vocabulary when none is passed in
    unique_tokens = vocab if vocabulary is None else vocabulary
    # Add-one smoothing: every known token starts with a pseudo-count of 1
    dict_start = {token: 1 for token in unique_tokens}
    # Words at the start of the text or right after '.', '!' or '?';
    # a lone '-' is punctuation, not a word
    initial_words = [
        word
        for word in re.findall(r'(?:^|[.!?]) ?([a-zA-Z0-9\-]+)', txt)
        if word != '-'
    ]
    # Single pass over the matches; words missing from the vocabulary are
    # added with their raw count
    for word in initial_words:
        dict_start[word] = dict_start.get(word, 0) + 1
    # Normalise by the total smoothed count and convert to log10 probabilities
    total = sum(dict_start.values())
    dict_start = {
        token: math.log10(count / total)
        for token, count in dict_start.items()
    }
    # Sort the dictionary by value, most probable first
    return dict(sorted(dict_start.items(), key=lambda item: item[1], reverse=True))

c1_isd = return_dict_ISD(c1_1)
c1_isd

In [None]:
c1_isd['writer']

In [None]:
c2_isd = return_dict_ISD(c2_1)
c2_isd

In [None]:
for i in (c1_isd, c2_isd):
    print(min(i.values()), max(i.values()))



## Create state transition table, STT (A)

In [None]:
vocab.index('writer')

In [None]:
c1_1.split(' ')

In [None]:
def create_dict_STT(txt: str,
                    vocabulary=None
                    ) -> dict:
    """Estimate first-order transition log-probabilities between tokens.

    Every (from, to) pair over the vocabulary starts with a pseudo-count of 1
    (add-one smoothing); each adjacent token pair in ``txt`` then adds one to
    its cell. Every row is normalised by its own total, so the probabilities
    of all successors of a token sum to 1.

    Args:
        txt: Processed text with tokens separated by single spaces.
        vocabulary: Iterable of tokens spanning the table. When ``None`` (the
            default) the notebook-level ``vocab`` is used.

    Returns:
        Nested dict ``{from_token: {to_token: log10 probability}}``.

    Raises:
        KeyError: If ``txt`` contains a token that is not in the vocabulary.
    """
    # Fall back to the notebook-level vocabulary when none is passed in;
    # materialise it because it is iterated once per row
    tokens = list(vocab if vocabulary is None else vocabulary)
    # Add-one smoothing: every possible transition starts at 1
    dict_stt = {i: {j: 1 for j in tokens} for i in tokens}
    train_corpus = txt.split(' ')
    # Count each adjacent (word, next word) pair
    for from_word, to_word in zip(train_corpus, train_corpus[1:]):
        dict_stt[from_word][to_word] += 1
    # Normalise each row by its smoothed total and convert to log10
    dict_log = {}
    for from_word, row in dict_stt.items():
        row_total = sum(row.values())
        dict_log[from_word] = {
            to_word: math.log10(count / row_total)
            for to_word, count in row.items()
        }
    return dict_log

c1_stt = create_dict_STT(c1_1)
c1_stt

In [None]:
c1_stt['writer']['of']

In [None]:
c2_stt = create_dict_STT(c2_1)
for i in ['throve', 'sake', 'of']:
    print(c2_stt['writer'][i])


In [None]:
c2_stt['writer']['of']

In [None]:
c2_1.split(' ').count('writer')

In [None]:
c2_stt

# Classify sentence

In [None]:
# Example sentence to classify, wrapped in a list because process_txt expects a list of lines
snt1 = ['Not long ago, the writer of these lines, In the mad pride of intellectuality.']

# Clean it with process_txt, then split the result into tokens on single spaces
snt1_proc = process_txt(snt1).split(' ')
snt1_proc


In [None]:
# Log10 score of the sentence under the class-1 (Edgar Allan Poe) model:
# log P(first word) + sum of log P(next word | word) over ALL adjacent pairs.
# The pairs start at the first word so the first -> second transition is counted.
proba_c1 = c1_isd[snt1_proc[0]]
for word, next_word in zip(snt1_proc, snt1_proc[1:]):
    proba = c1_stt[word][next_word]
    proba_c1 += proba
    print(proba, word, next_word)
proba_c1

In [None]:
# Log10 score of the sentence under the class-2 (Robert Frost) model:
# log P(first word) + sum of log P(next word | word) over ALL adjacent pairs.
# The pairs start at the first word so the first -> second transition is counted.
proba_c2 = c2_isd[snt1_proc[0]]
for word, next_word in zip(snt1_proc, snt1_proc[1:]):
    proba = c2_stt[word][next_word]
    proba_c2 += proba
    print(proba, word, next_word)
proba_c2
