# Import and init

Binary classification:
- `0`: Edgar Allan Poe
- `1`: Robert Frost

In [None]:
import math
import re
import string
from collections import Counter

import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Characters a line may consist of: ASCII letters, digits, punctuation and
# the space character.
ACCEPTABLE_CHARS = ''.join((
    string.ascii_letters,
    string.digits,
    string.punctuation,
    ' ',
))
ACCEPTABLE_CHARS

In [None]:
# All ASCII punctuation characters, as defined by the standard library.
PUNCT_CHARS = string.punctuation
PUNCT_CHARS

In [None]:
# Characters treated as sentence terminators; process_txt surrounds each of
# them with spaces so that they become separate tokens.
SENTENCE_END_CHARS = '!.?'

# Data

## Download data

In [None]:
# %%bash
# wget https://raw.githubusercontent.com/csawtelle/udemy-machine-learning-examples/refs/heads/master/hmm_class/edgar_allan_poe.txt
# wget https://raw.githubusercontent.com/csawtelle/udemy-machine-learning-examples/refs/heads/master/hmm_class/robert_frost.txt


## Read

### Edgar Allan Poe

In [None]:
# Read the raw Edgar Allan Poe corpus, one list item per line of the file.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale.
with open('edgar_allan_poe.txt', 'r', encoding='utf-8') as f:
    c1_0 = f.readlines()

c1_0


In [None]:
def process_txt(txt: list[str]
                ) -> str:
    """
    Normalise raw text lines into one space-delimited string.

    Steps:
    1. strip and lower-case every line;
    2. drop every line that contains a character outside `ACCEPTABLE_CHARS`;
    3. join the remaining lines with single spaces;
    4. delete double quotes and parentheses;
    5. surround each of `SENTENCE_END_CHARS` and the comma with spaces so
       that they become stand-alone tokens;
    6. collapse whitespace runs to one space and trim both ends.

    Parameters:
    - `txt`: lines of text, e.g. the result of `readlines()`.

    Returns the cleaned text, without leading/trailing whitespace and
    without consecutive spaces.
    """
    # Build the lookup set once instead of once per line
    acceptable = set(ACCEPTABLE_CHARS)
    lines = (line.strip().lower() for line in txt)
    # Remove nonsensical lines: one unexpected character discards the whole line
    kept = [line for line in lines if acceptable.issuperset(line)]
    # Join with whitespace
    txt_proc = ' '.join(kept)
    # Remove some illegal characters
    for char in ('"', '(', ')'):
        txt_proc = txt_proc.replace(char, '')
    # Add whitespace padding to sentence-end characters and commas
    for char in (SENTENCE_END_CHARS + ','):
        txt_proc = txt_proc.replace(char, f" {char} ")
    # The padding leaves whitespace runs and a trailing blank behind; without
    # the final strip, splitting the result on ' ' would yield an empty token
    return re.sub(r'\s+', ' ', txt_proc).strip()

# Clean the raw Poe lines into a single normalised string.
c1_1 = process_txt(c1_0)
c1_1

In [None]:
# Split the cleaned Poe text into sentences on '.'.  Fragments that are empty
# after stripping (left by the final '.' or by runs such as '...') are dropped
# so that they end up in neither the training text nor the test set.
c1_1_sentences = [s for s in (i.strip() for i in c1_1.split('.')) if s]
print(len(c1_1_sentences))
print(c1_1_sentences[:3])

# Hold out 15 % of the sentences for evaluation; the seed is fixed
c1_1_train, c1_1_test = train_test_split(
    c1_1_sentences,
    test_size = 0.15,
    random_state = 32
)
print(len(c1_1_train), len(c1_1_test))

# Glue the training sentences back into one string, restoring the '.'
# separators that the split removed
c1_1_train = ' . '.join(c1_1_train)
c1_1_train

### Robert Frost

In [None]:
# Read the raw Robert Frost corpus, one list item per line of the file.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale.
with open('robert_frost.txt', 'r', encoding='utf-8') as f:
    c2_0 = f.readlines()

c2_0[:3]

In [None]:
# Clean the raw Frost lines into a single normalised string.
c2_1 = process_txt(c2_0)
c2_1

In [None]:
# Split the cleaned Frost text into sentences on '.'.  Fragments that are
# empty after stripping (left by the final '.' or by runs such as '...') are
# dropped so that they end up in neither the training text nor the test set.
c2_1_sentences = [s for s in (i.strip() for i in c2_1.split('.')) if s]
print(len(c2_1_sentences))
print(c2_1_sentences[:3])

# Hold out 5 % of the sentences for evaluation; the seed is fixed
c2_1_train, c2_1_test = train_test_split(
    c2_1_sentences,
    test_size = 0.05,
    random_state = 32
)
print(len(c2_1_train), len(c2_1_test))

# Glue the training sentences back into one string, restoring the '.'
# separators that the split removed
c2_1_train = ' . '.join(c2_1_train)
c2_1_train

## Test set

In [None]:
# Evaluation inputs: the held-out Poe sentences first, then the Frost ones
x_test = [*c1_1_test, *c2_1_test]
print(len(x_test))

In [None]:
# Labels in the same order as x_test: 0 for each Poe sentence, 1 for each
# Frost sentence
y_test = [0] * len(c1_1_test) + [1] * len(c2_1_test)
print(len(y_test))
y_test[:5]


## Combined vocab

In [None]:
# Shared vocabulary: every distinct space-delimited token that occurs in
# either training text.  Sorting makes the token order reproducible, because
# the iteration order of a set of strings differs between interpreter runs.
vocab = sorted(set(f"{c1_1_train} {c2_1_train}".split(' ')))
vocab

# Markov model (MM)

## Create the initial state distribution (ISD, π)

In [None]:
def get_unique_tokens(txt: str
                      ) -> set[str]:
    """
    Return the distinct tokens of a text.

    Parameters:
    - `txt`: text whose tokens are separated by single spaces.

    Returns a set holding every unique substring obtained by splitting `txt`
    on `' '` (consecutive spaces therefore contribute the empty string).
    """
    return set(txt.split(' '))

# Inspect the distinct tokens of the Poe training text.
get_unique_tokens(c1_1_train)

In [None]:
def return_dict_ISD(txt: str
                    ) -> dict:
    """
    Build the initial state distribution (ISD) of a training text.

    A token counts as an initial word when it opens the text or directly
    follows one of `SENTENCE_END_CHARS`, and starts with a letter, a digit or
    a hyphen (a lone `'-'` is ignored). Counts are add-one smoothed over the
    module-level `vocab` and normalised by the sum of all smoothed counts, so
    the resulting probabilities sum to one.

    Parameters:
    - `txt`: training text as produced by `process_txt`, tokens separated by
      single spaces.

    Returns a dict mapping each token to the log10 of its smoothed
    initial-word probability, ordered from most to least probable, plus the
    key `'OOV'` holding the smallest log-probability of the distribution.

    Raises `ValueError` when both `vocab` and the set of initial words are
    empty.
    """
    # Capture the whole space-delimited token so that the keys are the same
    # strings as the tokens stored in `vocab`
    end_chars = re.escape(SENTENCE_END_CHARS)
    pattern = rf'(?:^|[{end_chars}]) ?([a-zA-Z0-9\-]\S*)'
    initial_counts = Counter(
        word for word in re.findall(pattern, txt) if word != '-'
    )
    # Add-One Smoothing: every vocabulary token starts with a count of one
    dict_start = {token: 1 for token in vocab}
    for word, count in initial_counts.items():
        dict_start[word] = dict_start.get(word, 1) + count
    if not dict_start:
        raise ValueError("Cannot build an ISD from an empty vocabulary and text.")
    # Normalise by the total smoothed mass and switch to log-probabilities
    total = sum(dict_start.values())
    dict_start = {
        token: math.log10(count / total)
        for token, count in dict_start.items()
    }
    # Sort the dictionary based on value
    dict_start = dict(sorted(dict_start.items(), key = lambda item: item[1], reverse = True))
    # Add out-of-vocabulary (OOV) token
    dict_start['OOV'] = min(dict_start.values())
    return dict_start

# Initial-state log-probabilities estimated from the Poe training text.
c1_isd = return_dict_ISD(c1_1_train)
c1_isd

In [None]:
# Fallback log-probability used for first words missing from the Poe ISD.
c1_isd['OOV']

In [None]:
# Initial-state log-probabilities estimated from the Frost training text.
c2_isd = return_dict_ISD(c2_1_train)
c2_isd

In [None]:
# Fallback log-probability used for first words missing from the Frost ISD.
c2_isd['OOV']

In [None]:
# Sanity checks on both initial-state distributions: the OOV fallback must be
# present and every stored value must be negative, i.e. the log of a
# probability strictly below one.
for i in (c1_isd, c2_isd):
    assert 'OOV' in i, "ERROR: OOV token not present."
    assert min(i.values()) < 0, "ERROR"
    assert max(i.values()) < 0, "ERROR"


## Create the state transition table (STT, A)

In [None]:
# Position of the token 'writer' in the vocabulary list (ValueError if absent).
vocab.index('writer')

In [None]:
# Inspect the Poe training text as a list of space-delimited tokens.
c1_1_train.split(' ')

In [None]:
# Scratch value: nothing else in the visible notebook reads `a` before it is
# reassigned.
a = 100
a

In [None]:
def create_dict_STT(txt: str
                    ) -> dict:
    """
    Build the state transition table (STT) of a training text.

    For every ordered pair of tokens from the module-level `vocab` the table
    stores the log10 of the add-one-smoothed probability that the second
    token directly follows the first one in `txt`. Each row is normalised by
    its own smoothed total (observed outgoing transitions plus one per
    vocabulary token), so the probabilities of a row sum to one.

    Parameters:
    - `txt`: training text, tokens separated by single spaces. Every token
      must be present in `vocab`.

    Returns a dict of the form `{from_word: {to_word: log10_probability}}`
    plus the key `'OOV'`, which maps directly to a float: the smallest
    log-probability found anywhere in the table.

    Raises `KeyError` when `txt` contains a token that is not in `vocab`.
    """
    # Add-One Smoothing: every (from, to) pair starts with a count of one
    dict_stt = {i: {j: 1 for j in vocab} for i in vocab}
    train_corpus = txt.split(' ')
    # Count each pair of adjacent tokens
    for from_word, to_word in zip(train_corpus, train_corpus[1:]):
        dict_stt[from_word][to_word] += 1
    min_value = math.inf
    # Only values of existing keys are replaced, so iterating is safe
    for from_word in dict_stt:
        row = dict_stt[from_word]
        row_total = sum(row.values())
        log_row = {
            to_word: math.log10(count / row_total)
            for to_word, count in row.items()
        }
        dict_stt[from_word] = log_row
        # Track the smallest value for OOV
        min_value = min(min_value, min(log_row.values()))
    # Add OOV
    dict_stt['OOV'] = min_value
    return dict_stt

# Transition log-probabilities estimated from the Poe training text.
c1_stt = create_dict_STT(c1_1_train)
c1_stt

In [None]:
# Poe table entry for the transition 'writer' -> 'of'.
c1_stt['writer']['of']

In [None]:
# Transition log-probabilities estimated from the Frost training text.
c2_stt = create_dict_STT(c2_1_train)



In [None]:
# c2_stt['writer']['of']

In [None]:
# Number of times the token 'writer' occurs in the Frost training text.
c2_1_train.split(' ').count('writer')

In [None]:
# Inspect the Frost transition table.
c2_stt

In [None]:
# Both transition tables must carry the OOV fallback entry.
for i in (c1_stt, c2_stt):
    assert 'OOV' in i, "ERROR: OOV not present."

# Classify sentence

## Classify one

In [None]:
# Inspect the Poe transition table.
c1_stt

In [None]:
# Scratch cell: pick a value depending on `a`.
a = 1
b = 32
if a == 1:
    c = 5
else:
    c = 50
c

In [None]:
# Scratch cell: membership test on a nested dict; `and` short-circuits, so
# the inner lookup is only evaluated when the outer key exists.
a = {'a': {'a': 5, 'b': 10}}

'c' in a and 'b' in a['c']

In [None]:
def _sentence_log_proba(tokens: list[str],
                        isd: dict,
                        stt: dict,
                        verbose: bool
                        ) -> float:
    """Return the log-probability of a token sequence under one class model."""
    if not tokens:
        # Nothing to score: fall back to the out-of-vocabulary start value
        return isd['OOV']
    # Initial-state term for the first word
    log_proba = isd.get(tokens[0], isd['OOV'])
    # One transition term per adjacent pair, starting with first -> second
    for word, next_word in zip(tokens, tokens[1:]):
        try:
            log_proba += stt[word][next_word]
        except KeyError:
            if verbose:
                print(f"{word} or {next_word} not present in vocab.")
            log_proba += stt['OOV']
    return log_proba


def predict_class(sentence: str,
                  verbose: bool = True
                  ) -> int:
    """
    Predict the author of a sentence.

    The sentence is cleaned with `process_txt`, split into tokens, and scored
    under both models: the initial-state value of the first token plus the
    transition value of every pair of adjacent tokens. Tokens or transitions
    unknown to a model are scored with that model's `'OOV'` value.

    Parameters:
    - `sentence`: raw sentence to classify.
    - `verbose`: when true, print the unknown-token messages and the
      log-probability of each class.

    Returns the integer label of the more probable class (`0` on a tie):
    - `0`: Edgar Allan Poe
    - `1`: Robert Frost
    """
    # split() without an argument drops the empty tokens that padding around
    # a final punctuation mark would otherwise leave behind
    tokens = process_txt([sentence]).split()

    proba_c0 = _sentence_log_proba(tokens, c1_isd, c1_stt, verbose)
    proba_c1 = _sentence_log_proba(tokens, c2_isd, c2_stt, verbose)

    if verbose:
        print("Log probabilities of each class:")
        print(f" - `0`: {proba_c0}")
        print(f" - `1`: {proba_c1}")
    return int(np.argmax([proba_c0, proba_c1]))


# Try the classifier on a single hand-written example sentence.
snt1 = 'Not long ago, the writer of these lines, In the mad pride of intellectuality oceane.'
predict_class(snt1)


## Test set

In [None]:
# Classify every held-out sentence; the result stays aligned with x_test
predictions = [predict_class(sentence) for sentence in x_test]

predictions

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score
)

# Fraction of test sentences whose predicted label equals the true label
accuracy_score(
    y_test,
    predictions
)

In [None]:
# Precision for the positive class; with scikit-learn's default pos_label=1
# that is label 1 (Robert Frost)
precision_score(
    y_test,
    predictions
)

In [None]:
# Recall for the positive class; with scikit-learn's default pos_label=1
# that is label 1 (Robert Frost)
recall_score(
    y_test,
    predictions
)