<a href="https://colab.research.google.com/github/1nsomnes/TextClassification/blob/main/TextClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2025-03-03 20:42:24--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2025-03-03 20:42:24 (16.6 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]

--2025-03-03 20:42:24--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving 

In [2]:
# imports

from collections import defaultdict
import numpy as np
import math
import random

In [3]:
edgar_lines = []
robert_lines = []

with open("edgar_allan_poe.txt") as f:
  for line in f:
    line = line.strip()
    if line != "":
      edgar_lines.append(line.strip())

with open("robert_frost.txt") as f:
  for line in f:
    line = line.strip()
    line = line.strip(",'.!-?")
    if line != "":
      #print(line)
      robert_lines.append(line.strip())

print(f"Length of Edgar Lines: {len(edgar_lines)}")
print(f"Length of Robert Lines: {len(robert_lines)}")



Length of Edgar Lines: 718
Length of Robert Lines: 1436


In [4]:
edgar_train = edgar_lines[:int(0.8*len(edgar_lines))]
edgar_test = edgar_lines[int(0.8*len(edgar_lines)):]

robert_train = robert_lines[:int(0.8*len(robert_lines))]
robert_test = robert_lines[int(0.8*len(robert_lines)):]

print(f"Length of Edgar Train: {len(edgar_train)}, Length of Edgar Test: {len(edgar_test)}")
print(f"Length of Robert Train: {len(robert_train)}, Length of Robert Test: {len(robert_test)}")

Length of Edgar Train: 574, Length of Edgar Test: 144
Length of Robert Train: 1148, Length of Robert Test: 288


In [5]:
combined_test = []
for i, line in enumerate(edgar_test):
    combined_test.append((line, True))
for i, line in enumerate(robert_test):
    combined_test.append((line, False))
print(f"Length of combined test: {len(combined_test)}")


Length of combined test: 432


In [6]:
# helper functions

def process_word(word):
  return word.lower().strip(".,-\"!:()?")

def build_markov_model(train):
  word_dic = {}

  index = 0
  train_cleaned = []

  # create the word data
  for line in train:
    words = line.split()
    line_cleaned = []
    for i in range(len(words)):

      word = process_word(words[i])
      if word == "":
        continue
      line_cleaned.append(word)
      if word not in word_dic:
        word_dic[word] = index
        index += 1
    train_cleaned.append(line_cleaned)

  A = np.ones((index+1, index+1))
  pi = np.ones((index+1,1))

  for line in train_cleaned:
    for i in range(len(line)):

      word = line[i]
      prev_word = line[i-1]

      if i == 0:
        pi[word_dic[word]] += 1
      else:
        A[word_dic[prev_word],word_dic[word]] += 1

  pi = np.log(pi/(index+1))
  for m in range(index+1):
    log_m_sum = math.log(A[m, :].sum()+m) # add  m to correct for add one smoothing
    A[m, :] = np.log(A[m, :]) - log_m_sum

  return (A, pi, word_dic)



In [7]:
# create probability matrices

A, pi, edgar_dic = build_markov_model(edgar_train)
A2, pi2, robert_dic = build_markov_model(robert_train)

In [8]:
# similarity comparison

def is_edgar(sentence):
  words = sentence.split()
  sequence = []
  for word in words:
    word = process_word(word)
    if word == "":
      continue
    sequence.append(word)

  edgar_sum = 0
  robert_sum = 0

  if sequence[0] in edgar_dic:
    edgar_sum = pi[edgar_dic[sequence[0]]]
  if sequence[0] in robert_dic:
    robert_sum = pi2[robert_dic[sequence[0]]]

  for i in range(1,len(sequence)):
    if sequence[i] in edgar_dic and sequence[i-1] in edgar_dic:
      edgar_sum += A[edgar_dic[sequence[i-1]], edgar_dic[sequence[i]]]
    else:
      edgar_sum += math.log(A.shape[0])
    if sequence[i] in robert_dic and sequence[i-1] in robert_dic:
      robert_sum += A2[robert_dic[sequence[i-1]], robert_dic[sequence[i]]]
    else:
      robert_sum += math.log(A2.shape[0])

  if robert_sum > edgar_sum:
    return False
  return True

In [9]:
random.shuffle(combined_test)
correct = 0
for test in combined_test:
  if is_edgar(test[0]) == test[1]:
    correct += 1
print(f"Accuracy: {correct/len(combined_test)}")

Accuracy: 0.2152777777777778
