https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

In [1]:
# Load the full Edgar Allan Poe corpus as a single string
poe_path = r"D:\Deepam\Projects\Text Classifier\edgar_allan_poe.txt"
with open(poe_path, "r", encoding="utf-8") as poe_file:
    poe_text = poe_file.read()

# Load the full Robert Frost corpus the same way
frost_path = r"D:\Deepam\Projects\Text Classifier\robert_frost.txt"
with open(frost_path, "r", encoding="utf-8") as frost_file:
    frost_text = frost_file.read()

# Preview the first 500 characters of each corpus under its own heading
for heading, corpus in (
    ("=== Edgar Allan Poe ===", poe_text),
    ("\n=== Robert Frost ===", frost_text),
):
    print(heading)
    print(corpus[:500])

=== Edgar Allan Poe ===
LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!
Time-eaten towers that tremble not!
Resemble nothing that is ours.
Around, by lifting winds forgot,
Resignedly beneath the sky
The melancholy waters lie.
 
No holy rays from h

=== Robert Frost ===
Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there
Had worn them really about the same,

And both that morning equally lay
In leaves no step had trodden black.
Oh, I kept t

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [10]:
# One corpus file per author; a file's position in this list becomes its integer class label.
input_files = [
    r'D:\Deepam\Projects\Text Classifier\edgar_allan_poe.txt',
     r'D:\Deepam\Projects\Text Classifier\robert_frost.txt',
]

In [11]:
# collect data into lists: one entry per non-empty line, with its author label
input_texts = []
labels = []

# Build the punctuation-stripping table once instead of once per line
punctuation_table = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")

    # Use a context manager so the file handle is always closed, and decode as
    # UTF-8 explicitly so the result does not depend on the platform default
    # encoding (the preview cell reads these same files as UTF-8).
    with open(f, encoding="utf-8") as corpus_file:
        for line in corpus_file:
            line = line.rstrip().lower()
            if line:
                # remove punctuation
                line = line.translate(punctuation_table)

                input_texts.append(line)
                labels.append(label)

D:\Deepam\Projects\Text Classifier\edgar_allan_poe.txt corresponds to label 0
D:\Deepam\Projects\Text Classifier\robert_frost.txt corresponds to label 1


In [12]:
# Fix the shuffle seed so the split, and every number derived from it,
# is the same on each Restart & Run All.
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, random_state=42
)

In [13]:
len(Ytrain), len(Ytest)

(1618, 540)

In [14]:
train_text[:5]

['and laugh but smile no more',
 'a cliff and on the cliff a bottle painted',
 'stillgoing every which way in the joints though',
 'some shattered dishes underneath a pine',
 'but get some color and music out of life']

In [15]:
Ytrain[:5]

[0, 1, 1, 1, 1]

In [16]:
# Next free integer id to hand out; 0 is already taken by the "<unk>" entry below.
idx = 1
word2idx = {"<unk>" : 0}

In [17]:
# Assign the next free id to every word the first time it appears in the
# training lines (whitespace-separated tokens).
for sentence in train_text:
    for word in sentence.split():
        if word in word2idx:
            continue
        word2idx[word] = idx
        idx += 1

In [18]:
word2idx

{'<unk>': 0,
 'and': 1,
 'laugh': 2,
 'but': 3,
 'smile': 4,
 'no': 5,
 'more': 6,
 'a': 7,
 'cliff': 8,
 'on': 9,
 'the': 10,
 'bottle': 11,
 'painted': 12,
 'stillgoing': 13,
 'every': 14,
 'which': 15,
 'way': 16,
 'in': 17,
 'joints': 18,
 'though': 19,
 'some': 20,
 'shattered': 21,
 'dishes': 22,
 'underneath': 23,
 'pine': 24,
 'get': 25,
 'color': 26,
 'music': 27,
 'out': 28,
 'of': 29,
 'life': 30,
 'shes': 31,
 'after': 32,
 'cider': 33,
 'old': 34,
 'girl': 35,
 'thirsty': 36,
 'if': 37,
 'anyone': 38,
 'had': 39,
 'seen': 40,
 'me': 41,
 'coming': 42,
 'home': 43,
 'dreams': 44,
 'thee': 45,
 'therein': 46,
 'knows': 47,
 'are': 48,
 'there': 49,
 'they': 50,
 'said': 51,
 'it': 52,
 'isnt': 53,
 'going': 54,
 'to': 55,
 'rain': 56,
 'how': 57,
 'horrible': 58,
 'monody': 59,
 'floats': 60,
 'rather': 61,
 'than': 62,
 'tip': 63,
 'table': 64,
 'for': 65,
 'you': 66,
 'let': 67,
 'arose': 68,
 'with': 69,
 'duplicate': 70,
 'horn': 71,
 'our': 72,
 'talk': 73,
 'been': 74,

In [19]:
len(word2idx)

2521

In [22]:
# Encode every line as a list of vocabulary ids.
# Training words are all in word2idx by construction, so a direct lookup is used.
train_text_int = [
    [word2idx[word] for word in sentence.split()]
    for sentence in train_text
]

# Test lines may contain words never seen in training; those fall back to id 0.
test_text_int = [
    [word2idx.get(word, 0) for word in sentence.split()]
    for sentence in test_text
]

### Why get(token,0)?
* In the test set, you might have words never seen in training.
* get() safely returns:
    * the ID if found,
    * 0 if the word is unknown (UNK token).

In [23]:
train_text_int[100:105]

[[315, 92, 383, 55, 384, 55, 385],
 [10, 386, 387, 17, 44, 92, 102],
 [262, 388, 9, 10, 389, 390, 174],
 [29, 391, 392],
 [281, 50, 393, 394, 17, 10, 395]]

In [24]:
# Initialize the A (transition) and pi (first-word) count arrays, one pair per class.
# Every entry starts at 1 rather than 0, so nothing ends up with a zero count.
V = len(word2idx)

A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [25]:
# accumulate first-word and transition counts into pi and A
def compute_counts(text_as_int, A, pi):
    """Add first-token and bigram counts from `text_as_int` into `pi` and `A`.

    Parameters
    ----------
    text_as_int : iterable of iterables of int
        Each inner iterable is one sequence of token ids.
    A : 2-D array indexed as A[previous_id, current_id]
        Incremented by 1 for every consecutive pair of tokens in a sequence.
    pi : 1-D array indexed as pi[token_id]
        Incremented by 1 for the first token of every non-empty sequence.

    Both arrays are modified in place; nothing is returned.
    """
    for sequence in text_as_int:
        previous = None
        for position, current in enumerate(sequence):
            if position == 0:
                # first token of the sequence: count it as a start
                pi[current] += 1
            else:
                # any later token: count the previous -> current transition
                A[previous, current] += 1
            previous = current

# Split the training sequences by label, then accumulate each class's counts
# into that class's own A / pi arrays.
class0_sequences = [seq for seq, label in zip(train_text_int, Ytrain) if label == 0]
class1_sequences = [seq for seq, label in zip(train_text_int, Ytrain) if label == 1]
compute_counts(class0_sequences, A0, pi0)
compute_counts(class1_sequences, A1, pi1)

```python
train_text_int = [
    [0,1,2],
    [1,2],
    [0,2],
    [2,1]
]

Ytrain = [0,1,0,1]

```

* `zip(train_text_int, Ytrain)` produces:
```python
[
    ([0,1,2], 0),
    ([1,2], 1),
    ([0,2], 0),
    ([2,1],1)
]
```
The list comprehension picks only those where `y==0`:
```python
[
    [0,1,2],
    [0,2]
]
```
So:
```python
compute_counts(
    [
      [0,1,2],
      [0,2]
    ],
    A0,
    pi0
)
```
means:
Count all the transitions and sentence starts for sentences in class 0.

* Similarly:
```python
[t for t, y in zip(train_text_int, Ytrain) if y==1]
```
Produces:
```python
[
    [1,2],
    [2,1]
]
```
So:
```python
compute_counts(
    [
      [1,2],
      [2,1]
    ],
    A1,
    pi1
)
```
means:
Count all the transitions and sentence starts for sentences in class 1.

# ***The Complete Intuition behind the Text Classifier and the Markov Model (read it completely at least once)***

## 📘 Big Picture: What are we modeling?

### ✅ Goal: 
Given a sequence of words, estimate:  
**P(sequence | class)**

---

### 🧠 Assumption:  
The sequence follows a **Markov Chain** (bigram model):  

$$
P(w_1, w_2, \dots, w_n) = P(w_1) \cdot \prod_{t=2}^{n} P(w_t \mid w_{t-1})
$$



We model this **separately for each class (author)**.

---

### 🧩 To estimate this for each class, we need:
1. How likely each word is to **start a sentence** → stored in **`pi`**
2. How likely each word is to **follow a given previous word** → stored in **`A`**

---

## ✨ Variables Explained

### 🟢 `pi` vectors  
- `pi0`: counts of how often each word starts a sentence in **class 0**
- `pi1`: counts of how often each word starts a sentence in **class 1**

This is the empirical estimate of:

$$
\pi[i] \propto \text{Count(word } i \text{ starts a sentence)}
$$

**Intuition**:  
Some authors start lines with specific words more often.  
- Poe might start more lines with “The”  
- Frost might often use “I”

---

### 🟢 `A` matrices  
- `A0`: counts of how often each word follows another in **class 0**
- `A1`: counts of how often each word follows another in **class 1**

If vocabulary size is `V`, then:  
- `A0` and `A1` are `V x V` matrices  
- `A[i][j]` means:  
  _"How many times did word j come after word i?"_

**Intuition**:  
This captures each author’s style of combining words.  
- Poe might often say “dark night”  
- Frost might often say “snow fell”

---

## 🟢 Example in Plain English

### 📄 Toy Vocabulary:
| Index | Word  |
|-------|-------|
| 0     | the   |
| 1     | night |
| 2     | dark  |

### 📚 Sentences in class 0:
1. “the dark night”  
2. “the night”

---

### ✅ Step 1: Initialize counts with smoothing
```python
A0 = np.ones((3, 3))  # 3x3 matrix
pi0 = np.ones(3)      # vector of size 3
```


## ✅ Step 2: Compute counts

---

### 📄 Sentence 1: “the dark night”

- **First word**: “the” → index `0`  
  → `pi0[0] += 1`

- **Transitions:**
  - “the → dark”: `A0[0][2] += 1`
  - “dark → night”: `A0[2][1] += 1`

---

### 📄 Sentence 2: “the night”

- **First word**: “the”  
  → `pi0[0] += 1`

- **Transition:**
  - “the → night”: `A0[0][1] += 1`

---

## 📊 After processing:

### `pi0` vector:

[1+2, 1, 1] = [3, 1, 1]

→ Word 0 (“the”) started 2 sentences.

---

### `A0` matrix:

| from \ to | the | night | dark |
|-----------|-----|-------|------|
| the       |  1  |  2    |  2   |
| night     |  1  |  1    |  1   |
| dark      |  1  |  2    |  1   |

- From “the”, mostly “night” or “dark” followed.
- From “dark”, “night” followed.
- Other transitions were not seen, so they remain 1 (from smoothing).

---

## 🧠 Interpretation

- `pi0`: tells you **“the” frequently starts sentences**.
- `A0`: tells you which **word combinations (bigrams)** are common for that author.

This is a **bigram language model learned per class**.

---

## 🤖 Why do we need them?

At **prediction time**, for a **new sentence**:

1. Use `pi` and `A` to **calculate the likelihood of the sequence under each class**.
2. **Multiply all the probabilities** (with smoothing) — in practice, **add their logarithms**, which gives the same ranking without numerical underflow.
3. Predict the **class with the highest overall probability**.

---

## 🚫 What if you didn’t compute A and pi?

You would have:

- No way to **estimate how sequences are formed**
- No way to **compare which author’s style matches**
- Your **classifier wouldn’t work**

---

## ✨ Short Summary

| Concept      | Meaning                                         |
|--------------|-------------------------------------------------|
| `pi`         | Starting word counts                           |
| `A`          | Word transition (bigram) counts                |
| Each class   | Has its own `pi` and `A`                       |
| Intuition    | This is the **Markov model** of each author's sentence structure |

In [26]:
# Turn the counts into valid probabilities, in place:
# each row of A must sum to 1, and each pi vector must sum to 1.
# keepdims=True keeps the row sums two-dimensional, which is what lets the
# division broadcast across each row in NumPy.
for transition_counts in (A0, A1):
    transition_counts /= transition_counts.sum(axis=1, keepdims=True)

for initial_counts in (pi0, pi1):
    initial_counts /= initial_counts.sum()

* Example:
* Suppose you had this counts matrix after smoothing:
```python
A0 = [[1, 3, 6],
      [2, 2, 2],
      [4, 1, 1]]
```
Sum of row 0: 1+3+6=10.

So row 0 becomes: [0.1, 0.3, 0.6].

In [27]:
# Take logs of A and pi for both classes; the actual probabilities are not needed.
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

In [28]:
# Class priors: the fraction of training lines carrying each label, plus their logs.
total = len(Ytrain)
count0 = len([y for y in Ytrain if y == 0])
count1 = len([y for y in Ytrain if y == 1])
p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)
p0, p1

(0.33498145859085293, 0.6650185414091471)

* Interpretation:
    * 33.5% of the training data comes from class 0 (Edgar Allan Poe).
    * 66.5% comes from class 1 (Robert Frost).
* These are your priors.

## 🎯 What is Maximum Likelihood?

**Maximum Likelihood (ML)** says:

> *"Pick the class that makes the observed data most likely, ignoring priors."*

**Mathematically:**
$$
\hat{y} = \arg\max_c \; P(data \mid c)
$$

✅ **Example**

Imagine you have:

- **Class 0:** very common phrases (high likelihood for many sentences)
- **Class 1:** very rare phrases (low likelihood for most sentences)

Suppose your priors are:

- **p0 = 0.01** (very rare class 0)
- **p1 = 0.99** (almost everything is class 1)

If you **only look at likelihood**, you might still choose class 0 because it happens to fit the sentence better, even though almost nothing comes from class 0.

**Problem:**

This ignores the fact that class 0 is almost never seen.

---

## 🎯 2️⃣ What is Posterior Probability (MAP)?

**Maximum A Posteriori (MAP)** says:

> *"Pick the class with the highest posterior probability, which combines:*  
> *Likelihood × Prior."*

**Mathematically:**

$$
\hat{y} = \arg\max_c \; \underbrace{P(\text{data} \mid c)}_{\text{likelihood}} \times \underbrace{P(c)}_{\text{prior}}
$$

**Intuition:**

- Even if class 0 has higher likelihood for this sentence, you still prefer class 1 if it’s overall much more common.
- This is **Bayesian thinking**—taking into account both:
  - How likely the sentence is under the class model
  - How likely you are to see that class in general


In [39]:
# build a classifier
class Classifier:
    """Pick the class whose log-likelihood plus log-prior is largest.

    The three constructor arguments are parallel sequences indexed by class:
    `logAs[c]` is indexed as `[previous_id, current_id]`, `logpis[c]` as
    `[token_id]`, and `logpriors[c]` is a single number.
    """

    def __init__(self, logAs, logpis, logpriors):
        """Store the per-class log tables and derive the number of classes."""
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors) # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the summed log-probability of `input_` under class `class_`.

        The first token is scored with that class's `logpi`, every later token
        with `logA[previous, current]`. An empty sequence scores 0.
        """
        logA, logpi = self.logAs[class_], self.logpis[class_]

        tokens = list(input_)
        if not tokens:
            return 0

        score = 0
        # first token: initial-state term
        score += logpi[tokens[0]]
        # remaining tokens: one transition term per consecutive pair
        for prev_idx, next_idx in zip(tokens, tokens[1:]):
            score += logA[prev_idx, next_idx]
        return score

    def predict(self, inputs):
        """Return a float array holding the predicted class index for each input."""
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            scores = []
            for c in range(self.K):
                # log-likelihood + log-prior for class c
                scores.append(self._compute_log_likelihood(input_, c) + self.logpriors[c])
            predictions[i] = np.argmax(scores)
        return predictions

In [40]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [41]:
# Predict on the training sequences and report the fraction of predictions equal to the true labels.
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == Ytrain)}")

Train acc: 0.9962917181705809


In [42]:
# Predict on the held-out test sequences and report the fraction of predictions equal to the true labels.
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == Ytest)}")

Test acc: 0.8277777777777777


In [44]:
from sklearn.metrics import confusion_matrix, f1_score

In [49]:
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[ 536,    6],
       [   0, 1076]], dtype=int64)

In [53]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

array([[ 96,  84],
       [  9, 351]], dtype=int64)

In [54]:
f1_score(Ytrain, Ptrain)

0.9972196478220574

In [55]:
f1_score(Ytest, Ptest)

0.8830188679245283