In [1]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Deepam
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pandas as pd
import pathlib

root_dir = pathlib.Path(r"D:\Deepam\bbc")

data = []

for category_dir in root_dir.iterdir():
    if category_dir.is_dir():
        label = category_dir.name

        for file_path in category_dir.glob("*.txt"):
            with open(file_path, "r", encoding="latin-1") as f:
                text = f.read()
                data.append({"text": text, "labels": label})

df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# populate word2idx
# convert documents into sequence of ints / ids / indices
idx = 0
word2idx = {}
tokenized_docs = []
for doc in df['text']:
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

        # Save for later
        doc_as_int.append(word2idx[word])
    tokenized_docs.append(doc_as_int)

### Step-by-Step Process
* Initialize:
    * idx = 0: Counter for assigning unique IDs.
    * word2idx = {}: Dictionary to map words → unique IDs.
    * tokenized_docs = []: List to store documents as sequences of IDs.

* Loop over each document:

    * Convert to lowercase and tokenize into words.
    * Create doc_as_int = empty list to hold word IDs.
    * Loop over each word in the document:

* If the word is not yet in word2idx:

    * Assign it the current idx.
    * Increment idx by 1.
    * Append the word’s ID to doc_as_int.

* After processing all words in the document:
    * Append doc_as_int to tokenized_docs.

#### Outputs
* word2idx: Dictionary of all unique words and their assigned IDs.
* tokenized_docs: List of all documents converted to sequences of IDs.

### Iteration 1
**Document: "I love NLP!"**

1. Tokenize and lowercase:
```python
words = word_tokenize("i love nlp!")
```
    * Let’s say word_tokenize gives:
```python
['i', 'love', 'nlp', '!']
```
2. Create empty list to hold indices:
```python
doc_as_int = []
Inner loop (for word in words)
Processing each word one by one:

First word: 'i'

word not in word2idx: True (it’s new).

Assign: word2idx['i'] = idx → 0

Increment idx: idx = 1

Append: doc_as_int.append(0)
```

* After this word:

    * word2idx = {'i': 0}
    * doc_as_int = [0]

* Second word: 'love'
* New word:
    * Assign: word2idx['love'] = 1
    * Increment: idx = 2
* Append index: 1

* After this word:
```python
word2idx = {'i': 0, 'love': 1}
doc_as_int = [0, 1]
```

* Third word: 'nlp'
* New word:
    * Assign: word2idx['nlp'] = 2
    * Increment: idx = 3
* Append index: 2

After this word:
```python
word2idx = {'i': 0, 'love': 1, 'nlp': 2}
doc_as_int = [0, 1, 2]
```

* Fourth word: '!'
* New word:
    * Assign: word2idx['!'] = 3
    * Increment: idx = 4
* Append index: 3

* After this word:
```python
word2idx = {'i': 0, 'love': 1, 'nlp': 2, '!': 3}
doc_as_int = [0, 1, 2, 3]
```
* Finished inner loop for doc 1.
* Append doc_as_int to tokenized_docs:
```python
tokenized_docs = [[0, 1, 2, 3]]
```

In [9]:
# reverse ampping
# if you do it smarter you can store it as a list
idx2word = {v:k for k, v in word2idx.items()}
idx2word

{0: 'ad',
 1: 'sales',
 2: 'boost',
 3: 'time',
 4: 'warner',
 5: 'profit',
 6: 'quarterly',
 7: 'profits',
 8: 'at',
 9: 'us',
 10: 'media',
 11: 'giant',
 12: 'timewarner',
 13: 'jumped',
 14: '76',
 15: '%',
 16: 'to',
 17: '$',
 18: '1.13bn',
 19: '(',
 20: 'â£600m',
 21: ')',
 22: 'for',
 23: 'the',
 24: 'three',
 25: 'months',
 26: 'december',
 27: ',',
 28: 'from',
 29: '639m',
 30: 'year-earlier',
 31: '.',
 32: 'firm',
 33: 'which',
 34: 'is',
 35: 'now',
 36: 'one',
 37: 'of',
 38: 'biggest',
 39: 'investors',
 40: 'in',
 41: 'google',
 42: 'benefited',
 43: 'high-speed',
 44: 'internet',
 45: 'connections',
 46: 'and',
 47: 'higher',
 48: 'advert',
 49: 'said',
 50: 'fourth',
 51: 'quarter',
 52: 'rose',
 53: '2',
 54: '11.1bn',
 55: '10.9bn',
 56: 'its',
 57: 'were',
 58: 'buoyed',
 59: 'by',
 60: 'one-off',
 61: 'gains',
 62: 'offset',
 63: 'a',
 64: 'dip',
 65: 'bros',
 66: 'less',
 67: 'users',
 68: 'aol',
 69: 'on',
 70: 'friday',
 71: 'that',
 72: 'it',
 73: 'owns',
 7

In [16]:
idx2word_list = [None]*len(idx2word)
for word, idx in word2idx.items():
    idx2word_list[idx] = word
idx2word_list

['ad',
 'sales',
 'boost',
 'time',
 'warner',
 'profit',
 'quarterly',
 'profits',
 'at',
 'us',
 'media',
 'giant',
 'timewarner',
 'jumped',
 '76',
 '%',
 'to',
 '$',
 '1.13bn',
 '(',
 'â£600m',
 ')',
 'for',
 'the',
 'three',
 'months',
 'december',
 ',',
 'from',
 '639m',
 'year-earlier',
 '.',
 'firm',
 'which',
 'is',
 'now',
 'one',
 'of',
 'biggest',
 'investors',
 'in',
 'google',
 'benefited',
 'high-speed',
 'internet',
 'connections',
 'and',
 'higher',
 'advert',
 'said',
 'fourth',
 'quarter',
 'rose',
 '2',
 '11.1bn',
 '10.9bn',
 'its',
 'were',
 'buoyed',
 'by',
 'one-off',
 'gains',
 'offset',
 'a',
 'dip',
 'bros',
 'less',
 'users',
 'aol',
 'on',
 'friday',
 'that',
 'it',
 'owns',
 '8',
 'search-engine',
 'but',
 'own',
 'business',
 'had',
 'has',
 'mixed',
 'fortunes',
 'lost',
 '464,000',
 'subscribers',
 'lower',
 'than',
 'preceding',
 'quarters',
 'however',
 'company',
 "'s",
 'underlying',
 'before',
 'exceptional',
 'items',
 'back',
 'stronger',
 'advert

In [11]:
# number of documents
N = len(df['text'])

In [12]:
# Number of words
V = len(word2idx)

In [14]:
N, V

(2225, 34613)

In [17]:
# instantiate term-frequency matrix
# note: could have also used count vectorizer
tf = np.zeros((N,V))

### Example
* Suppose:
1. N = 3 docs
2. V = 6 words (i, love, nlp, !, is, fun)

* Then:
```python
tf = np.zeros((3, 6))
```
This gives:
```python
array([
 [0,0,0,0,0,0],   # document 1
 [0,0,0,0,0,0],   # document 2
 [0,0,0,0,0,0],   # document 3
])
```

In [18]:
# populate term-frequency counts
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

```python
tokenized_docs = [
    [0, 1, 2, 3],  # Document 0: "i love nlp !"
    [2, 4, 5],     # Document 1: "nlp is fun"
    [0, 1, 5]      # Document 2: "i love fun"
]
```

* Final TF matrix is:
```python
tf = [
 [1, 1, 1, 1, 0, 0],  # "i love nlp !"
 [0, 0, 1, 0, 1, 1],  # "nlp is fun"
 [1, 1, 0, 0, 0, 1]   # "i love fun"
]
```

In [23]:
# compute IDF
document_freq = np.sum(tf > 0, axis=0) # document frequency (shape = (V,))
idf = np.log(N / document_freq)
idf

array([5.22260554, 2.3893922 , 2.86332511, ..., 7.70751219, 7.70751219,
       7.70751219])

```python
idf[0] = log(3/2) = log(1.5)
idf[1] = log(3/2)
idf[2] = log(3/2)
idf[3] = log(3/1) = log(3)
idf[4] = log(3/1) = log(3)
idf[5] = log(3/2)
```

### Interpretation:
* Words appearing in many docs → lower IDF.
* Words appearing in few docs → higher IDF.

In [20]:
# compute TF-IDF
tf_idf = tf * idf

* tf is a matrix shape (3,6)
* idf is a vector shape (6,)

* NumPy automatically broadcasts idf over all rows.
```python
Doc 0: [1,1,1,1,0,0]
Calculation:
[1*0.405, 1*0.405, 1*0.405, 1*1.0986, 0*1.0986, 0*0.405]

Result:
[0.405, 0.405, 0.405, 1.0986, 0, 0]
```
* **Interpretation:**

    * "i" contributes 0.405

    * "love" contributes 0.405

    * "nlp" contributes 0.405

    * "!" contributes 1.0986 (rare word, high weight)

In [22]:
np.random.seed(123)

In [26]:
# Pick a random document, show the top 5 terms (in terms of tf-idf score)
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])
print("Top 5 terms:")

scores = tf_idf[i]
indices = (-scores).argsort()

for j in indices[:5]:
    print(idx2word[j])

Label: sport
Text: Hingis hints at playing comeback
Top 5 terms:
hingis
pattaya
thailand
95th
30th


In [27]:
# Exercise: use CountVectorizer to form the counts instead

# Exercise (hard): use Scipy's csr_matrix instead
# You cannot use X[i, j] += 1 here