# Building TF-IDF from scratch

In [2]:
# BBC News data (CSV with a `text` and a `labels` column per article).

# Kaggle page for the dataset:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# -nc (no-clobber): wget skips the download when the file already exists locally.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [1]:
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd

# Fetch the 'punkt' resource for NLTK's word tokeniser; a package that is
# already installed is reported as up-to-date instead of being downloaded again.
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/sixsous/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the downloaded articles into a DataFrame and preview the first five rows
# to check the columns used later on (`text` and `labels`).
df = pd.read_csv('bbc_text_cls.csv')
df.head(n=5)


Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [3]:
# Build the vocabulary and encode the corpus in a single pass.
#   word2idx       : token -> column index in the term-frequency matrix
#   tokenized_docs : one list of token ids per document, in corpus order
word2idx = {}
tokenized_docs = []
for text in df['text']:
    tokens = word_tokenize(text.lower())
    # setdefault returns the existing id, or registers the token under the
    # next free id (the current vocabulary size) the first time it is seen.
    tokenized_docs.append(
        [word2idx.setdefault(token, len(word2idx)) for token in tokens]
    )

# next unused id == number of distinct tokens
idx = len(word2idx)


In [4]:
# Inverse lookup: column index -> token.
# Ids are handed out consecutively starting at 0, so a plain list indexed by
# id would be a leaner structure than a dictionary here.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# N = number of documents, V = number of unique tokens (vocabulary size)
N, V = len(df['text']), len(word2idx)

# Dense term-frequency matrix: one row per document, one column per token.
# Note: scikit-learn's count vectorizer could have produced these counts too.
tf = np.zeros(shape=(N, V))


In [5]:
# Fill in the term-frequency counts: every occurrence of token id j in
# document i adds one to tf[i, j].
for i, token_ids in enumerate(tokenized_docs):
    doc_counts = tf[i]  # basic indexing -> a view, so updates land in tf itself
    for j in token_ids:
        doc_counts[j] += 1


In [None]:
# Inverse document frequency.
# document_freq[j] counts the documents in which token j occurs at least once
# (shape = (V,)).
document_freq = (tf > 0).sum(axis=0)
idf = np.log(np.divide(N, document_freq))

# TF-IDF: the (V,) idf vector is broadcast across the N rows of tf.
tf_idf = np.multiply(tf, idf)


In [None]:
# Inspect one randomly chosen article: print its label, the first line of its
# text, and the terms with the highest TF-IDF weight in that document.
#
# tf_idf is reused from the cell that computes IDF; evaluating `tf * idf`
# again here would only allocate a second N x V array with identical contents.
top_k = 5  # how many of the highest-scoring terms to list

# pick a random document
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])
print(f"Top {top_k} terms:")

scores = tf_idf[i]
# argsort is ascending, so sort the negated scores to get the largest first
indices = (-scores).argsort()

for j in indices[:top_k]:
    print(idx2word[j])


## Exercises
* Use scikit-learn's `CountVectorizer` to form the counts.
* Use SciPy's `csr_matrix` instead of a NumPy array. Note: with this we won't be able to do `tf[i, j] += 1`.