# TFIDF demo

Based on https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata

In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [4]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
df = pd.read_csv("bbc_text_cls.csv")
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [48]:
# Now I create a dictionary word2idx, with a mapping between words (keys) and indices in vocab (values)

idx = 0
word2idx = {}
tokenized_docs = []

for doc in df["text"]:
    words = word_tokenize(doc.lower())
    doc_as_int = []

    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

        doc_as_int.append(word2idx[word]) # Save for later, these are the indices

    tokenized_docs.append(doc_as_int)

In [49]:
# do the reverse mapping idx2word

idx2word = {value:key for key, value in word2idx.items()} # Can be also done it as a list instead of a dictionary

In [100]:
# Number of documents and number of words

N = len(df["text"]) # Can be len(df) if I assume no Nulls in the df
V = len(word2idx) # Or the other one
print(N)
print(V)

2225
34762


In [53]:
# Now instantiate the tf matrix. I'm not using a sparse matrix because I can. This is not looking for efficiency.

tf = np.zeros((N, V))

# And then populate the matrix

for i, doc_as_int in enumerate(tokenized_docs): # Take each document and its content (as indices)
    for j in doc_as_int: # Loop through each word and count it
        tf[i, j] += 1 # In the end this is a count!

In [54]:
# I have tf, I now need IDF. document_freq will have shape (V,). 

document_freq = np.sum(tf > 0, axis = 0) # We only sum the words that appear, axis = 0 is sum by column (word) as in pandas

idf = np.log(N / document_freq)

# Compute tf-idf

tf_idf = tf * idf

In [58]:
# Check the shape just to see it
document_freq

array([ 12, 204, 127, ...,   1,   1,   1])

In [72]:
np.random.seed(13)

# Pick a random document, show the top 5 terms in terms of tf_idf score

i = np.random.choice(N) # Document index

row = df.iloc[i]
print("Label:", row["labels"])
print("Text:", row["text"].split("\n", 1)[0])

print("Top 5 terms in tf_idf score:") # In some way these are the 5 words that best represent the article (the most important words for the article)

scores = tf_idf[i]
indices = (-scores).argsort() #It's ascending
for i in indices[:5]:
    print(idx2word[i])

Label: business
Text: India's Deccan seals $1.8bn deal
Top 5 terms in tf_idf score:
deccan
air
india
airlines
1.8bn


#### Use Scikit-learn count vectorizer instead of counting myself
* I get already a sparse matrix
* I don't have to count

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [74]:
corpus = df["text"]

counted_vectorized = cv.fit_transform(corpus)

In [82]:
print(type(counted_vectorized)) # To check it's ok
#print(counted_vectorized.toarray())

<class 'scipy.sparse.csr.csr_matrix'>


In [86]:
# get the starting point back

reverse = cv.inverse_transform(counted_vectorized)
#print(reverse)

[array(['ad', 'sales', 'boost', 'time', 'warner', 'profit', 'quarterly',
       'profits', 'at', 'us', 'media', 'giant', 'timewarner', 'jumped',
       '76', 'to', '13bn', '600m', 'for', 'the', 'three', 'months',
       'december', 'from', '639m', 'year', 'earlier', 'firm', 'which',
       'is', 'now', 'one', 'of', 'biggest', 'investors', 'in', 'google',
       'benefited', 'high', 'speed', 'internet', 'connections', 'and',
       'higher', 'advert', 'said', 'fourth', 'quarter', 'rose', '11',
       '1bn', '10', '9bn', 'its', 'were', 'buoyed', 'by', 'off', 'gains',
       'offset', 'dip', 'bros', 'less', 'users', 'aol', 'on', 'friday',
       'that', 'it', 'owns', 'search', 'engine', 'but', 'own', 'business',
       'had', 'has', 'mixed', 'fortunes', 'lost', '464', '000',
       'subscribers', 'lower', 'than', 'preceding', 'quarters', 'however',
       'company', 'underlying', 'before', 'exceptional', 'items', 'back',
       'stronger', 'advertising', 'revenues', 'hopes', 'increase',
 

#### Reproduce the tf_idf calculation with my count vectorizer input

In [101]:
doc_freq = np.sum(counted_vectorized > 0, axis = 0)

idf_c = np.log(len(corpus) / doc_freq)

# Compute tf-idf

tf_idf_c = doc_freq.T * idf_c

In [102]:
print(doc_freq.T.shape)
print(idf_c.shape)
print(tf_idf_c.shape)

(29421, 1)
(1, 29421)
(29421, 29421)


In [135]:
i = np.random.choice(N)

row_c = df.iloc[i]
print("Label:", row_c["labels"])
print("Text:", row_c["text"].split("\n", 1)[0])
rev_c = reverse[i]

print("Top 5 terms in tf_idf score:") # In some way these are the 5 words that best represent the article (the most important words for the article)

# The error must be in the way I access reverse or something with the count vectorizer.inverse_transform output

scores_c = tf_idf_c[i].T
indices_c = (-scores_c).argsort() #It's ascending
print(indices_c)
for j in reverse[:5]:
    print(rev_c[j])

Label: entertainment
Text: De Niro film leads US box office
Top 5 terms in tf_idf score:
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


IndexError: arrays used as indices must be of integer (or boolean) type

In [134]:
scores_c.shape

(29421, 1)