In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Downloading the CSV file
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [3]:
# Reading the CSV file into a Pandas DataFrame
df = pd.read_csv('bbc_text_cls.csv')

In [4]:
# Tokenizing the text using NLTK's word_tokenize function
tokenized_docs = [word_tokenize(doc.lower()) for doc in df['text']]

In [5]:
# Using CountVectorizer to generate term-frequency counts as a CSR matrix
vectorizer = CountVectorizer()
tf_csr = csr_matrix(vectorizer.fit_transform(df['text']))

In [6]:
# Convert the CSR matrix to dense array for further computations if needed
tf_dense = tf_csr.toarray()

In [7]:
# Compute IDF
document_freq = np.sum(tf_dense > 0, axis=0)
idf = np.log(len(df) / (1 + document_freq))

In [8]:
# Compute TF-IDF
tf_idf = tf_dense * idf

In [9]:
# Display the results
print("TF-IDF Matrix:")
print(tf_idf)

TF-IDF Matrix:
[[0.         1.51924807 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.51924807 0.         ... 0.         0.         0.        ]]


In [10]:
# Label and Text Example
label = "sport"
text = "Robinson answers critics"
print(f"\nLabel: {label}\nText: {text}")
print("Top 5 terms:", " ".join(vectorizer.get_feature_names_out()[
      np.argsort(tf_idf[df['text'].str.contains(text.lower()).idxmax()])[::-1][:5]]))


Label: sport
Text: Robinson answers critics
Top 5 terms: timewarner aol warner profit profits
