## Using countvectorizer instead of TF-IDF

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Download the dataset
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [3]:
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Tokenizing the text using NLTK's word_tokenize function
tokenized_docs = [word_tokenize(doc.lower()) for doc in df['text']]


In [5]:
# Use countvectorizer to generate term-frequency counts
vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(df['text']).toarray()

In [6]:
#compute IDF
document_freq = np.sum(tf>0,axis=0)
idf = np.log(len(df) / (1+ document_freq))

In [7]:
tf_idf = tf*idf

In [8]:
# Display the results
print('TF-IDF Matrix:')
print(tf_idf)

TF-IDF Matrix:
[[0.         1.51924807 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.51924807 0.         ... 0.         0.         0.        ]]


In [10]:
# Label- Text Example
label = 'sport'
text = 'Robinson answers critics'
print(f'\nlabel: {label}\nText: {text}')
print('Top 5 terms:',' '.join(vectorizer.get_feature_names_out()[np.argsort(tf_idf[df['text'].str.contains(text.lower()).idxmax()])[::-1][:5]]))


label: sport
Text: Robinson answers critics
Top 5 terms: timewarner aol warner profit profits
