In [6]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK resources used by this notebook; packages that are already
# present are simply reported as up-to-date.
for resource in ('reuters', 'punkt'):  # 'punkt' is only needed if tokenizing
    nltk.download(resource)

# Read every Reuters file as raw text. `documents` (file IDs) and `corpus`
# (texts) stay index-aligned: corpus[i] is the text of documents[i].
documents = reuters.fileids()
corpus = list(map(reuters.raw, documents))

# Quick look at the corpus: size, first file ID, and the start of its text
print(
    f"Total documents: {len(documents)}",
    f"Sample document ID: {documents[0]}",
    f"Sample document content (first 500 chars):\n{corpus[0][:500]}",
    sep="\n",
)



[nltk_data] Downloading package reuters to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total documents: 10788
Sample document ID: test/14826
Sample document content (first 500 chars):
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo


In [7]:
# Learn the vocabulary and build the document-term count matrix in one step
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(corpus)

# Number of distinct terms, then matrix dimensions (documents x features)
print(
    f"Vocabulary size: {len(count_vectorizer.vocabulary_)}",
    f"Shape of BoW matrix: {bow_matrix.shape}",
    sep="\n",
)

# vocabulary_ holds (term, index) pairs; order them by index so the listing
# follows the column order of the matrix
sorted_vocab = sorted(
    count_vectorizer.vocabulary_.items(),
    key=lambda pair: pair[1],
)
print("First 10 terms in vocabulary (sorted by index):")
first_ten = sorted_vocab[:10]
for term, idx in first_ten:
    print(f"{idx}: {term}")


Vocabulary size: 30916
Shape of BoW matrix: (10788, 30916)
First 10 terms in vocabulary (sorted by index):
0: 00
1: 000
2: 0000
3: 00000
4: 0009
5: 001
6: 002
7: 003
8: 0037
9: 004


In [8]:
# Fit a TF-IDF model with default settings on the same corpus
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Report its dimensions so they can be compared with the BoW matrix
print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}")

# For a single document, pull out the row from each representation and take
# the column indices of its non-zero entries (second element of nonzero())
doc_index = 0
bow_row = bow_matrix[doc_index]
tfidf_row = tfidf_matrix[doc_index]
bow_nonzero = bow_row.nonzero()[1]
tfidf_nonzero = tfidf_row.nonzero()[1]
print(
    f"BoW non-zero terms in doc {doc_index}: {len(bow_nonzero)}",
    f"TF-IDF non-zero terms in doc {doc_index}: {len(tfidf_nonzero)}",
    sep="\n",
)


Shape of TF-IDF matrix: (10788, 30916)
BoW non-zero terms in doc 0: 351
TF-IDF non-zero terms in doc 0: 351


In [9]:
# Second TF-IDF model, this time filtering scikit-learn's built-in English
# stop-word list before building the vocabulary
tfidf_sw_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_sw_matrix = tfidf_sw_vectorizer.fit_transform(corpus)

# How many terms each model learned
print(
    f"Vocabulary size (with stop words): {len(tfidf_vectorizer.vocabulary_)}",
    f"Vocabulary size (without stop words): {len(tfidf_sw_vectorizer.vocabulary_)}",
    sep="\n",
)

# How many stored entries the same sample document has under each model
doc_index = 0
print(
    f"Non-zero features in TF-IDF (with stop words): {tfidf_matrix[doc_index].nnz}",
    f"Non-zero features in TF-IDF (without stop words): {tfidf_sw_matrix[doc_index].nnz}",
    sep="\n",
)


Vocabulary size (with stop words): 30916
Vocabulary size (without stop words): 30627
Non-zero features in TF-IDF (with stop words): 351
Non-zero features in TF-IDF (without stop words): 269


Questions:

Vocabulary size change?

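In this run the vocabulary shrank only slightly: from 30,916 to 30,627 terms (289 terms, about 0.9%). A stop-word list contains only a few hundred words, so removing it barely changes the vocabulary size, even though those words occur very frequently in the text.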

Features per document?

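Fewer non-zero features per document: document 0 drops from 351 to 269 non-zero features (about 23% fewer). The vectors contain fewer entries, and the entries that remain correspond to more informative, content-bearing terms.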

Impact on quality?

Removing stop words often improves performance for classification tasks by reducing noise.

In [10]:
def process_new_document(document, vectorizer):
    """
    Vectorize one unseen document with an already-fitted vectorizer.

    The vectorizer's ``transform`` method is called on a one-element list
    containing ``document``, and its result is returned unchanged.

    Args:
        document (str): Text to process
        vectorizer: Trained CountVectorizer or TfidfVectorizer

    Returns:
        sparse matrix: Document vector
    """
    # transform() is given a collection of documents, so wrap the single text
    single_document_batch = [document]
    document_vector = vectorizer.transform(single_document_batch)
    return document_vector

# Try the helper on a sentence that was not part of the fitted corpus,
# once with each of the previously fitted vectorizers
new_doc = "Stock markets saw a rise today due to strong economic indicators and lower inflation."
bow_vector = process_new_document(new_doc, count_vectorizer)
tfidf_vector = process_new_document(new_doc, tfidf_vectorizer)

# Show the resulting vector shape and number of stored entries for each
print(
    f"BoW representation shape: {bow_vector.shape}, Non-zero terms: {bow_vector.nnz}",
    f"TF-IDF representation shape: {tfidf_vector.shape}, Non-zero terms: {tfidf_vector.nnz}",
    sep="\n",
)


BoW representation shape: (1, 30916), Non-zero terms: 13
TF-IDF representation shape: (1, 30916), Non-zero terms: 13


Summary:

Loaded and explored the Reuters corpus to understand its structure and content using NLTK.

Created Bag-of-Words and TF-IDF representations using CountVectorizer and TfidfVectorizer, and compared their vocabulary sizes and matrix shapes.

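Analyzed the effect of stop word removal, which reduced the vocabulary only slightly (30,916 → 30,627 terms) but removed about a quarter of the non-zero features in the sample document (351 → 269), making representations more focused and meaningful.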

Built a function to process new documents using trained vectorizers, enabling consistent feature extraction for unseen text.