## Basic Text Representation

In [73]:
# Import Necessary Libraries
import nltk
import pandas as pd
from nltk.corpus import reuters
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the Reuters Corpus.
# quiet=True keeps the downloader's progress log out of the saved cell output;
# that log prints the absolute local nltk_data directory of whoever ran the cell.
nltk.download("reuters", quiet=True)

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Nikolai\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [74]:
# Read Reuters Corpus
documents = reuters.fileids()  # File ids of every article exposed by the corpus reader
corpus = list(map(reuters.raw, documents))  # Raw text for each file id, in the same order

# Summary figures, computed once and then reported
doc_total = len(corpus)
char_total = sum(map(len, corpus))

# Print Details
print(f"Number of documents in the Reuters corpus: {doc_total}")
print(f"Total number of characters in the Reuters corpus: {char_total}")

# Previews are limited to the leading 500 characters of each document
first_preview = corpus[0][:500]
print(f"\n\tFirst document in the Reuters corpus:\n\n{first_preview}")
last_preview = corpus[-1][:500]
print(f"\n\tLast document in the Reuters corpus:\n\n{last_preview}")

Number of documents in the Reuters corpus: 10788
Total number of characters in the Reuters corpus: 8846853

	First document in the Reuters corpus:

ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo

	Last document in the Reuters corpus:

&lt;A.H.A. AUTOMOTIVE TECHNOLOGIES CORP> YEAR NET
  Shr 43 cts vs 52 cts
      Shr diluted 41 cts vs 49 cts
      Net 1,916,000 vs 2,281,000
      Revs 32.6 mln vs 22.6 mln
  




## Create Bag-of-Words Representation

Basic

In [75]:
# Initialize CountVectorizer
bow_vectorizer = CountVectorizer(
    stop_words="english",      # Remove English stop words
    lowercase=True,            # Convert all characters to lowercase
    token_pattern=r"\b\w+\b",  # Any run of word characters (single characters and digit-only tokens included)
    min_df=1,                  # Minimum document frequency
)

# Fit and Transform (result is a SciPy sparse document-term matrix)
X_bow = bow_vectorizer.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_bow.toarray() would allocate one dense cell for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    X_bow,
    columns=bow_vectorizer.get_feature_names_out(),  # Get feature names
)

# Print BoW Vocabulary Size
print(f"\nVocabulary Size: {len(bow_vectorizer.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted Vocabulary:\n", sorted(bow_vectorizer.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary
# Print BoW DataFrame
print("\nBag-of-Words DataFrame:\n", bow_df.head())  # Print first few rows of the BoW DataFrame
print(f"\nShape of Bag-of-Words DataFrame: {bow_df.shape}")  # Print shape of the BoW DataFrame
print(f"\nBoW Representation: \n{bow_df}") # Print the BoW Representation (pandas truncates the repr)



Vocabulary Size: 30661

Sorted Vocabulary:
 [('0', 0), ('00', 1), ('000', 2), ('0000', 3), ('00000', 4), ('0009', 5), ('001', 6), ('002', 7), ('003', 8), ('0037', 9), ('004', 10), ('005', 11), ('0053', 12), ('006', 13), ('006913', 14), ('006916', 15), ('007', 16), ('007050', 17), ('007100', 18), ('007150', 19)]

Bag-of-Words DataFrame:
    0  00  000  0000  00000  0009  001  002  003  0037  ...  zuckerman  zuheir  \
0  0   0    0     0      0     0    0    0    0     0  ...          0       0   
1  0   0    0     0      0     0    0    0    0     0  ...          0       0   
2  0   0    0     0      0     0    0    0    0     0  ...          0       0   
3  0   0    0     0      0     0    0    0    0     0  ...          0       0   
4  0   0    0     0      0     0    0    0    0     0  ...          0       0   

   zulia  zurich  zuyuan  zverev  zwermann  zy  zzzz  üside  
0      0       0       0       0         0   0     0      0  
1      0       0       0       0         0   0   

Numbers Removed

In [76]:
# Initialize CountVectorizer
bow_vectorizer_2 = CountVectorizer(
    stop_words="english",              # Remove English stop words
    lowercase=True,                    # Convert all characters to lowercase
    token_pattern=r"\b[a-zA-Z]{1,}\b", # Only runs of ASCII letters between word boundaries, so digit tokens are dropped
    min_df=1,                          # Minimum document frequency
)

# Fit and Transform (result is a SciPy sparse document-term matrix)
X_bow_2 = bow_vectorizer_2.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_bow_2.toarray() would allocate one dense cell for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
bow_df_2 = pd.DataFrame.sparse.from_spmatrix(
    X_bow_2,
    columns=bow_vectorizer_2.get_feature_names_out(),  # Get feature names
)

# Print BoW Vocabulary Size
print(f"\nVocabulary Size: {len(bow_vectorizer_2.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted Vocabulary:\n", sorted(bow_vectorizer_2.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary

# Print BoW DataFrame
print("\nBag-of-Words DataFrame:\n", bow_df_2.head())  # Print first few rows of the BoW DataFrame
print(f"\nShape of Bag-of-Words DataFrame: {bow_df_2.shape}")  # Print shape of the BoW DataFrame
print(f"\nBoW Representation: \n{bow_df_2}") # Print the BoW Representation (pandas truncates the repr)


Vocabulary Size: 28881

Sorted Vocabulary:
 [('aa', 0), ('aaa', 1), ('aabex', 2), ('aac', 3), ('aachener', 4), ('aagiy', 5), ('aaica', 6), ('aaix', 7), ('aam', 8), ('aame', 9), ('aancor', 10), ('aap', 11), ('aar', 12), ('aare', 13), ('aarn', 14), ('aarnoud', 15), ('aaron', 16), ('aart', 17), ('aati', 18), ('ab', 19)]

Bag-of-Words DataFrame:
    aa  aaa  aabex  aac  aachener  aagiy  aaica  aaix  aam  aame  ...  \
0   0    0      0    0         0      0      0     0    0     0  ...   
1   0    0      0    0         0      0      0     0    0     0  ...   
2   0    0      0    0         0      0      0     0    0     0  ...   
3   0    0      0    0         0      0      0     0    0     0  ...   
4   0    0      0    0         0      0      0     0    0     0  ...   

   zuccherifici  zuckerman  zuheir  zulia  zurich  zuyuan  zverev  zwermann  \
0             0          0       0      0       0       0       0         0   
1             0          0       0      0       0       0      

## Create TF-IDF Representation

Basic

In [77]:
# Initialize TfidfVectorizer (no stop-word filtering in this variant)
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,            # Convert all characters to lowercase
    token_pattern=r"\b\w+\b",  # Any run of word characters (single characters and digit-only tokens included)
    min_df=1,                  # Minimum document frequency
)

# Generate TF-IDF representation (result is a SciPy sparse document-term matrix)
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_tfidf.toarray() would allocate one dense float for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),  # Get feature names
)

# Print TF-IDF Vocabulary Size
print(f"\nTF-IDF Vocabulary Size: {len(tfidf_vectorizer.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted TF-IDF Vocabulary:\n", sorted(tfidf_vectorizer.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary
# Print TF-IDF DataFrame
print("\nTF-IDF DataFrame:\n", tfidf_df.head())  # Print first few rows of the TF-IDF DataFrame
print(f"\nShape of TF-IDF DataFrame: {tfidf_df.shape}") # Print shape of the TF-IDF DataFrame
print(f"\nTF-IDF Representation: \n {tfidf_df}") # Print the TF-IDF representation (pandas truncates the repr)


TF-IDF Vocabulary Size: 30952

Sorted TF-IDF Vocabulary:
 [('0', 0), ('00', 1), ('000', 2), ('0000', 3), ('00000', 4), ('0009', 5), ('001', 6), ('002', 7), ('003', 8), ('0037', 9), ('004', 10), ('005', 11), ('0053', 12), ('006', 13), ('006913', 14), ('006916', 15), ('007', 16), ('007050', 17), ('007100', 18), ('007150', 19)]

TF-IDF DataFrame:
      0   00  000  0000  00000  0009  001  002  003  0037  ...  zuckerman  \
0  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
1  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
2  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
3  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
4  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   

   zuheir  zulia  zurich  zuyuan  zverev  zwermann   zy  zzzz  üside  
0     0.0    0.0     0.0     0.0     0.0       0.0  0.0   0.0    0.0  
1     0.0    0.0     0.0     0.0     0.0       0.0  0

Numbers Removed

In [78]:
# Initialize TfidfVectorizer (no stop-word filtering in this variant)
tfidf_vectorizer_2 = TfidfVectorizer(
    lowercase=True,                    # Convert all characters to lowercase
    token_pattern=r"\b[a-zA-Z]{1,}\b", # Only runs of ASCII letters between word boundaries, so digit tokens are dropped
    min_df=1,                          # Minimum document frequency
)

# Generate TF-IDF representation (result is a SciPy sparse document-term matrix)
X_tfidf_2 = tfidf_vectorizer_2.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_tfidf_2.toarray() would allocate one dense float for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
tfidf_df_2 = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf_2,
    columns=tfidf_vectorizer_2.get_feature_names_out(),  # Get feature names
)

# Print TF-IDF Vocabulary Size
print(f"\nTF-IDF Vocabulary Size: {len(tfidf_vectorizer_2.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted TF-IDF Vocabulary:\n", sorted(tfidf_vectorizer_2.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary
# Print TF-IDF DataFrame
print("\nTF-IDF DataFrame:\n", tfidf_df_2.head())  # Print first few rows of the TF-IDF DataFrame
print(f"\nShape of TF-IDF DataFrame: {tfidf_df_2.shape}") # Print shape of the TF-IDF DataFrame
print(f"\nTF-IDF Representation: \n {tfidf_df_2}") # Print the TF-IDF representation (pandas truncates the repr)


TF-IDF Vocabulary Size: 29172

Sorted TF-IDF Vocabulary:
 [('a', 0), ('aa', 1), ('aaa', 2), ('aabex', 3), ('aac', 4), ('aachener', 5), ('aagiy', 6), ('aaica', 7), ('aaix', 8), ('aam', 9), ('aame', 10), ('aancor', 11), ('aap', 12), ('aar', 13), ('aare', 14), ('aarn', 15), ('aarnoud', 16), ('aaron', 17), ('aart', 18), ('aati', 19)]

TF-IDF DataFrame:
           a   aa  aaa  aabex  aac  aachener  aagiy  aaica  aaix  aam  ...  \
0  0.115292  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0  ...   
1  0.051554  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0  ...   
2  0.073744  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0  ...   
3  0.045618  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0  ...   
4  0.076691  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0  ...   

   zuccherifici  zuckerman  zuheir  zulia  zurich  zuyuan  zverev  zwermann  \
0           0.0        0.0     0.0    0.0     0.0     0.0     0.0       0.0   
1           0.0        0.0

Compare Representations

In [79]:
# Side-by-side look at the raw counts (BoW) and the TF-IDF weights of one document.
# NOTE: X_tfidf / tfidf_vectorizer are assigned again in the "Stop Word Analysis"
# section further down, so this cell reflects whichever fit ran most recently.
doc_index = 0  # You can choose any document index

# Column indices of the terms that actually occur in the chosen document
bow_nonzero_indices = X_bow[doc_index].nonzero()[1]
tfidf_nonzero_indices = X_tfidf[doc_index].nonzero()[1]

# Fetch each vocabulary once instead of calling get_feature_names_out() per loop iteration
bow_terms = bow_vectorizer.get_feature_names_out()
tfidf_terms = tfidf_vectorizer.get_feature_names_out()

print(f"\nNon-zero BoW terms for doc {doc_index}:")
for idx in bow_nonzero_indices[:10]:  # limit output
    term = bow_terms[idx]
    count = X_bow[doc_index, idx]
    print(f"{term}: {count}")

print(f"\nNon-zero TF-IDF terms for doc {doc_index}:")
for idx in tfidf_nonzero_indices[:10]:  # limit output
    term = tfidf_terms[idx]
    tfidf_value = X_tfidf[doc_index, idx]
    print(f"{term}: {tfidf_value:.4f}")


Non-zero BoW terms for doc 0:
asian: 2
exporters: 3
fear: 1
damage: 2
u: 19
s: 31
japan: 13
rift: 1
mounting: 1
trade: 15

Non-zero TF-IDF terms for doc 0:
asian: 0.0631
exporters: 0.0770
fear: 0.0318
damage: 0.0556
from: 0.0297
u: 0.2342
s: 0.2743
japan: 0.2454
rift: 0.0443
mounting: 0.0356


Compare Representations (Numbers Removed)

In [80]:
# Same comparison as for the basic vectorizers, using the letters-only ("_2") fits.
doc_index_2 = 0

# Column indices of the terms that actually occur in the chosen document
bow_nonzero_indices_2 = X_bow_2[doc_index_2].nonzero()[1]
tfidf_nonzero_indices_2 = X_tfidf_2[doc_index_2].nonzero()[1]

# Fetch each vocabulary once instead of calling get_feature_names_out() per loop iteration
bow_terms_2 = bow_vectorizer_2.get_feature_names_out()
tfidf_terms_2 = tfidf_vectorizer_2.get_feature_names_out()

# The headings report doc_index_2 (the index this cell actually slices with),
# not doc_index from the other comparison cell.
print(f"\nNon-zero BoW terms for doc {doc_index_2}:")
for idx in bow_nonzero_indices_2[:10]:  # limit output
    term = bow_terms_2[idx]
    count = X_bow_2[doc_index_2, idx]
    print(f"{term}: {count}")

print(f"\nNon-zero TF-IDF terms for doc {doc_index_2}:")
for idx in tfidf_nonzero_indices_2[:10]:  # limit output
    term = tfidf_terms_2[idx]
    tfidf_value_2 = X_tfidf_2[doc_index_2, idx]
    print(f"{term}: {tfidf_value_2:.4f}")


Non-zero BoW terms for doc 0:
asian: 2
exporters: 3
fear: 1
damage: 2
u: 19
s: 31
japan: 13
rift: 1
mounting: 1
trade: 15

Non-zero TF-IDF terms for doc 0:
asian: 0.0632
exporters: 0.0772
fear: 0.0319
damage: 0.0557
from: 0.0298
u: 0.2347
s: 0.2748
japan: 0.2458
rift: 0.0444
mounting: 0.0357


## Stop Word Analysis

Basic

In [81]:
# Initialize TfidfVectorizer, this time with English stop words removed.
# NOTE: this rebinds tfidf_vectorizer, X_tfidf and tfidf_df from the earlier
# "Create TF-IDF Representation" section; cells that run after this one see the
# stop-word-filtered fit under those names.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",      # Remove English stop words
    lowercase=True,            # Convert all characters to lowercase
    token_pattern=r"\b\w+\b",  # Any run of word characters (single characters and digit-only tokens included)
    min_df=1,                  # Minimum document frequency
)

# Generate TF-IDF representation (result is a SciPy sparse document-term matrix)
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_tfidf.toarray() would allocate one dense float for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),  # Get feature names
)

# Print TF-IDF Vocabulary Size
print(f"\nTF-IDF Vocabulary Size: {len(tfidf_vectorizer.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted TF-IDF Vocabulary:\n", sorted(tfidf_vectorizer.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary
# Print TF-IDF DataFrame
print("\nTF-IDF DataFrame:\n", tfidf_df.head())  # Print first few rows of the TF-IDF DataFrame
print(f"\nShape of TF-IDF DataFrame: {tfidf_df.shape}") # Print shape of the TF-IDF DataFrame
print(f"\nTF-IDF Representation: \n {tfidf_df}") # Print the TF-IDF representation (pandas truncates the repr)


TF-IDF Vocabulary Size: 30661

Sorted TF-IDF Vocabulary:
 [('0', 0), ('00', 1), ('000', 2), ('0000', 3), ('00000', 4), ('0009', 5), ('001', 6), ('002', 7), ('003', 8), ('0037', 9), ('004', 10), ('005', 11), ('0053', 12), ('006', 13), ('006913', 14), ('006916', 15), ('007', 16), ('007050', 17), ('007100', 18), ('007150', 19)]

TF-IDF DataFrame:
      0   00  000  0000  00000  0009  001  002  003  0037  ...  zuckerman  \
0  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
1  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
2  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
3  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   
4  0.0  0.0  0.0   0.0    0.0   0.0  0.0  0.0  0.0   0.0  ...        0.0   

   zuheir  zulia  zurich  zuyuan  zverev  zwermann   zy  zzzz  üside  
0     0.0    0.0     0.0     0.0     0.0       0.0  0.0   0.0    0.0  
1     0.0    0.0     0.0     0.0     0.0       0.0  0

Numbers Removed

In [82]:
# Initialize TfidfVectorizer, letters-only tokens with English stop words removed.
# NOTE: this rebinds tfidf_vectorizer_2, X_tfidf_2 and tfidf_df_2 from the earlier
# "Create TF-IDF Representation" section; cells that run after this one see the
# stop-word-filtered fit under those names.
tfidf_vectorizer_2 = TfidfVectorizer(
    stop_words="english",              # Remove English stop words
    lowercase=True,                    # Convert all characters to lowercase
    token_pattern=r"\b[a-zA-Z]{1,}\b", # Only runs of ASCII letters between word boundaries, so digit tokens are dropped
    min_df=1,                          # Minimum document frequency
)

# Generate TF-IDF representation (result is a SciPy sparse document-term matrix)
X_tfidf_2 = tfidf_vectorizer_2.fit_transform(corpus)

# Convert to DataFrame.
# Keep the frame sparse-backed: X_tfidf_2.toarray() would allocate one dense float for
# every (document, term) pair, which is several GB at this corpus/vocabulary size.
tfidf_df_2 = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf_2,
    columns=tfidf_vectorizer_2.get_feature_names_out(),  # Get feature names
)

# Print TF-IDF Vocabulary Size
print(f"\nTF-IDF Vocabulary Size: {len(tfidf_vectorizer_2.vocabulary_)}")  # Print vocabulary size
# vocabulary_ maps term -> column index; sorting by index lists the first 20 columns
print("\nSorted TF-IDF Vocabulary:\n", sorted(tfidf_vectorizer_2.vocabulary_.items(), key = lambda x: x[1])[:20])  # Print sorted vocabulary
# Print TF-IDF DataFrame
print("\nTF-IDF DataFrame:\n", tfidf_df_2.head())  # Print first few rows of the TF-IDF DataFrame
print(f"\nShape of TF-IDF DataFrame: {tfidf_df_2.shape}") # Print shape of the TF-IDF DataFrame
print(f"\nTF-IDF Representation: \n {tfidf_df_2}") # Print the TF-IDF representation (pandas truncates the repr)


TF-IDF Vocabulary Size: 28881

Sorted TF-IDF Vocabulary:
 [('aa', 0), ('aaa', 1), ('aabex', 2), ('aac', 3), ('aachener', 4), ('aagiy', 5), ('aaica', 6), ('aaix', 7), ('aam', 8), ('aame', 9), ('aancor', 10), ('aap', 11), ('aar', 12), ('aare', 13), ('aarn', 14), ('aarnoud', 15), ('aaron', 16), ('aart', 17), ('aati', 18), ('ab', 19)]

TF-IDF DataFrame:
     aa  aaa  aabex  aac  aachener  aagiy  aaica  aaix  aam  aame  ...  \
0  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0   0.0  ...   
1  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0   0.0  ...   
2  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0   0.0  ...   
3  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0   0.0  ...   
4  0.0  0.0    0.0  0.0       0.0    0.0    0.0   0.0  0.0   0.0  ...   

   zuccherifici  zuckerman  zuheir  zulia  zurich  zuyuan  zverev  zwermann  \
0           0.0        0.0     0.0    0.0     0.0     0.0     0.0       0.0   
1           0.0        0.0     0.0    0.0     0.0

## New Document Processing

In [None]:
def process_new_document(doc, vectorizer):
    """
    Vectorize a single new document with an already-fitted vectorizer.

    The vectorizer is only used through ``transform``; it is not re-fitted, so
    the result has the same columns as the vocabulary it was fitted on.

    Args:
        doc (str): Text to process
        vectorizer: Trained CountVectorizer or TfidfVectorizer

    Returns:
        sparse matrix: Document vector, exactly as returned by
        ``vectorizer.transform`` for a one-document input

    """
    # transform() takes an iterable of documents, so wrap the single string in a list.
    # The result is returned as-is; no dense copy or DataFrame is built, because
    # callers only use the sparse vector.
    return vectorizer.transform([doc])

In [84]:
# Sample Financial Market Doc
sample_doc = """
Today brought excellent news on the economic front, fueling a strong stock market rally. 
This surge was further bolstered by signals that the Federal Reserve might hold off on 
additional interest rate increases, easing concerns for investors.
"""

# Process using trained BoW and TF-IDF vectorizers.
# NOTE: tfidf_vectorizer is whichever fit is currently bound to that name; it is
# assigned both in "Create TF-IDF Representation" and in "Stop Word Analysis".
bow_vector = process_new_document(sample_doc, bow_vectorizer)
tfidf_vector = process_new_document(sample_doc, tfidf_vectorizer)

# Compare feature counts (stored non-zero entries vs. vocabulary size)
print(f"\n[BoW] Non-zero features: {bow_vector.nnz} / {bow_vector.shape[1]}")
print(f"[TF-IDF] Non-zero features: {tfidf_vector.nnz} / {tfidf_vector.shape[1]}")

# Fetch each vocabulary once instead of calling get_feature_names_out() per loop iteration
bow_feature_names = bow_vectorizer.get_feature_names_out()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Show non-zero terms from BoW.
# Terms are listed in the order nonzero() returns them; no ranking by count is applied.
print("\nTop terms in BoW representation:")
bow_indices = bow_vector.nonzero()[1]
for idx in bow_indices:
    term = bow_feature_names[idx]
    count = bow_vector[0, idx]
    print(f"{term}: {count}")

# Show non-zero terms from TF-IDF.
# Terms are listed in the order nonzero() returns them; no ranking by score is applied.
print("\nTop terms in TF-IDF representation:")
tfidf_indices = tfidf_vector.nonzero()[1]
for idx in tfidf_indices:
    term = tfidf_feature_names[idx]
    score = tfidf_vector[0, idx]
    print(f"{term}: {score:.4f}")



[BoW] Non-zero features: 22 / 30661
[TF-IDF] Non-zero features: 22 / 30661

Top terms in BoW representation:
additional: 1
bolstered: 1
brought: 1
concerns: 1
easing: 1
economic: 1
excellent: 1
federal: 1
fueling: 1
hold: 1
increases: 1
investors: 1
market: 1
news: 1
rally: 1
rate: 1
reserve: 1
signals: 1
stock: 1
strong: 1
surge: 1
today: 1

Top terms in TF-IDF representation:
additional: 0.1757
bolstered: 0.3174
brought: 0.2151
concerns: 0.2397
easing: 0.2530
economic: 0.1494
excellent: 0.2608
federal: 0.1497
fueling: 0.3651
hold: 0.1857
increases: 0.1865
investors: 0.1859
market: 0.1146
news: 0.1719
rally: 0.2596
rate: 0.1334
reserve: 0.1661
signals: 0.2756
stock: 0.1161
strong: 0.1715
surge: 0.2466
today: 0.1253
