In [9]:

# 1) IMPORT LIBRARIES
import nltk                        # NLTK = Natural Language Toolkit (text processing library)

# Download datasets inside NLTK (only first time)
nltk.download('reuters')          # Reuters news articles dataset
nltk.download('stopwords')        # List of common stopwords like "the, and, is"

from nltk.corpus import reuters   # Import the Reuters dataset from NLTK
from nltk.corpus import stopwords # Import English stopwords
from nltk.stem import PorterStemmer  # Stemming = reduce words ("trading" -> "trade")

import re                         # Regular expressions for text cleaning

# ---------------------------------------------------------
# 2) LOAD ALL DOCUMENTS FROM REUTERS
# ---------------------------------------------------------

# Fetch the ID list a single time and build everything else from it, so that
# docs[i] is guaranteed to be the text of file_ids[i].
file_ids = reuters.fileids()
# file_ids = a list of IDs like "training/12345" identifying each article

docs = [reuters.raw(fid) for fid in file_ids]
# docs = a list that contains the FULL TEXT of each article, in file_ids order

# Let's inspect the dataset:
print("Total number of documents:", len(docs))
print("Example document ID:", file_ids[0])
print("Preview of first document:")
print(docs[0][:500])              # Print first 500 characters


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total number of documents: 10788
Example document ID: test/14826
Preview of first document:
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo


In [10]:
# 2) PREPROCESSING FUNCTION


# Porter stemmer instance, built once here and reused by preprocess()
stemmer = PorterStemmer()

# English stopword list, converted to a set a single time up front
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """
    Turn a raw document string into a list of stemmed tokens.

    Pipeline:
    1. Convert the text to lowercase.
    2. Delete every character that is not a-z or whitespace. Characters are
       removed, not replaced by a space, so pieces joined only by
       punctuation fuse together (e.g. "far-reaching" -> "farreaching"
       before stemming).
    3. Split the remaining text on whitespace.
    4. Skip tokens that appear in the module-level ``stop_words`` set.
    5. Stem every surviving token with the module-level ``stemmer``.

    Returns the stemmed tokens as a list, in document order.
    """
    # Steps 1-2: lowercase, then keep only letters and whitespace
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Steps 3-5: walk the whitespace-separated tokens, dropping stopwords
    stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return stems


# 3) TEST THE FUNCTION ON THE FIRST DOCUMENT

# Run the pipeline on the first article only, as a quick sanity check
sample_tokens = preprocess(docs[0])

# Display just the leading 30 cleaned tokens
preview = sample_tokens[:30]
print(preview)


['asian', 'export', 'fear', 'damag', 'usjapan', 'rift', 'mount', 'trade', 'friction', 'us', 'japan', 'rais', 'fear', 'among', 'mani', 'asia', 'export', 'nation', 'row', 'could', 'inflict', 'farreach', 'econom', 'damag', 'businessmen', 'offici', 'said', 'told', 'reuter', 'correspond']


In [11]:
import json
import pickle
# Clean every article. Each token list is joined back into one
# space-separated string, so cleaned_docs[i] is the cleaned text of docs[i].
cleaned_docs = [" ".join(preprocess(doc)) for doc in docs]

# The two files written below are paired by position, so fail loudly here
# rather than save mismatched data.
assert len(cleaned_docs) == len(file_ids), "cleaned_docs and file_ids are misaligned"

print("Done! Total cleaned docs:", len(cleaned_docs))

# save file_ids (list of strings); the encoding is explicit so the file does
# not depend on the platform's default text encoding
with open("file_ids.json", "w", encoding="utf-8") as f:
    json.dump(file_ids, f)

# save cleaned_docs (list of cleaned text strings)
with open("cleaned_docs.pkl", "wb") as f:
    pickle.dump(cleaned_docs, f)



Done! Total cleaned docs: 10788
