In [2]:
import nltk

# Fetch the Reuters corpus into the local nltk_data directory; the cells
# below read it through nltk.corpus.reuters. Kept as the last expression so
# the cell still displays the call's return value.
nltk.download("reuters")

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/amoakoheskey/nltk_data...


True

In [3]:
import nltk
from nltk.corpus import reuters

# Every category label known to the corpus, and the file ids returned for them.
categories = reuters.categories()
documents = reuters.fileids(categories)

print("Total Categories:", len(categories))
print("Total Documents:", len(documents))

# Keep only the documents whose category list has more than two entries.
multi_cat_docs = list(
    filter(lambda doc: len(reuters.categories(doc)) > 2, documents)
)

# Index 1 selects the second matching document as the example to inspect.
doc_id = multi_cat_docs[1]

# Show the example's id, its labels, the raw text, and the tokenised words
# re-joined with single spaces.
print(f"Document with more than 2 Categories: {doc_id}")
print("Categories:", reuters.categories(doc_id))
print("Raw Document:\n", reuters.raw(doc_id))
print("Words:\n", " ".join(reuters.words(doc_id)))

Total Categories: 90
Total Documents: 10788
Document with more than 2 Categories: test/14840
Categories: ['coffee', 'lumber', 'palm-oil', 'rubber', 'veg-oil']
Raw Document:
 INDONESIAN COMMODITY EXCHANGE MAY EXPAND
  The Indonesian Commodity Exchange is
  likely to start trading in at least one new commodity, and
  possibly two, during calendar 1987, exchange chairman Paian
  Nainggolan said.
      He told Reuters in a telephone interview that trading in
  palm oil, sawn timber, pepper or tobacco was being considered.
      Trading in either crude palm oil (CPO) or refined palm oil
  may also be introduced. But he said the question was still
  being considered by Trade Minister Rachmat Saleh and no
  decision on when to go ahead had been made.
      The fledgling exchange currently trades coffee and rubber
  physicals on an open outcry system four days a week.
      "Several factors make us move cautiously," Nainggolan said.
  "We want to move slowly and safely so that we do not make a

In [4]:
import nltk
from nltk.corpus import reuters

# Every category label known to the corpus, and the file ids returned for them.
categories = reuters.categories()
documents = reuters.fileids(categories)

print(f"Total Categories: {len(categories)}")
print(f"Total Documents: {len(documents)}")

# Keep only the documents whose category list has exactly four entries.
multi_cat_docs = list(
    filter(lambda doc: len(reuters.categories(doc)) == 4, documents)
)

# Index 2 selects the third matching document as the example to inspect.
doc_id = multi_cat_docs[2]

# Show the example's id, its labels, and the tokenised words re-joined with
# single spaces.
print(f"Document with exactly 4 Categories: {doc_id}")
print("Categories:", reuters.categories(doc_id))
print("Words:\n", " ".join(reuters.words(doc_id)))

Total Categories: 90
Total Documents: 10788
Document with exactly 4 Categories: test/15341
Categories: ['corn', 'grain', 'veg-oil', 'wheat']
Words:
 USDA DISCUSSING PL 480 AGREEMENT WITH MOROCCO The U . S . Agriculture Department is currently discussing an amendment to a PL 480 agreement signed with Morocco on January 22 , but the mix of commodities under the amendment has not been determined , a U . S . Agriculture Department official said . The official noted the agreement signed in January provided for the supply of about 55 , 000 tonnes of vegetable oil , 55 , 000 tonnes of corn and 126 , 000 tonnes of wheat , all for delivery during the current fiscal year , ending this September 30 . No purchase authorizations for the commodities provided in the January agreement have been announced by the department .


In [1]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Restrict the experiment to the first five category labels the corpus returns
# and to the file ids returned for those labels.
categories = reuters.categories()[:5]
documents = reuters.fileids(categories)

# One space-joined token string per document. Each document gets a single
# label: the first entry of its own category list.
text_data = [" ".join(reuters.words(fileid)) for fileid in documents]
categories_data = [
    doc_labels[0] for doc_labels in map(reuters.categories, documents)
]

# Bag-of-words counts, capped at 1000 vocabulary terms via max_features.
# NOTE(review): the vectorizer is fitted on all documents before the
# train/test split below, so vocabulary selection also sees the test
# documents -- confirm this is acceptable for the reported scores.
count_vectorizer = CountVectorizer(max_features=1000)
X = count_vectorizer.fit_transform(text_data)

# Map the string labels to integer class ids. The encoder is not kept, so the
# report below shows the integer ids rather than category names.
y = LabelEncoder().fit_transform(categories_data)

# Seeded train/test split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Bagging ensemble of 100 unconstrained decision trees, seeded for repeatability.
bag_classifier = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    random_state=1,
)

# The sparse count matrices are densified before fitting and predicting.
bag_classifier.fit(X_train.toarray(), y_train)
y_pred = bag_classifier.predict(X_test.toarray())

# Quick side-by-side look at the first ten true and predicted class ids
# before the full per-class report.
print(f"First 10 y_test values: {y_test[:10]}")
print(f"First 10 y_pred values: {y_pred[:10]}")

print(classification_report(y_test, y_pred, zero_division=1))

First 10 y_test values: [0 0 0 0 0 0 4 0 0 0]
First 10 y_pred values: [0 0 0 0 0 0 1 0 0 0]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       601
           1       0.82      0.93      0.87        15
           2       1.00      1.00      1.00        12
           3       0.91      0.95      0.93        22
           4       0.90      0.75      0.82        12

    accuracy                           0.99       662
   macro avg       0.93      0.93      0.92       662
weighted avg       0.99      0.99      0.99       662



In [2]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the Reuters dataset but only consider the first 5 categories, together
# with the file ids returned for those categories.
categories = reuters.categories()[:5]
documents = reuters.fileids(categories)

# One space-joined token string per document. Each document gets a single
# label: the first entry of its own category list.
text_data = [" ".join(reuters.words(fileid)) for fileid in documents]
categories_data = [reuters.categories(fileid)[0] for fileid in documents]

# Bag-of-words counts, capped at 1000 vocabulary terms via max_features.
# NOTE(review): the vectorizer is fitted on all documents before the
# train/test split below, so vocabulary selection also sees the test
# documents -- confirm this is acceptable for the reported scores.
count_vectorizer = CountVectorizer(max_features=1000)
X = count_vectorizer.fit_transform(text_data)

# Map the string labels to integer class ids (the report shows these ids).
y = LabelEncoder().fit_transform(categories_data)

# Seeded train/test split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Bagging ensemble of 100 depth-limited (max_depth=5) decision trees with a
# fixed seed.
# The wrapped estimator is passed positionally on purpose: the keyword
# `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and removed
# in 1.4, so naming it raises TypeError on current releases, while the first
# positional argument works on both old and new versions.
bag_classifier = BaggingClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=100,
    random_state=42,
)

# The sparse count matrices are densified before fitting and predicting.
bag_classifier.fit(X_train.toarray(), y_train)

# Check the performance of the model on test data.
y_pred = bag_classifier.predict(X_test.toarray())
print(classification_report(y_test, y_pred, zero_division=1))




              precision    recall  f1-score   support

           0       0.98      1.00      0.99       601
           1       0.80      0.80      0.80        15
           2       1.00      0.83      0.91        12
           3       0.95      0.91      0.93        22
           4       1.00      0.67      0.80        12

    accuracy                           0.98       662
   macro avg       0.95      0.84      0.89       662
weighted avg       0.98      0.98      0.98       662

