In [None]:
import os
from reuters_small_parser import ReutersParser
from sklearn.feature_extraction.text import TfidfVectorizer
corpora_root_path = os.path.abspath("../../corpora")
reuters_small_corpus = "reuters21578"
full_reuters_small_path = os.path.join(corpora_root_path, reuters_small_corpus)

In [None]:
def get_corpus_files(full_path):
    for root, _, file in os.walk(full_path):
        for file in file:
            if ".sgm" in file:
                # print(os.path.join(root,file))
                yield os.path.join(root,file)

In [None]:
def sample_document(vectorizer, vectorized_corpus, document_index):
    features = vectorizer.get_feature_names_out()
    for feature_index in vectorized_corpus[document_index].indices:
        print(features[feature_index])

In [None]:
parser = ReutersParser()
documents = {}
for file in get_corpus_files(full_reuters_small_path):
    with open(file, 'rb') as corpus_file:
        documents.update(parser.parse(corpus_file))
print(len(documents))

In [None]:
documents_with_topics = {document_id: document for document_id, document in documents.items() if len(document["topics"]) > 0}
print(len(documents_with_topics))

In [None]:
documents_with_topics_and_bodies = {document_id: document for document_id, document in documents_with_topics.items() if len(document["body"]) > 0}
print(len(documents_with_topics_and_bodies))

In [None]:
reuters_small_vectorizer = TfidfVectorizer(input='content', encoding="latin1", stop_words='english', min_df=0.001, max_df=0.9)
reuters_small_vectorized_corpus = reuters_small_vectorizer.fit_transform([document["body"] for document in documents_with_topics_and_bodies.values()])

In [None]:
document_corpus_index_map = {index: document_id for index, document_id in enumerate(documents_with_topics_and_bodies.keys())}


In [None]:
document_id = document_corpus_index_map[0]
document = documents_with_topics_and_bodies[document_id]
print(document["body"])
sample_document(reuters_small_vectorizer, reuters_small_vectorized_corpus, 0)

In [35]:
from collections import Counter
topic_list = os.path.join(full_reuters_small_path, "all-topics-strings.lc.txt")
with open(topic_list, "r") as topic_file:
    topics = topic_file.readlines()
    topics = [t.strip() for t in topics]
topic_counter = Counter({topic: 0 for topic in topics})
for document in documents_with_topics_and_bodies.values():
    topic_counter.update({topic: 1 for topic in document["topics"]})

print(topic_counter)

Counter({'earn': 3776, 'acq': 2210, 'money-fx': 684, 'grain': 574, 'crude': 566, 'trade': 514, 'interest': 424, 'ship': 295, 'wheat': 287, 'corn': 223, 'oilseed': 182, 'sugar': 175, 'dlr': 168, 'gnp': 153, 'coffee': 143, 'veg-oil': 136, 'gold': 133, 'money-supply': 126, 'nat-gas': 126, 'livestock': 112, 'soybean': 111, 'bop': 101, 'cpi': 101, 'copper': 77, 'carcass': 75, 'reserves': 73, 'cocoa': 68, 'jobs': 68, 'rice': 67, 'iron-steel': 65, 'cotton': 62, 'alum': 58, 'yen': 58, 'ipi': 57, 'gas': 55, 'meal-feed': 50, 'rubber': 49, 'barley': 48, 'zinc': 43, 'palm-oil': 42, 'pet-chem': 41, 'silver': 36, 'lead': 35, 'rapeseed': 35, 'sorghum': 34, 'tin': 33, 'strategic-metal': 32, 'wpi': 29, 'fuel': 28, 'hog': 26, 'soy-meal': 26, 'heat': 25, 'orange': 25, 'soy-oil': 25, 'retail': 23, 'housing': 18, 'lumber': 17, 'stg': 17, 'sunseed': 17, 'tea': 15, 'dmk': 14, 'lei': 13, 'oat': 13, 'income': 12, 'nickel': 11, 'platinum': 11, 'groundnut': 10, 'l-cattle': 9, 'jet': 8, 'rape-oil': 8, 'sun-oil': 

In [36]:
corpus_topics = {document_id: document["topics"] for document_id, document in documents_with_topics_and_bodies.items()}

In [41]:
grain_documents = []
for document_id, topics in corpus_topics.items():
    if "grain" in topics and len(topics) == 1:
        grain_documents.append(str(document_id))
print(','.join(grain_documents))

13067,13070,13458,6267,124,136,4524,4637,1299,1406,1623,1631,1731,1845,15303,15580,15914,8374,8613,8656,8686,8895,8943,14212,14313,14340,14389,14828,21123,19964,3390,5363,5408,5611,5800,5826,5972,11065,11768,11862,12338,12830,9907,16359,7154
