In [None]:
import nltk

# Fetch the Reuters corpus package into the local nltk_data directory.
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\odqvi\AppData\Roaming\nltk_data...


True

In [None]:
# Unpack the Reuters archive next to the zip file (-q silences unzip's listing).
# NOTE(review): this is a hard-coded Linux location, while create_data_set()
# reads from "<cwd>/reuters" -- confirm where the corpus is expected to live.
! unzip -q /usr/share/nltk_data/corpora/reuters.zip -d /usr/share/nltk_data/corpora/

In [26]:
import os
import nltk
import numpy as np
import random
from nltk.corpus import stopwords
from collections import Counter

# NLTK resources needed below: the stop-word list used for filtering and the
# punkt models that nltk.word_tokenize relies on.
for _resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(_resource)

def lowercase_tokenizer(text):
    """Split ``text`` with ``nltk.word_tokenize`` and lowercase every token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercased token strings, in the order produced by the
        tokenizer.
    """
    lowered = []
    for token in nltk.word_tokenize(text):
        lowered.append(token.lower())
    return lowered


def create_data_set(dir="training", min_freq=10, corpus_limit=200000, number_of_topics=15,
                    seed=None, encoding="ISO-8859-2"):
    """Load tokenized documents from ``<cwd>/reuters/<dir>`` and assign random topics.

    Files are read in sorted name order until the running token count exceeds
    ``corpus_limit``. Stop words and words whose corpus count is ``<= min_freq``
    are then dropped from every document, and each surviving token gets a
    uniformly random topic id.

    Args:
        dir: Sub-directory of ``reuters`` to read. (The name shadows the
            ``dir`` builtin; it is kept so existing keyword callers still work.)
        min_freq: Words occurring this many times or fewer are removed.
        corpus_limit: Reading stops after the first file that pushes the total
            (unfiltered) token count above this value.
        number_of_topics: Topic ids are drawn from ``[0, number_of_topics)``.
        seed: Optional seed for the topic initialisation. ``None`` (default)
            draws from NumPy's global random state, as before.
        encoding: Text encoding used to open the corpus files. The default
            matches the encoding NLTK declares for its Reuters corpus reader.

    Returns:
        ``(docs, topic_map)`` where ``docs`` is a list of filtered token lists
        and ``topic_map`` maps the string key ``"<doc index>,<token index>"``
        (indices into the filtered documents) to an integer topic id.
    """
    path = os.path.join(os.getcwd(), "reuters", dir)
    # os.listdir returns entries in arbitrary order; sorting makes the subset
    # selected by corpus_limit (and the document indices) reproducible.
    files = sorted(os.listdir(path))

    docs = []
    word_counter = Counter()
    total_words = 0

    for file_name in files:
        file_path = os.path.join(path, file_name)
        file_words = []
        # Explicit encoding: the platform default differs between systems and
        # can fail on the non-ASCII bytes present in some corpus files.
        with open(file_path, 'r', encoding=encoding) as f:
            # Tokenize line by line, as the original did.
            for raw_line in f:
                file_words.extend(lowercase_tokenizer(raw_line))

        docs.append(file_words)
        word_counter.update(file_words)
        total_words += len(file_words)

        # The limit is checked after adding a file, so the corpus may exceed
        # corpus_limit by up to one document.
        if total_words > corpus_limit:
            break

    words_to_remove = set(stopwords.words("english"))
    words_to_remove.update(
        word for word, count in word_counter.items() if count <= min_freq
    )

    # With no seed, fall back to the module-level generator so behaviour is
    # unchanged; otherwise use a private, seeded generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    topic_map = {}
    for i, doc in enumerate(docs):
        kept = [word for word in doc if word not in words_to_remove]
        for j in range(len(kept)):
            topic_map[f"{i},{j}"] = rng.randint(0, number_of_topics)
        docs[i] = kept

    return docs, topic_map




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\odqvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\odqvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\odqvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [31]:
def create_word_mappings(docs):
    """Build word <-> integer id lookup tables for the vocabulary of ``docs``.

    Ids are assigned in sorted word order. Iterating the raw ``set`` would
    give an order that changes between interpreter runs (string hash
    randomisation), so the same corpus would get different ids after every
    kernel restart.

    Args:
        docs: Iterable of documents, each an iterable of word strings.

    Returns:
        ``(str_to_int, int_to_str)``: a dict from word to id and its inverse.
        Ids run from 0 to ``len(vocabulary) - 1``. Both dicts are empty when
        ``docs`` contains no words.
    """
    vocabulary = set()
    for doc in docs:
        vocabulary.update(doc)

    int_to_str = dict(enumerate(sorted(vocabulary)))
    str_to_int = {word: idx for idx, word in int_to_str.items()}
    return str_to_int, int_to_str

def get_d_n_k(docs, topic_map, k):
    """Count, per document, how many tokens are assigned to each topic.

    Args:
        docs: List of documents (token lists).
        topic_map: Dict keyed by ``"<doc index>,<token index>"`` giving the
            topic id of that token.
        k: Number of topics (number of columns in the result).

    Returns:
        A ``(len(docs), k)`` float array whose ``[d, t]`` entry is the number
        of tokens in document ``d`` assigned to topic ``t``.
    """
    counts = np.zeros((len(docs), k))
    for doc_idx, tokens in enumerate(docs):
        for token_idx, _ in enumerate(tokens):
            assigned = topic_map[f"{doc_idx},{token_idx}"]
            counts[doc_idx, assigned] += 1
    return counts

def get_m_k_v(docs, topic_map, k, str_to_int):
    """Count, per topic, how many times each vocabulary word is assigned to it.

    Args:
        docs: List of documents (token lists).
        topic_map: Dict keyed by ``"<doc index>,<token index>"`` giving the
            topic id of that token.
        k: Number of topics (number of rows in the result).
        str_to_int: Dict mapping each word to its column index.

    Returns:
        A ``(k, len(str_to_int))`` float array whose ``[t, w]`` entry is the
        number of occurrences of word ``w`` assigned to topic ``t``.
    """
    counts = np.zeros((k, len(str_to_int)))
    for doc_idx, tokens in enumerate(docs):
        for token_idx, token in enumerate(tokens):
            assigned = topic_map[f"{doc_idx},{token_idx}"]
            column = str_to_int[token]
            counts[assigned, column] += 1
    return counts



In [32]:
# Number of topics; passed to create_data_set so the random initial
# assignments fall in [0, k).
k = 15
docs, topic_map = create_data_set(number_of_topics=k)
# Number of documents that were loaded.
d = len(docs)

str_to_int, int_to_str = create_word_mappings(docs)

# Sanity checks: vocabulary size, document count, first filtered document.
print(len(str_to_int))
print(d)
print(docs[0])
# NOTE(review): prints the same value as print(d) above.
print(len(docs))
# Topic of token 5 in document 3; raises KeyError if that document has fewer
# than six tokens after filtering.
print(topic_map["3,5"])

1746
1367
['cocoa', 'review', 'continued', 'week', 'cocoa', ',', 'drought', 'since', 'early', 'january', 'prospects', 'coming', ',', 'although', 'normal', 'levels', ',', 'said', 'weekly', 'review', '.', 'dry', 'period', 'means', 'late', 'year', '.', 'week', 'ended', 'february', '22', 'bags', '60', 'making', 'cumulative', 'total', 'season', 'mln', 'stage', 'last', 'year', '.', 'seems', 'cocoa', 'earlier', 'included', 'figures', '.', 'said', 'still', 'much', 'old', 'crop', 'cocoa', 'still', 'available', 'come', 'end', '.', 'total', 'crop', 'estimates', 'around', 'mln', 'bags', 'sales', 'almost', 'mln', 'bags', 'still', 'farmers', ',', ',', 'exporters', '.', 'much', 'cocoa', 'would', 'export', 'certificates', '.', 'view', 'lower', 'quality', 'recent', 'weeks', 'farmers', 'sold', 'good', 'part', 'cocoa', 'held', '.', 'said', 'prices', 'rose', 'per', '15', '.', 'offer', 'nearby', 'shipment', 'limited', 'sales', 'march', 'shipment', 'dlrs', 'per', 'tonne', 'named', '.', 'new', 'crop', 'sales