In [1]:
import csv
import json
import sys
from collections import OrderedDict, Counter
from numpy.random import choice
import time
import math
from operator import itemgetter
import copy

def runLDA(corpus, iterations, alpha, beta):
    """Run the LDA topic-resampling loop over every word position in a corpus.

    For each pass, every word position in every document is detached from
    its currently assigned topic, a per-topic probability list is requested
    from the corpus, a new topic is drawn from that distribution, and the
    word is re-attached under the drawn topic. A console progress bar is
    redrawn after every pass, including an estimate of the minutes remaining
    when that estimate is at least one minute.

    NOTE(review): the original description states that the words of each
    topic are printed to the shell once the iterations finish. The sampling
    loop itself only mutates `corpus` and draws the progress bar -- confirm
    where that output is produced.

    Args:
        corpus (CorpusData): A data structure that has already called "loadData"
            on a text. It is modified in place.
        iterations (int): The desired number of iterations for the LDA algorithm.
            More iterations lead to more consistent, coherent topics at the cost of
            a longer runtime.
        alpha (float): The first "hyperparameter" or "smoothing constant." Affects
            the P(w|t) calculation. When alpha is higher, documents tend to
            represent a greater variety of topics.
        beta (float): Another hyperparameter, this one affecting the P(t|d)
            calculation. A higher value for beta causes topics to contain a greater
            variety of words.

    NOTE(review): in the conventional LDA formulation alpha smooths P(t|d)
    and beta smooths P(w|t), which is the reverse of the pairing written
    above (while matching the described effects). Both values are passed
    straight through to corpus.calculateProbabilities -- confirm there.

    """
    printProgressBar(0, iterations, prefix='Progress', suffix='complete', length=50)
    for i in range(0, iterations):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing an elapsed interval.
        startTime = time.perf_counter()
        for doc in range(len(corpus.wordLocationArray)):
            for word in range(len(corpus.wordLocationArray[doc])):
                oldTopic = corpus.topicAssignmentByLoc[doc][word]
                # Detach the word from its current topic first so that the
                # probabilities are computed without its own assignment.
                corpus.removeWordFromDataStructures(word, doc, oldTopic)
                wordProbabilities = corpus.calculateProbabilities(doc, word, alpha, beta)
                # An int first argument makes choice() sample from
                # arange(n), avoiding a range -> array conversion per word.
                newTopic = choice(len(wordProbabilities), p=wordProbabilities)
                corpus.addWordToDataStructures(word, doc, newTopic)
        iterationSeconds = time.perf_counter() - startTime
        # Pass i has just finished, so i + 1 passes are done in total. The
        # estimate (whole minutes, rounded up) assumes every remaining pass
        # takes as long as this one did.
        remainingIterations = iterations - (i + 1)
        estTime = math.ceil(iterationSeconds * remainingIterations / 60)
        # NOTE(review): fixed 0.1 s pause per pass, kept from the original;
        # its purpose is not evident from this function.
        time.sleep(0.1)
        # estTime is 0 on the final pass (nothing remains) and whenever the
        # remaining work rounds to no time, so no estimate is shown then.
        if estTime > 0:
            printProgressBar(i + 1, iterations, prefix='Progress', suffix='complete', length=50, estTimeRemaining=estTime)
        else:
            printProgressBar(i + 1, iterations, prefix='Progress', suffix='complete', length=50)