In [1]:
import guidedlda
import json
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

TOPIC_NUMS_TUPLE = (10, 30, 50, 100, 150, 200, 300)
ITERATION_NUMS = 100
TOP_K_WORDS = 25
TOP_K_DOC = 10
CACHE = False
STOP_WORDS_FILE_PATH = 'stop_word_list.txt'
PROJECT_INFO_FILE_PATH = 'projectInfo2.json'
CAMPAIGN_CACHE_PATH = '_campaign.txt'
RAW_CAMPAIGN_CACHE_PATH = '_raw_campaign.json'


def main():
    X, word2id, vocab = load_data(PROJECT_INFO_FILE_PATH)
    save_file(vocab, '', 'vocabulary')
    loglikelihoods = []
    for topic_num in TOPIC_NUMS_TUPLE:
        model = guided_analysis(X, word2id, vocab, topic_num, 0.25)
        loglikelihoods.append(model.loglikelihood())
        retrieve_words_from(model, vocab, topic_num, n_top_words)
        calculate_loading(model, topic_num, human_readable=True)
    plt.plot(list(TOPIC_NUMS_TUPLE), loglikelihoods)
    plt.show()


def guided_analysis(X, word2id, vocab, topic_num, confidence, n_top_words=TOP_K_WORDS):
    """
    Guided Analysis on the given dtm
    """
    model = guidedlda.GuidedLDA(n_topics=topic_num, n_iter=ITERATION_NUMS, random_state=7, refresh=20)
    model.fit(X, seed_topics=load_seed_topics(word2id), seed_confidence= confidence)
    return model


def non_guided_analysis(X, vocab, topic_num, n_top_words=TOP_K_WORDS):
    """
    Non_guided Analysis on the given dtm
    """
    model = guidedlda.GuidedLDA(n_topics=topic_num, n_iter=ITERATION_NUMS, random_state=7, refresh=20)
    model.fit(X)
    retrieve_words_from(model, vocab, topic_num, n_top_words)


def calculate_loading(model, topic_num, n_top_docs=TOP_K_DOC, human_readable=False, sorting = False):
    """
    Export the loading for the model
    """
    doc_topic = model.doc_topic_
    doc_num = doc_topic.shape[0]
    result = []
    if sorting:
        sorted_index = np.argsort(doc_topic, axis=0)  # sort the data with index row by row
        loading_set = range(doc_num - 1, doc_num - n_top_docs - 1, -1)
        if human_readable:
            data_list = read_json(RAW_CAMPAIGN_CACHE_PATH)
            campaigns = [' '.join(project['ProjectCampaign']).replace('\n', '') for project in data_list]
        else:
            campaigns = read_file(CAMPAIGN_CACHE_PATH)
        for topic_id in range(4):
            result.append("--------- Topic {} ---------".format(topic_id))
            for ranking in loading_set:
                doc_id = sorted_index[ranking, topic_id]
                result.append(
                    "Document {} {} : {}".format(doc_id, format_digit(doc_topic[doc_id, topic_id]), campaigns[doc_id]))
    else:
        result.append("          {}".format(" ".join(["doc " + str(i) for i in range(n_top_docs)])))
        for topic_id in range(4):
            result.append("Topic {} : {}".format(str(topic_id), " ".join([str(format_digit(doc_topic[doc_id][topic_id])) for doc_id in range(10)])))
    save_file(result, topic_num, 'document_loading')

    
def dt_matrix(model, n_top_docs, topic_num):
    """
    Export the document - topic matrix
    """
    doc_topic = model.doc_topic_
    result = []
    for i, docs_dist in enumerate(doc_topic):
        doc_topic_assignment = np.sort(docs_dist)[: -(n_top_docs + 1): -1]
        result.append('Document {} : {}'.format(i, ','.join(map(str, doc_topic_assignment))))
    save_file(result, topic_num, 'document_topic_matrix')


def tw_matrix(model, n_top_words, topic_num):
    """
    Export the topic- word matrix
    """
    topic_word = model.topic_word_
    result = []
    for i, word_dist in enumerate(topic_word):
        topic_word_assignment = np.sort(word_dist)[: -(n_top_words + 1): - 1]
        result.append('Topic {} : {}'.format(i, ','.join(map(str, topic_word_assignment))))
    save_file(result, topic_num, 'topic_word_matrix')


def unique_words(model, vocab, n_top_words, topic_num):
    """
    Find out the unique words for the topics
    """
    word_topic = model.word_topic_
    result = []
    for i, topic_dist in enumerate(word_topic):
        beta_dist = np.array(list(map(lambda x: float(x) / (1 - x), topic_dist)))
        # pick n_top beta for the words in the topic
        sorted_index = np.argsort(beta_dist)[: -(n_top_words + 1): - 1]
        sorted_beta = beta_dist[sorted_index]
        result.append(('{} : {}'.format(vocab[i], ','.join(
            list(map(lambda x, y: str(x) + '(' + str(y) + ')', sorted_index, sorted_beta))))))
    save_file(result, topic_num, 'word_uniqueness_matrix')


def retrieve_words_from(model, vocab, topic_num, n_top_words):
    """
    Retrieve the top k topics
    """
    topic_word = model.topic_word_
    result = []
    for i, topic_dist in enumerate(topic_word):
        if (i == 4): break;
        word_index = np.argsort(topic_dist)[:-(n_top_words + 1): -1]
        topic_words = np.array(vocab)[word_index]
        topic_words_assignment = topic_dist[word_index]
        result.append('Topic {} : {}'.format(i, ','.join(
            list(map(lambda x, y: str(x) + '(' + str(format_digit(y)) + ')', topic_words, topic_words_assignment)))))
    save_file(result, topic_num, 'topic_words_matrix')


def load_seed_topics(word2id):
    """
    Construct the seeds_topic dictionary
    :param word2id:
    :return:
    """
    seed_topic_list = {
        'team': ['crew', 'team', 'crews', 'seriously', 'experienced', 'collaborators', 'collaborator', 'programmer',
                 'programmer', 'marketing', 'brothers', 'brother', 'leadership', 'leader', 'developer', 'developers',
                 'designer', 'designers', 'leadership', 'post', 'artist', 'artists', 'writer', 'writers', 'director',
                 'directors', 'research', ' researcher', 'researchers', 'develops', 'cinematography'],
        'product': ['characteristics', 'characteristic', 'theme', 'role', 'evolution', 'feature', 'story',
                    'storytelling', 'attribute', 'attributes', 'features', 'design', 'designs', 'expansion',
                    'expansions', 'interactive', ' aspect', 'aspects', 'language', 'languages', 'gameplay', 'gear',
                    'gears', 'ammunition', 'potion', 'potions', 'experience', 'rpg', 'level', 'levels', 'venture',
                    'ventures', 'activity', 'adventure', 'adventures', 'adventurer', 'adventurers', 'character',
                    'characters', 'system', 'systems', 'job', 'jobs', 'pack', 'packs'],
        'motivation': ['reason', 'inspired', 'suggest', 'fun', 'support', 'exciting', 'opportunity',
                       'idea', 'ideas', 'excited', 'confident', 'feedback', 'ontrack'],
        'rewards': ['goals', 'goal', 'pledge', 'access', 'reward', 'level', 'rewards', 'tier', 'appreciate',
                    'appreciation', 'offer', 'bonus', 'promotion', 'price', 'share', 'bundle']
    }
    seed_topics = {}
    for tid, seeds in enumerate(seed_topic_list.values()):
        for word in seeds:
            lower_word = word.lower()
            if lower_word in word2id.keys():
                seed_topics[word2id[lower_word]] = tid
    return seed_topics


def format_digit(input):
    return '{0:.3f}'.format(input)


def get_wordnet_pos(treebank_tag):
    """
    get part of speech from tree bank ag
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def load_stopwords(filepath):
    """
    Load the stop words
    """
    stop_words = set(stopwords.words('english'))
    with open(filepath) as fp:
        for line in fp:
            stop_words.add(line.rstrip('\n'))
    return stop_words


def filter_words(tokens, stop_words):
    """
    filter the word by nltk stopwords and length
    """
    return [w for w in tokens if w not in stop_words and len(w) > 3]


def clean_text(text):
    text = re.sub(r"\S*@\S*", " ", text)  # remove email address
    text = re.sub(r"((:?http|https)://)?[-./?:@_=#\w]+\.(?:[a-zA-Z]){2,6}(?:[-\w.&/?:@_=#()])*", " ",
                  text)  # remove urls
    text = re.sub(r"[-!?=~|#$+%*&@:/(){}\[\],\"'._]", " ", text)  # remove punctuations
    text = re.sub(r"\d+", " ", text)  # remove digits
    text = re.sub(r"\b(\w)\1+\b", " ", text)  # remove meaningless word composed

    return text


def nltk_lemmatize(campaign_list):
    """
    Return the campaign list after being lemmatized
    """
    result = []
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    stop_words = load_stopwords(STOP_WORDS_FILE_PATH)
    for campaign in campaign_list:
        token_list = []
        tokens = filter_words(nltk.word_tokenize(campaign, language='english'), stop_words)
        pos_tags = nltk.pos_tag(tokens)
        for word, tag in pos_tags:
            token_list.append(wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)))
        result.append(" ".join(token_list))
    return result


def spacy_lemmatize(campaign_list):
    """
    Return the campaign list after being lemmatized by spacy
    """
    result = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for campaign in campaign_list:
        doc = nlp(campaign)
        result.append(" ".join([token.lemma_ for token in doc if token.lemma_ != '-PRON-']))
    return result

# * -------------- Utils function ------------ *


# def save_file(data, topic_num, filename):
#     with open('%s_%s.txt' % (str(topic_num), filename), 'w') as fp:
#         for item in data:
#             fp.write(item + "\n")
#         fp.close()

def save_file(data, topic_num, filename):
    for item in data:
        print(item)


def read_file(filename):
    with open(filename, 'r') as fp:
        data_list = [line.rstrip('\n') for line in fp]
        fp.close()
    return data_list


def read_json(filename):
    with open(filename, 'r') as fp:
        data_list = list(json.load(fp))
        fp.close()
    return data_list


def dump_json(data, filename):
    with open('_' + filename + '.json', 'w') as fp:
        json.dump(data, fp)
        fp.close()


def load_data(path):
    """
    1. Load the data from file
    2. clean the text
    3. lemmatize the text
    4. extract the vocabulary from the documents
    5. convert the data to document - term matrix
    """
    if CACHE:
        campaign_list = read_file('_campaign.txt')
    else:
        data_list = read_json(path)
        campaign_list = []
        if data_list:
            raw_campaign_list = []
            cleaned_list = nltk_lemmatize(
                [clean_text((" ".join(item['ProjectCampaign'])).lower()) for item in data_list])
            for i, campaign in enumerate(cleaned_list):
                if len(campaign) > 0:
                    campaign_list.append(campaign)
                    raw_campaign_list.append(data_list[i])
            save_file(campaign_list, '', 'campaign')
            dump_json(raw_campaign_list, 'raw_campaign')
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1),
                                 lowercase=True, analyzer='word')
    X = vectorizer.fit_transform(campaign_list).toarray()
    word2id = vectorizer.vocabulary_
    vocab = vectorizer.get_feature_names()
    return X, word2id, vocab


In [15]:
loglikelihoods = []
models = []

for topic_num in TOPIC_NUMS_TUPLE:
    model = guided_analysis(X, word2id, vocab, topic_num, 0.2)
    loglikelihoods.append(model.loglikelihood())
    models.append(model)

INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 10
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2409332
INFO:guidedlda:<20> log likelihood: -1908403
INFO:guidedlda:<40> log likelihood: -1876886
INFO:guidedlda:<60> log likelihood: -1861633
INFO:guidedlda:<80> log likelihood: -1851795
INFO:guidedlda:<99> log likelihood: -1845478
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 30
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2758960
INFO:guidedlda:<20> log likelihood: -1924172
INFO:guidedlda:<40> log likelihood: -1883967
INFO:guidedlda:<60> log likelihood: -1864945
INFO:guidedlda:<80> log likelihood: -1852495
INFO:guidedlda:<99> log likelihood: -1844971
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 50
INFO:guidedlda:n_iter: 100
INFO:guid

In [16]:
for i, model in enumerate(models):
    topic_num = TOPIC_NUMS_TUPLE[i]
    print("------------------------- {}-topic Document Loading -----------------------------------".format(topic_num))
    calculate_loading(model, topic_num, human_readable=True)

------------------------- 10-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.122 0.184 0.000 0.184 0.217 0.123 0.037 0.058 0.080 0.056
Topic 1 : 0.000 0.000 0.000 0.123 0.184 0.000 0.000 0.128 0.241 0.040
Topic 2 : 0.279 0.000 0.504 0.292 0.000 0.398 0.041 0.099 0.216 0.104
Topic 3 : 0.093 0.115 0.000 0.000 0.014 0.000 0.799 0.000 0.000 0.000
------------------------- 30-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.000 0.218 0.000 0.000 0.103 0.081 0.000 0.000 0.000 0.040
Topic 1 : 0.032 0.000 0.000 0.000 0.089 0.000 0.000 0.172 0.000 0.000
Topic 2 : 0.093 0.000 0.000 0.000 0.000 0.388 0.000 0.000 0.000 0.000
Topic 3 : 0.097 0.000 0.045 0.000 0.000 0.000 0.540 0.000 0.008 0.000
------------------------- 50-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 do

In [17]:
models = []
for topic_num in TOPIC_NUMS_TUPLE:
    model = guided_analysis(X, word2id, vocab, topic_num, 0.4)
    loglikelihoods.append(model.loglikelihood())
    models.append(model)

INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 10
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2405058
INFO:guidedlda:<20> log likelihood: -1904828
INFO:guidedlda:<40> log likelihood: -1876769
INFO:guidedlda:<60> log likelihood: -1863426
INFO:guidedlda:<80> log likelihood: -1855310
INFO:guidedlda:<99> log likelihood: -1849345
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 30
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2751313
INFO:guidedlda:<20> log likelihood: -1926317
INFO:guidedlda:<40> log likelihood: -1886859
INFO:guidedlda:<60> log likelihood: -1868744
INFO:guidedlda:<80> log likelihood: -1856209
INFO:guidedlda:<99> log likelihood: -1848332
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 50
INFO:guidedlda:n_iter: 100
INFO:guid

In [18]:
for i, model in enumerate(models):
    topic_num = TOPIC_NUMS_TUPLE[i]
    print("------------------------- {}-topic Document Loading -----------------------------------".format(topic_num))
    calculate_loading(model, topic_num, human_readable=True)

------------------------- 10-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.251 0.000 0.000 0.261 0.245 0.156 0.048 0.026 0.198 0.032
Topic 1 : 0.044 0.046 0.000 0.077 0.247 0.000 0.022 0.277 0.075 0.000
Topic 2 : 0.215 0.000 0.333 0.292 0.000 0.535 0.053 0.115 0.158 0.072
Topic 3 : 0.059 0.080 0.000 0.000 0.034 0.000 0.788 0.000 0.010 0.016
------------------------- 30-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.042 0.126 0.027 0.138 0.000 0.099 0.000 0.000 0.000 0.032
Topic 1 : 0.000 0.000 0.000 0.000 0.129 0.000 0.000 0.111 0.000 0.032
Topic 2 : 0.036 0.000 0.000 0.000 0.000 0.043 0.000 0.081 0.000 0.000
Topic 3 : 0.091 0.000 0.000 0.000 0.000 0.000 0.516 0.000 0.000 0.000
------------------------- 50-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 do

In [19]:
models = []
for topic_num in TOPIC_NUMS_TUPLE:
    model = guided_analysis(X, word2id, vocab, topic_num, 0.6)
    loglikelihoods.append(model.loglikelihood())
    models.append(model)

INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 10
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2399000
INFO:guidedlda:<20> log likelihood: -1901038
INFO:guidedlda:<40> log likelihood: -1871679
INFO:guidedlda:<60> log likelihood: -1859544
INFO:guidedlda:<80> log likelihood: -1851602
INFO:guidedlda:<99> log likelihood: -1845987
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 30
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2741375
INFO:guidedlda:<20> log likelihood: -1926890
INFO:guidedlda:<40> log likelihood: -1884658
INFO:guidedlda:<60> log likelihood: -1866547
INFO:guidedlda:<80> log likelihood: -1853724
INFO:guidedlda:<99> log likelihood: -1845131
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 50
INFO:guidedlda:n_iter: 100
INFO:guid

In [20]:
for i, model in enumerate(models):
    topic_num = TOPIC_NUMS_TUPLE[i]
    print("------------------------- {}-topic Document Loading -----------------------------------".format(topic_num))
    calculate_loading(model, topic_num, human_readable=True)

------------------------- 10-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.112 0.241 0.162 0.169 0.225 0.109 0.047 0.014 0.000 0.032
Topic 1 : 0.099 0.000 0.000 0.215 0.261 0.000 0.000 0.399 0.218 0.000
Topic 2 : 0.274 0.000 0.612 0.092 0.000 0.644 0.081 0.105 0.195 0.024
Topic 3 : 0.089 0.080 0.000 0.000 0.049 0.000 0.755 0.000 0.013 0.000
------------------------- 30-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.000 0.241 0.000 0.000 0.101 0.114 0.039 0.000 0.038 0.000
Topic 1 : 0.065 0.000 0.018 0.000 0.212 0.000 0.000 0.185 0.000 0.000
Topic 2 : 0.116 0.000 0.144 0.015 0.000 0.218 0.000 0.000 0.178 0.000
Topic 3 : 0.103 0.218 0.108 0.000 0.014 0.000 0.448 0.000 0.000 0.000
------------------------- 50-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 do

In [21]:
models = []
for topic_num in TOPIC_NUMS_TUPLE:
    model = guided_analysis(X, word2id, vocab, topic_num, 0.8)
    loglikelihoods.append(model.loglikelihood())
    models.append(model)

INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 10
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2391010
INFO:guidedlda:<20> log likelihood: -1898718
INFO:guidedlda:<40> log likelihood: -1868519
INFO:guidedlda:<60> log likelihood: -1854781
INFO:guidedlda:<80> log likelihood: -1846805
INFO:guidedlda:<99> log likelihood: -1841224
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 30
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2729190
INFO:guidedlda:<20> log likelihood: -1917398
INFO:guidedlda:<40> log likelihood: -1882315
INFO:guidedlda:<60> log likelihood: -1864365
INFO:guidedlda:<80> log likelihood: -1852609
INFO:guidedlda:<99> log likelihood: -1844151
INFO:guidedlda:n_documents: 552
INFO:guidedlda:vocab_size: 17500
INFO:guidedlda:n_words: 211683
INFO:guidedlda:n_topics: 50
INFO:guidedlda:n_iter: 100
INFO:guid

In [23]:
for i, model in enumerate(models):
    topic_num = TOPIC_NUMS_TUPLE[i]
    print("------------------------- {}-topic Document Loading -----------------------------------".format(topic_num))
    calculate_loading(model, topic_num, human_readable=True)

------------------------- 10-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.089 0.310 0.090 0.246 0.243 0.128 0.034 0.009 0.088 0.056
Topic 1 : 0.055 0.000 0.000 0.169 0.190 0.000 0.018 0.303 0.140 0.040
Topic 2 : 0.393 0.000 0.774 0.123 0.123 0.317 0.080 0.295 0.271 0.000
Topic 3 : 0.125 0.138 0.000 0.000 0.038 0.000 0.786 0.021 0.030 0.000
------------------------- 30-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 doc 4 doc 5 doc 6 doc 7 doc 8 doc 9
Topic 0 : 0.030 0.069 0.000 0.107 0.184 0.180 0.000 0.000 0.000 0.056
Topic 1 : 0.000 0.000 0.009 0.215 0.223 0.000 0.000 0.125 0.215 0.000
Topic 2 : 0.063 0.000 0.350 0.199 0.000 0.090 0.000 0.004 0.120 0.000
Topic 3 : 0.072 0.172 0.000 0.000 0.040 0.000 0.348 0.057 0.018 0.024
------------------------- 50-topic Document Loading -----------------------------------
          doc 0 doc 1 doc 2 doc 3 do

In [22]:
data_list = read_json(RAW_CAMPAIGN_CACHE_PATH)
campaigns_raw = [' '.join(project['ProjectCampaign']).replace('\n', '') for project in data_list]
for i, campaign in enumerate(campaigns_raw):
    if (i == 10): break
    print ("------------------------------------------ Doucment {} -------------------------------------------------------".format(i))
    print (campaign)

------------------------------------------ Doucment 0 -------------------------------------------------------
                 Ocean Crisis is a cooperative worker placement game which depicts this real life marine ecosystem disaster caused by pollution. The game also simulates some of the crucial approaches and policies that we can adopt in reality to cope with such disaster.   Players take on the roles of the Ocean Defense Team. By cleaning up the massive amount of garbage, conducting researches and developing eco-skills, team members will strive to stop the formation of the Great Pacific Garbage Patch, a.k.a., the Garbage Island.             Within a game, players will also go through a number of environmental rescue missions and scenarios regarding marine life and the ecosystem, each providing unique mechanisms and storylines for players to challenge.       GAMEPLAY OVERVIEW                            Each Ocean Crisis game starts with 10 meeples, regardless of the numbers of playe

In [6]:
campaigns = read_file(CAMPAIGN_CACHE_PATH)
for i, campaign in enumerate(campaigns):
    if (i == 10): break
    print ("------------------------------------------ Doucment {} -------------------------------------------------------".format(i))
    print (campaign)

------------------------------------------ Doucment 0 -------------------------------------------------------
phronesis card ideation classic collector design help magician generate idea create original magic deck card design help magician unlock creativity ideation edition build proven method creativity combine question performer style motivation think starter method prop structure routine classic edition mark deck worker deck feature elegant back design ideation edition combine traditional face back fully mark suit card color aronson stack mnemonica stack subtle design collector ideation edition limit ideation deck tuck luxurious deep blue matte finish case produce industry lead tuck manufacturing expert cleave press tuck gold white foil emboss note back deck fully mark suit color aronson stack number mnemonica stack number subtle feature love honor part armstrong deck card design tool help magician creative generate idea spark innovative thought simply ideation deck interrupt ordina

In [7]:
for i, model in enumerate(models):
    if (i == 9): break
    topic_num = TOPIC_NUMS_TUPLE[i]
    print("---------------------------- {} Document words  ------------------------------------".format(topic_num))
    retrieve_words_from(model, vocab, topic_num, TOP_K_WORDS)

---------------------------- 10 Document words  ------------------------------------
Topic 0 : work(0.017),design(0.014),artist(0.010),people(0.010),child(0.009),help(0.008),world(0.007),community(0.007),designer(0.007),create(0.007),love(0.006),life(0.006),kid(0.006),music(0.006),video(0.005),learn(0.005),company(0.005),friend(0.005),school(0.005),feature(0.004),team(0.004),digital(0.004),provide(0.004),support(0.004),social(0.004)
Topic 1 : character(0.036),book(0.023),adventure(0.022),level(0.014),world(0.013),story(0.012),dungeon(0.011),create(0.010),rule(0.009),creature(0.009),reward(0.008),stretch(0.008),design(0.008),item(0.008),class(0.008),campaign(0.008),monster(0.007),master(0.006),fantasy(0.006),feature(0.006),city(0.006),set(0.006),role(0.006),magic(0.006),digital(0.006)
Topic 2 : board(0.012),start(0.011),help(0.010),base(0.009),experience(0.008),create(0.008),turn(0.007),world(0.007),work(0.007),team(0.007),support(0.007),rule(0.006),love(0.006),point(0.006),think(0.006)