# In [1]:  (Jupyter notebook cell prompt kept from the export)
import guidedlda
import json
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Topic counts evaluated in main(); one guided model is fitted per value.
TOPIC_NUMS_TUPLE = (10, 30, 50, 100, 150, 200, 300)
# Passed as n_iter to every GuidedLDA model built in this file.
ITERATION_NUMS = 100
# Default for the n_top_words argument of the analysis functions.
TOP_K_WORDS = 25
# Extra stop words, one per line, merged with NLTK's English list by load_stopwords().
STOP_WORDS_FILE_PATH = 'stop_word_list.txt'

def main():
    """
    Fit one guided model per topic count and plot the log likelihoods

    Loads the campaign data, exports the vocabulary, runs guided_analysis for
    every entry of TOPIC_NUMS_TUPLE and plots the resulting log likelihood
    against the number of topics.  The plot is written to
    'loglikelihood_topicnum.png' and then displayed.
    """
    loaded = load_data('../projectInfo.json')
    if loaded is None:
        # load_data returns None when the JSON file holds no records;
        # unpacking it directly would fail with an unhelpful TypeError.
        print('No campaign data found in ../projectInfo.json; nothing to analyse.')
        return
    X, word2id, vocab = loaded
    save_file(vocab, '', 'vocabulary')
    loglikelihoods = [
        guided_analysis(X, word2id, vocab, topic_num).loglikelihood()
        for topic_num in TOPIC_NUMS_TUPLE
    ]
    plt.plot(list(TOPIC_NUMS_TUPLE), loglikelihoods)
    # Save before show(): once show() returns the figure may already have been
    # released, in which case savefig() would write an empty image.
    plt.savefig('loglikelihood_topicnum.png')
    plt.show()



def guided_analysis(X, word2id, vocab, topic_num, n_top_words=TOP_K_WORDS):
    """
    Fit a seeded (guided) LDA model on the given document-term matrix

    The seeds come from load_seed_topics(word2id).  After fitting, the top
    words of every topic are exported through retrieve_words_from.
    :param X: document-term matrix handed to GuidedLDA.fit
    :param word2id: mapping from vocabulary word to its column id
    :param vocab: vocabulary words, indexable by column id
    :param topic_num: number of topics to fit
    :param n_top_words: how many words to export per topic
    :return: the fitted GuidedLDA model
    """
    lda = guidedlda.GuidedLDA(
        n_topics=topic_num,
        n_iter=ITERATION_NUMS,
        random_state=7,
        refresh=20,
    )
    seeds = load_seed_topics(word2id)
    lda.fit(X, seed_topics=seeds, seed_confidence=0.25)
    retrieve_words_from(lda, vocab, topic_num, n_top_words)
    return lda


def non_guided_analysis(X, vocab, topic_num, n_top_words=TOP_K_WORDS):
    """
    Fit an unseeded LDA model on the given document-term matrix

    After fitting, the top words per topic, the document-topic matrix and the
    topic-word matrix are exported (the two matrices with their 10 largest
    values per row).  Nothing is returned.
    :param X: document-term matrix handed to GuidedLDA.fit
    :param vocab: vocabulary words, indexable by column id
    :param topic_num: number of topics to fit
    :param n_top_words: how many words to export per topic
    """
    lda = guidedlda.GuidedLDA(
        n_topics=topic_num,
        n_iter=ITERATION_NUMS,
        random_state=7,
        refresh=20,
    )
    lda.fit(X)
    retrieve_words_from(lda, vocab, topic_num, n_top_words)
    dt_matrix(lda, 10, topic_num)
    tw_matrix(lda, 10, topic_num)
    # The word-uniqueness export is currently switched off:
    # unique_words(lda, vocab, 10, topic_num)


def dt_matrix(model, n_top_docs, topic_num):
    """
    Export the largest document-topic values of every document

    For each row of model.doc_topic_ the n_top_docs largest values are
    written in descending order.  Only the values are exported; the topic
    indices they belong to are not part of the output.
    """
    lines = []
    for doc_id, distribution in enumerate(model.doc_topic_):
        # sort ascending, flip, keep the leading n_top_docs entries
        largest = np.sort(distribution)[::-1][:n_top_docs]
        joined = ','.join(str(value) for value in largest)
        lines.append('Document {} : {}'.format(doc_id, joined))
    save_file(lines, topic_num, 'document_topic_matrix')


def tw_matrix(model, n_top_words, topic_num):
    """
    Export the largest topic-word values of every topic

    For each row of model.topic_word_ the n_top_words largest values are
    written in descending order.  Only the values are exported; the words
    they belong to are not part of the output.
    """
    lines = []
    for topic_id, distribution in enumerate(model.topic_word_):
        # sort ascending, flip, keep the leading n_top_words entries
        largest = np.sort(distribution)[::-1][:n_top_words]
        joined = ','.join(str(value) for value in largest)
        lines.append('Topic {} : {}'.format(topic_id, joined))
    save_file(lines, topic_num, 'topic_word_matrix')


def unique_words(model, vocab, n_top_words, topic_num):
    """
    Export, for every word, the topics in which the word scores highest

    Each value p in the word's row of model.word_topic_ is converted to the
    ratio p / (1 - p).  The n_top_words largest ratios are written on the
    word's line as "topic_index(ratio)" pairs, largest first.
    """
    lines = []
    for word_id, topic_dist in enumerate(model.word_topic_):
        ratios = np.array([float(p) / (1 - p) for p in topic_dist])
        # indices of the largest ratios, in descending order
        best_topics = np.argsort(ratios)[::-1][:n_top_words]
        pairs = [
            str(topic) + '(' + str(ratio) + ')'
            for topic, ratio in zip(best_topics, ratios[best_topics])
        ]
        lines.append('{} : {}'.format(vocab[word_id], ','.join(pairs)))
    save_file(lines, topic_num, 'word_uniqueness_matrix')


def retrieve_words_from(model, vocab, topic_num, n_top_words):
    """
    Export the top k words of every topic

    For each row of model.topic_word_ the n_top_words highest-weighted words
    are written as "word(weight)" pairs, with the weight formatted to two
    decimals by format_result.
    """
    topic_word = model.topic_word_
    vocab_array = np.array(vocab)
    lines = []
    for topic_id, topic_dist in enumerate(topic_word):
        # indices of the heaviest words, in descending order of weight
        top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
        words = vocab_array[top_idx]
        weights = format_result(topic_dist[top_idx])
        pairs = [str(word) + '(' + str(weight) + ')' for word, weight in zip(words, weights)]
        lines.append('Topic {} : {}'.format(topic_id, ','.join(pairs)))
    save_file(lines, topic_num, 'topic_words_matrix')

def format_result(data_list):
    """
    Format every number of data_list as a string with two decimals
    """
    formatted = []
    for value in data_list:
        formatted.append(format(value, ".2f"))
    return formatted

def load_seed_topics(word2id):
    """
    Construct the seed_topics dictionary

    Every seed word that exists in the vocabulary is mapped to the index of
    its seed group, following the order in which the groups are listed below
    (0 = team, 1 = product, 2 = motivation, 3 = rewards).  Seed words are
    stripped and lower-cased before the lookup so that stray whitespace or
    capitals in the lists cannot silently drop a seed.
    :param word2id: mapping from vocabulary word to its word id
    :return: dict mapping word id -> seed topic id; empty when no seed word
             is part of the vocabulary
    """
    # A tuple of pairs keeps the topic ids tied to the listed order.
    seed_topic_list = (
        ('team', ['crew', 'team', 'seriously', 'experienced', 'collaborator', 'programmer',
                  'programmer', 'marketing', 'brother', 'leadership', 'leader', 'developer',
                  'designer', 'leadership', 'post', 'artist', 'writer', 'director',
                  'research', 'researcher', 'researchers', 'develop', 'cinematography']),
        ('product', ['characteristic', 'theme', 'role', 'evolution', 'feature', 'story',
                     'storytelling', 'attribute', 'design', 'expansion', 'interactive',
                     'aspect', 'aspects', 'language', 'gameplay', 'gear', 'ammunition',
                     'potion', 'experience', 'rpg', 'level', 'venture', 'activity',
                     'adventure', 'adventurer', 'character', 'system', 'job', 'pack']),
        ('motivation', ['reason', 'inspired', 'suggest', 'fun', 'support', 'exciting', 'opportunity',
                        'idea', 'excited', 'confident', 'feedback', 'ontrack']),
        ('rewards', ['goals', 'goal', 'pledge', 'access', 'reward', 'level', 'rewards', 'tier', 'appreciate',
                     'appreciation', 'offer', 'bonus', 'promotion', 'price', 'share', 'bundle']),
    )
    seed_topics = {}
    for tid, (_name, seeds) in enumerate(seed_topic_list):
        for word in seeds:
            normalized = word.strip().lower()
            if normalized in word2id:
                # A word listed in several groups ('level') ends up in the
                # last group that lists it, because later writes overwrite.
                seed_topics[word2id[normalized]] = tid
    return seed_topics


def get_wordnet_pos(treebank_tag):
    """
    Get the WordNet part of speech for a treebank tag

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb;
    every other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN


def load_stopwords(filepath):
    """
    Build the stop word set

    Starts from NLTK's English stop words and adds every line of the file at
    filepath (with the trailing newline removed) as one more stop word.
    :return: set of stop words
    """
    with open(filepath) as handle:
        extra_words = [line.rstrip('\n') for line in handle]
    stop_words = set(stopwords.words('english'))
    stop_words.update(extra_words)
    return stop_words


def filter_words(tokens, stop_words):
    """
    Keep the tokens that are not stop words and are longer than 3 characters
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        if len(token) <= 3:
            continue
        kept.append(token)
    return kept


def clean_text(text):
    """
    Strip noise from a campaign text

    The substitutions below run in the listed order; every match is replaced
    by a single space, so the result may contain runs of spaces.
    :param text: raw text
    :return: cleaned text
    """
    substitutions = (
        # e-mail addresses: any whitespace-delimited token containing '@'
        r"\S*@\S*",
        # urls
        # NOTE(review): "(:?http" looks like a typo for "(?:http" - confirm
        r"((:?http|https)://)?[-./?:@_=#\w]+\.(?:[a-zA-Z]){2,6}(?:[-\w.&/?:@_=#()])*",
        # punctuation
        r"[-!?=~|#$+%*&@:/(){}\[\],\"'._]",
        # digits
        r"\d+",
        # words composed of one repeated character, e.g. "aaa"
        r"\b(\w)\1+\b",
    )
    for pattern in substitutions:
        text = re.sub(pattern, " ", text)

    # Listed by the original author and not handled here
    # (NOTE(review): presumably open TODO items - confirm):
    #   Facebook account name
    #   Instagram account name
    #   Tumblr account name
    #   Word separated by &nbsp sticks together
    return text


def nltk_lemmatize(campaign_list):
    """
    Return the campaign list after being lemmatized with NLTK

    Each campaign is tokenized, filtered through filter_words, POS-tagged and
    lemmatized with the WordNet lemmatizer; the lemmas are joined back into
    one space-separated string per campaign.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = load_stopwords(STOP_WORDS_FILE_PATH)
    lemmatized = []
    for campaign in campaign_list:
        kept_tokens = filter_words(nltk.word_tokenize(campaign, language='english'), stop_words)
        # tagging happens after filtering, i.e. on the reduced token list
        lemmas = [
            lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            for word, tag in nltk.pos_tag(kept_tokens)
        ]
        lemmatized.append(" ".join(lemmas))
    return lemmatized


def spacy_lemmatize(campaign_list):
    """
    Return the campaign list after being lemmatized by spaCy

    Every campaign is run through the 'en' pipeline (parser and NER
    disabled); lemmas equal to '-PRON-' are dropped and the rest are joined
    into one space-separated string per campaign.
    """
    nlp = spacy.load('en', disable=['parser', 'ner'])
    lemmatized = []
    for campaign in campaign_list:
        lemmas = (token.lemma_ for token in nlp(campaign))
        lemmatized.append(" ".join(lemma for lemma in lemmas if lemma != '-PRON-'))
    return lemmatized


def save_file(data, topic_num, filename):
    """
    Write every item of data on its own line to '<topic_num>_<filename>.txt'

    An existing file of that name is overwritten.
    """
    target = '%s_%s.txt' % (topic_num, filename)
    # the with-block closes the file, no explicit close() is needed
    with open(target, 'w') as out:
        out.writelines(item + "\n" for item in data)
        
def pretty_print(data, title):
    """
    Print a header line containing title, then every item of data

    Each item is followed by an empty line.
    """
    # The title has to be interpolated; without it a literal "%s" is printed.
    print("------------- %s ---------------" % (title,))
    for item in data:
        print(item + "\n")

def load_data(path):
    """
    1. Load the data from file
    2. clean the text
    3. lemmatize the text
    4. extract the vocabulary from the documents
    5. convert the data to document - term matrix

    The lemmatized campaigns are also written to '_campaign.txt' through
    save_file.
    :param path: path of a JSON file holding a collection of records, each
                 with a 'ProjectCampaign' entry that is a sequence of strings
    :return: (X, word2id, vocab) where X is the dense document-term matrix,
             word2id maps a word to its column and vocab lists the words by
             column; None when the file holds no records
    """
    with open(path, 'r') as fp:
        data_list = list(json.load(fp))
    if not data_list:
        # Kept from the original contract: callers must check for None.
        return None

    cleaned = [clean_text(" ".join(item['ProjectCampaign']).lower()) for item in data_list]
    # Drop campaigns that became empty after cleaning and lemmatizing.
    campaign_list = [campaign for campaign in nltk_lemmatize(cleaned) if len(campaign) > 0]
    save_file(campaign_list, '', 'campaign')

    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1),
                                 lowercase=True, analyzer='word')
    X = vectorizer.fit_transform(campaign_list).toarray()
    word2id = vectorizer.vocabulary_
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.  list() keeps the return
    # type a plain list in both cases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()
    return X, word2id, vocab


# Run the analysis only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()



# Output recorded in the notebook when the cell was run:
# FileNotFoundError: [Errno 2] No such file or directory: '../projectInfo.json'