# Visualize the Keywords using LDA Algorithm

In [5]:
import pandas as pd
import gensim
import gieba

from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE

from wordcloud import WordCloud

from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

ModuleNotFoundError: No module named 'gieba'

# Prepare the Dataset

In [3]:
def read_json_dataset(filename):
    """
    Read JSON dataset and return the job requirements
    
    Returns:
        job requirements
    """
    dataset = pd.read_json(filename)
    
    return dataset['requirement'] + ' ' + dataset['requirement_others']


def tokenizer(doc):
    return gieba.cut(doc, cut_all=False)


def get_chinese_stopwords():
    """
    Get Chinese stopwords
    """
    with open('./中文停用词表.txt') as f:
        lines = f.readlines()
        stopwords_chinese = [w.strip() for w in lines]
        
    return stopwords_chinese

    
def preprocess(raw_docs):
    """
    Normalize, tokenize, remove stopwords, use custom dictionary
    
    Args:
        raw_docs (list(str)):
        
    Returns:
        docs (list(list(str))): list of tokens in a document
    """
    
    docs = []
    
    # define customized stopwords
    stopwords_custom = ['•', '與', '★', '●', '（', '’', '－', '✦', '◆', '◼', '✪', 
                        '※', '⁺', '', '', '·', '‧', '・', '）', '○', '】', '【', '✓', '']
    
    stopwords_chinese = get_chinese_stopwords()
    
    for d in tqdm(raw_docs):
        # Normalize English words
        d = d.lower()
        
        tokens = []
        
        for t in tokenizer(d):  
            # Strip English punctuations
            t = gensim.parsing.preprocessing.strip_punctuation(t)
            
            # Remove numbers
            t = gensim.parsing.preprocessing.strip_numeric(t)
            
            t = t.strip()
            
            if t is '':
                continue
                
            if t not in stopwords_custom:
                if t not in stopwords_chinese:
                    if t not in gensim.parsing.preprocessing.STOPWORDS:
                        tokens.append(t)
                    
        docs.append(tokens)
        
    return docs


def get_dictionary(docs):
    return gensim.corpora.Dictionary(docs)


def get_corpus_bow(docs, dictionary):
    """
    Get corpus with format of BOW
    
    Args:
        docs (list(list(str))): list of list of string token
        dictionary (gensim.corpora.Dictionary): dictionary
        
    Returns:
        bows (list(list(tuple(token_id, num_tokens))))
    """
    bows = [dictionary.doc2bow(d) for d in docs]
    
    return bows


def get_corpus_tfidf(bows):
    """
    Get corpus based on TF-IDF and TF-IDF model
    
    Args:
        bows (list(list(tuple(token_id, num_tokens)))): BOW
        
    Returns:
        corpus_tfidf (gensim.interfaces.TransformedCorpus): corpus class
        tfidf_model: 
    """
    tfidf_model = gensim.models.TfidfModel(bows)
    corpus_tfidf = tfidf_model[bows]
    
    return corpus_tfidf, tfidf_model


In [4]:
filename_dataset = '../crawler/employment_website_104/dataset/jobs_104.json'
no_below = 6
no_above = 0.1

raw_docs = read_json_dataset(filename_dataset)

# Normalize, tokenize, remove stopwords, use custom dictionary
docs = preprocess(raw_docs)

# Create dictionary
dictionary = get_dictionary(docs)
print('Size of dictionary before filtering out extreme words: {}'.format(len(dictionary)))

# Remove common/rare words
dictionary.filter_extremes(no_below=no_below, no_above=no_above)
print('Size of dictionary after filtering out extreme words: {}'.format(len(dictionary)))

# generate BOW corpus
corpus_bow = get_corpus_bow(docs, dictionary)

# generate TF-IDF corpus using BOW
corpus_tfidf = get_corpus_tfidf(corpus_bow)

  0%|          | 0/996 [00:00<?, ?it/s]


NameError: name 'gieba' is not defined

# Main Training Program

In [None]:
# train BOW model
lda_model_bow = gensim.models.LdaModel(corpus_bow,
                                       num_topics=num_topics,
                                       id2word=dictionary,
                                       passes=passes)

# train TF-IDF model
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
                                        num_topics=num_topics,
                                        id2word=dictionary,
                                        passes=passes)

# Save Models

In [None]:
filename_prefix_bow = './model_bow.lda'
filename_preifx_tfidf = './model_tfidf.lda'

In [None]:

lda_model_bow.save(filename_prefix_bow)
lda_model_tfidf.save(filename_preifx_tfidf)

# Restore Models

In [None]:
lda_model_bow = gensim.models.LdaModel.load(filename_prefix_bow)
lda_model_tfidf = gensim.models.LdaModel.load(filename_preifx_tfidf)

# Display Key Factors of Each Topic

In [None]:
def get_keywords_df(model):
    init_values = [['']*num_topics]*10
    keywords_df = pd.DataFrame(init_values, columns=['topic{}'.format(i) for i in range(num_topics)])


    for i_topic, topic in model.show_topics(-1, formatted=False):
        for i_word, (word, weight) in enumerate(topic):
            keywords_df.iloc[i_word][i_topic] = word
            
    return keywords_df
    
    
def plot_word_cloud(keywords_df):
    num_topics = len(keywords_df.iloc[0])
    
    fig, axs = plt.subplots(num_topics)
    fig.set_figheight(15)
    fig.set_figwidth(15)

    for i_topic in range(num_topics):
        text = ' '.join(w for w in keywords_df.iloc[:, i_topic])
        print('Topic {}: {}'.format(i_topic, text))
        wordCloud = WordCloud(background_color='white').generate(text)

        axs[i_topic].imshow(wordCloud)
        axs[i_topic].axis('off')
        axs[i_topic].set_title('Topic {}'.format(i_topic), size=20)

In [None]:
keywords_bow_df = get_keywords_df(lda_model_bow)
plot_word_cloud(keywords_bow_df)

In [None]:
keywords_df = get_keywords_df(lda_model_tfidf)
plot_word_cloud(keywords_df)

# Classify All Documents

In [None]:
def argmax_prob(topic_probs):
    """
    Get the index with max probability
    
    Args:
        topic_probs (list of (topic_id, topic_probability)):
        
    Returns:
        index (int): index with maximum probability
    """
    max_value = -1
    index_max_prob = 0
    
    for index, value in topic_probs:
        if value > max_value:
            max_value = value
            index_max_prob = index
            
    return index_max_prob        
    
    
def get_topic_probs(corpus, model):
    """
    Get topics of corpus
    
    Args:
        corpus
        model
    """
    topic_probs = []
    
    for c in corpus:
        results = model[c]
        
        probs = []
        for index, p in results:
            probs.append(p)
            
        topic_probs.append(probs)
        
    return topic_probs

In [None]:
prob_bows = get_topic_probs(corpus_bows, lda_model_bow)
topic_bows = np.argmax(prob_bows, axis=1)

print(prob_bows[:5])
print(topic_bows[:5])

In [None]:
prob_tfidf = get_topic_probs(corpus_tfidf, lda_model_tfidf)
topic_tfidf = np.argmax(prob_tfidf, axis=1)

print(prob_tfidf[:5])
print(topic_bows[:5])

# Display Probabilistic Results Using t-SNE

In [None]:
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

labels = ['Topic {}'.format(i) for i in topic_tfidf]
topic_colors = [cluster_colors[i] for i in topic_tfidf]

settings = dict(x=X_tsne[:, 0],
               y = X_tsne[:, 1],
                label=labels,
                color=topic_colors,
               content=raw_docs[:])

source = ColumnDataSource(settings)

title = 'T-SNE visualization of Trump\'s twitts'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source, 
                 color='color', alpha=0.8, size=10)#'msize', )

hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}
plot_lda.legend.location = "top_left"

show(plot_lda)