# Visualize the Keywords using LDA Algorithm

In [1]:
import pandas as pd
import gensim
import jieba

from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE

from wordcloud import WordCloud

from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()


### BEGIN settings of pyLDAvis ### 
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
### END settings of pyLDAvis ###

# Prepare the Dataset

In [2]:
def read_json_dataset(filename):
    """
    Read the JSON dataset and return one requirement text per job posting.

    The 'requirement' and 'requirement_others' columns are joined with a
    single space. A missing value in either column is treated as an empty
    string, so every element of the result is a ``str``.

    Args:
        filename (str): path of the JSON file; it must provide the
            'requirement' and 'requirement_others' columns

    Returns:
        pandas.Series: concatenated requirement text, one entry per row

    Raises:
        KeyError: if one of the two columns is absent from the dataset
    """
    dataset = pd.read_json(filename)

    def _as_text(column_name):
        # Without this, a null on either side turns the concatenated value
        # into NaN, which is not a str and breaks text processing downstream.
        column = dataset[column_name].astype(object)
        return column.where(column.notna(), '').astype(str)

    return _as_text('requirement') + ' ' + _as_text('requirement_others')


def tokenizer(doc):
    """
    Segment a document into tokens with jieba.

    Args:
        doc (str): text to segment

    Returns:
        whatever ``jieba.cut`` returns for ``doc`` with ``cut_all=False``
    """
    segments = jieba.cut(doc, cut_all=False)
    return segments


def get_chinese_stopwords(path='./中文停用词表.txt'):
    """
    Get Chinese stopwords

    Args:
        path (str): location of the stopword file, one stopword per line.
            Defaults to the list shipped next to the notebook.

    Returns:
        stopwords_chinese (list(str)): one entry per line of the file, with
            surrounding whitespace removed
    """
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and may not be able to read Chinese text.
    with open(path, encoding='utf-8') as f:
        stopwords_chinese = [line.strip() for line in f]

    return stopwords_chinese

    
def preprocess(raw_docs):
    """
    Normalize, tokenize and clean raw documents.

    Each document is lower-cased and tokenized with ``tokenizer``. Every
    token then has punctuation and digits stripped, and is dropped when it
    ends up empty or is a stopword (custom symbols, the Chinese stopword
    list, or gensim's built-in English stopwords).

    Args:
        raw_docs (list(str)): raw documents

    Returns:
        docs (list(list(str))): list of tokens in a document
    """
    # define customized stopwords
    # NOTE(review): the empty-string entries may be non-printing glyphs that
    # were lost when the notebook was exported; verify against the original.
    stopwords_custom = ['•', '與', '★', '●', '（', '’', '－', '✦', '◆', '◼', '✪', 
                        '※', '⁺', '', '', '·', '‧', '・', '）', '○', '】', '【', '✓', '']

    # Merge all stopword sources into one set, built once, so the per-token
    # membership test is a single hash lookup instead of three list scans.
    stopwords = set(stopwords_custom)
    stopwords.update(get_chinese_stopwords())
    stopwords.update(gensim.parsing.preprocessing.STOPWORDS)

    strip_punctuation = gensim.parsing.preprocessing.strip_punctuation
    strip_numeric = gensim.parsing.preprocessing.strip_numeric

    docs = []

    for d in tqdm(raw_docs):
        # Normalize English words
        d = d.lower()

        tokens = []

        for t in tokenizer(d):
            # Strip English punctuations, remove numbers, then trim
            t = strip_numeric(strip_punctuation(t)).strip()

            # Compare by value, not identity: `t is ''` relied on string
            # interning and raises a SyntaxWarning on Python 3.8+.
            if t and t not in stopwords:
                tokens.append(t)

        docs.append(tokens)

    return docs


def get_dictionary(docs):
    """
    Build a gensim dictionary from tokenized documents.

    Args:
        docs (list(list(str))): list of tokens per document

    Returns:
        gensim.corpora.Dictionary: dictionary constructed from ``docs``
    """
    vocabulary = gensim.corpora.Dictionary(docs)
    return vocabulary


def get_corpus_bow(docs, dictionary):
    """
    Convert tokenized documents into a bag-of-words corpus.

    Args:
        docs (list(list(str))): tokens of every document
        dictionary (gensim.corpora.Dictionary): dictionary whose
            ``doc2bow`` is applied to each document

    Returns:
        bows (list): one ``dictionary.doc2bow(doc)`` result per document,
            in the same order as ``docs``
    """
    bows = []
    for tokens in docs:
        bows.append(dictionary.doc2bow(tokens))
    return bows


def get_corpus_tfidf(bows):
    """
    Fit a TF-IDF model on a BOW corpus and apply it to that same corpus.

    Args:
        bows (list(list(tuple(token_id, num_tokens)))): BOW corpus

    Returns:
        corpus_tfidf (gensim.interfaces.TransformedCorpus): ``bows`` viewed
            through the fitted model
        tfidf_model (gensim.models.TfidfModel): the model fitted on ``bows``
    """
    model = gensim.models.TfidfModel(bows)
    return model[bows], model


In [3]:
# Path of the crawled job postings (JSON) loaded by read_json_dataset
filename_dataset = '../crawler/employment_website_104/dataset/jobs_104.json'
# Vocabulary pruning thresholds handed to dictionary.filter_extremes:
# no_below is a minimum document count, no_above a maximum fraction of documents
no_below = 6
no_above = 0.1
# Training settings handed to gensim.models.LdaModel for both the BOW and TF-IDF models
passes = 20
iterations = 400

In [4]:
# Load the requirement text of every job posting
raw_docs = read_json_dataset(filename_dataset)

# Lower-case, tokenize, strip punctuation/digits and remove stopwords
docs = preprocess(raw_docs)

# Create dictionary
dictionary = get_dictionary(docs)
print('Size of dictionary before filtering out extreme words: {}'.format(len(dictionary)))

# Remove common/rare words
# NOTE: filter_extremes prunes `dictionary` in place, so re-running only this
# cell's later lines operates on the already-filtered vocabulary.
dictionary.filter_extremes(no_below=no_below, no_above=no_above)
print('Size of dictionary after filtering out extreme words: {}'.format(len(dictionary)))

# generate BOW corpus (built against the filtered dictionary)
corpus_bow = get_corpus_bow(docs, dictionary)

# generate TF-IDF corpus using BOW; the fitted model is kept alongside it
corpus_tfidf, tfidf_model = get_corpus_tfidf(corpus_bow)

  0%|          | 0/996 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.965 seconds.
Prefix dict has been built succesfully.
100%|██████████| 996/996 [00:07<00:00, 133.65it/s]


Size of dictionary before filtering out extreme words: 10764
Size of dictionary after filtering out extreme words: 2444


In [5]:
# Sanity check: the TF-IDF corpus is a gensim TransformedCorpus wrapper, not a plain list
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

# Choose the Number of Topics for the BOW Model using pyLDAvis

In [27]:
num_topics = 4

# train BOW model
# LDA training is stochastic: a fixed random_state makes a fresh
# Restart & Run All reproduce the same topics.
lda_model_bow = gensim.models.LdaModel(corpus_bow,
                                       num_topics=num_topics,
                                       id2word=dictionary,
                                       passes=passes,
                                       iterations=iterations,
                                       alpha='auto', eta='auto',
                                       random_state=42)


In [28]:
# Build the pyLDAvis view of the BOW-trained model; as the cell's last expression it is rendered inline
pyLDAvis.gensim.prepare(lda_model_bow, corpus_bow, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Choose the Number of Topics for the TF-IDF Model using pyLDAvis

In [23]:
num_topics = 4

# train TF-IDF model
# LDA training is stochastic: a fixed random_state makes a fresh
# Restart & Run All reproduce the same topics.
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
                                         num_topics=num_topics,
                                         id2word=dictionary,
                                         passes=passes,
                                         iterations=iterations,
                                         alpha='auto', eta='auto',
                                         random_state=42)

In [24]:
# Build the pyLDAvis view of the TF-IDF-trained model; as the cell's last expression it is rendered inline
pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
