# Inspect Data of Employment Website 104

In [58]:
import pandas as pd
import gensim
import nltk
nltk.download('wordnet')

import jieba
from sklearn.manifold import TSNE

from tqdm import tqdm
import time

from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Read Data

In [2]:
filename_dataset = './datasets/jobs_104.json'

dataset_df = pd.read_json(filename_dataset)

In [3]:
dataset_df.shape

(996, 8)

In [4]:
dataset_df.head()

Unnamed: 0,address,benefit,company,experience,requirement,requirement_others,salary,title
0,台北市內湖...,福利健全 ■ 優渥的薪資、獎金 ◎ 端午節、中秋節、年終獎金。 ◎ 定期依工作績效調...,芯鼎科技股份有限公司,不拘,- Driver development and fine tune of customiz...,"- A BS degree in Computer Engineering, Electri...",待遇面議 ...,系統應用工程師-台北(CD2320) ...
1,台北市大安...,＊獎金福利＊1. 年節禮金2. 生日禮金3. 年終分紅4. 人才介紹獎金＊休假福利＊1. ...,銓鴻資訊有限公司,1年以上,1. 研究市場數據和交易數據，並進行交易策略研發。 2. 開發模型和工具，監控分析交易程序...,將先以電子履歷篩選增加效率，請注意以下事項： ■ 履歷請詳述相關經驗並請提供 Github/...,"年薪 1,000,000~2,500,000元 ...",計量交易員 Quantitative Tr...
2,新北市新店...,《薪酬福利＠HTC》我們極力落實利潤與員工分享的精神，透過多元的薪酬組合與完善的福利方案，...,宏達電 HTC Corporation_宏達國際電子股份有限公司,不拘,Join the creative thinkers at HTC Healthcare. ...,Requirement:-MS or PhD in computer science or ...,待遇面議 ...,(RD S/W) DeepQ - Comp...
3,桃園市中壢...,正職/全職人員之福利與訓練制度： 【薪酬福利】 -每年依市場薪資水準評估薪資調整專案 -...,OK超商_來來超商股份有限公司,不拘,OK超商認為，用心經營的企業，首重人才培訓。 因此，我們投入了大量的訓練資源，積極培育新一...,1.具服務熱忱，科系、經驗不拘，須輪班。 2.須自備交通工具及電腦網路設備(透過e-Lear...,"月薪 31,000~33,000元 ...",儲備幹部(大桃園地區) ...
4,新竹縣竹北...,【工作福利】彈性上下班。 【休假福利】週休二日、國定假日、婚假、產假、陪產假、特休假、彈...,神亞科技股份有限公司,3年以上,影像相關之深度學習演算法研究與開發,1. 有開發image processing相關應用經驗 2. 熟悉深度學習相關算法（e.g...,待遇面議 ...,AI 工程師 ...


In [5]:
requirements = dataset_df['requirement'] + ' ' + dataset_df['requirement_others']
requirements[:2]

0    - Driver development and fine tune of customiz...
1    1. 研究市場數據和交易數據，並進行交易策略研發。  2. 開發模型和工具，監控分析交易程序...
dtype: object

In [6]:
requirements[0]

'- Driver development and fine tune of customized HW accelerator for OpenCV/OpenCL/OpenVX/CNN.- iCatchOS(RTOS) vision process framework development and maintain (SDK/BSP)- Integration and maintain of in-house design algorithms and performance fine tune.- Discuss/Design & co-work with customers for vision related requests. - A BS degree in Computer Engineering, Electrical Engineering, or Computer Science- At least 1-3 years of embedded software development experience (Senior – more than 5 years) - Familiar with C/C++ programming language, nice to have experience in Assembly language- Knowledge of Real-Time concepts and have RTOS experience (ThreadX , FreeRTOS, eCos, RTLinux, …)- Good understanding of CPU (e.g. ARM, MIPS ..)  architecture- Image processing & analysis algorithm, Machine Learning, Deep Learning framework development experience, e.g. CNN, OpenCL, OpenVX, OpenCV, OpenGL- Self-motivated, good attitude, skilled in meeting deadlines and work collaboratively in a team environmen

In [7]:
requirements[1]

'1. 研究市場數據和交易數據，並進行交易策略研發。  2. 開發模型和工具，監控分析交易程序運行情況研究，實施統計套利策略，並研發統計套利回測工具。  3. 熟悉python R 或者c++,有自主研發量化策略的能力。 4. 具備機率、統計、機器學習和金融市場研究經驗 。  求才若渴 如果你(妳)有下列特質：  是個懶人： 希望以最小的力氣達成目標，甚至提升品質，懂得善用工具增加生產力，重構提煉自有的程式庫。  有團隊精神： 懂得協同合作提升整體效益，將團隊目標置於個人之前。  充滿好奇心： 喜歡接觸、學習新事物，發現各種創新的可能。  熱愛程式： 充滿熱情想改變世界，將巧思及創造力注入每一行程式碼。   歡迎加入我們！ 一起邁向偉大航道 : ) 將先以電子履歷篩選增加效率，請注意以下事項： ■ 履歷請詳述相關經驗並請提供 Github/Sample Code/作品，加速審核 。 ■ 我們非常重視求職者的職務申請，履歷將由相關團隊主管嚴謹評估，因履歷眾多，意者我們將主動聯繫，請求職者悉知並耐心等候。'

# Preprocess
- Add custom Chinese phrases
    - Reference: https://github.com/fxsjy/jieba
- Tokenize Chinese/English
    - For English: Normalize, Lemmatize, Remove English Stopwords
    - For Chinese: Remove Chinese Stopwords,

In [45]:
def tokenizer(doc):
    return jieba.cut(doc, cut_all=False)

def set_chinese_custom_dictionary():
    filename_userdict = './dictionary/custom_dictionary.txt'
    jieba.load_userdict(filename_userdict)


def get_chinese_stopwords():
    """
    Get Chinese stopwords
    """
    
    filename = './dictionary/中文停用词表.txt'
    with open(filename) as f:
        lines = f.readlines()
        
        stopwords = [w.strip() for w in lines]
        
    return stopwords


def preprocess(raw_docs):
    """
    Normalize, tokenize, remove stopwords, use custom dictionary
    
    Args:
        raw_docs (list(str)):
        
    Returns:
        docs (list(list(str))): list of tokens in a document
    """
    
    docs = []
    
    # Declare lemmatizer
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    # define customized stopwords
    stopwords_custom = ['•', '與', '★', '●', '（', '’', '－', '✦', '◆', '◼', '✪', 
                        '※', '⁺', '', '', '·', '‧', '・', '）', '○', '】', '【', '✓', '']
    
    stopwords_chinese = get_chinese_stopwords()
    
    set_chinese_custom_dictionary()
    
    for d in tqdm(raw_docs):
        
        d = d.lower()
        
        tokens = []
        
        for t in tokenizer(d):
            # Strip English Punctuation
            t = gensim.parsing.preprocessing.strip_punctuation(t)
            
            # Remove numeric
            t = gensim.parsing.preprocessing.strip_numeric(t)
            
            t = t.strip()
            
            if t is '':
                continue
                
            if t not in stopwords_custom:
                if t not in stopwords_chinese:
                    if t not in gensim.parsing.preprocessing.STOPWORDS:
                        # Only Lemmatize the plura because most Chinese user don't use past tense in job description.
                        t = lemmatizer.lemmatize(t, pos='n')
                        tokens.append(t)
                        
        docs.append(tokens)
        
    return docs

In [46]:
docs = preprocess(requirements)

100%|██████████| 996/996 [00:07<00:00, 137.75it/s]


In [47]:
print(docs[0][:100])

['driver', 'development', 'fine', 'tune', 'customized', 'hw', 'accelerator', 'opencv', 'opencl', 'openvx', 'cnn', 'icatchos', 'rtos', 'vision', 'process', 'framework', 'development', 'maintain', 'sdk', 'bsp', 'integration', 'maintain', 'house', 'design', 'algorithm', 'performance', 'fine', 'tune', 'discus', 'design', 'work', 'customer', 'vision', 'related', 'request', 'b', 'degree', 'engineering', 'electrical', 'engineering', 'science', 'year', 'embedded', 'software', 'development', 'experience', 'senior', '–', 'year', 'familiar', 'c', 'c', 'programming', 'language', 'nice', 'experience', 'assembly', 'language', 'knowledge', 'real', 'time', 'concept', 'rtos', 'experience', 'threadx', 'freertos', 'ecos', 'rtlinux', '…', 'good', 'understanding', 'cpu', 'e', 'g', 'arm', 'mips', 'architecture', 'image', 'processing', 'analysis', 'algorithm', 'machine', 'learning', 'deep', 'learning', 'framework', 'development', 'experience', 'e', 'g', 'cnn', 'opencl', 'openvx', 'opencv', 'opengl', 'self', 

In [48]:
print(docs[1][:120])

['研究', '市場', '數據', '交易', '數據', '並進行', '交易', '策略', '研發', '開發', '模型', '工具', '監控', '分析', '交易', '程序', '運行', '情況', '研究', '實施', '統計', '套利', '策略', '並', '研發', '統計', '套利', '回測', '工具', '熟悉', 'python', 'r', 'c', '自主', '研發', '量化', '策略', '能力', '具備', '機率', '統計', '機器學習', '金融', '市場', '研究', '經驗', '求才若渴', '妳', '下列', '特質', '個', '懶人', '希望', '最小', '力氣', '達成', '目標', '提升', '品質', '懂得', '善用', '工具', '增加', '生產力', '重構', '提煉', '自有', '程式', '庫', '有團隊', '精神', '懂得', '協同', '合作', '提升', '整體', '效益', '將團隊', '目標', '置於', '個', '之前', '充滿', '好奇心', '喜歡', '接觸', '學習', '新事物', '發現', '各種', '創新', '可能', '熱愛', '程式', '充滿', '熱情', '想', '改變', '世界', '將', '巧思', '創造力', '注入', '一行', '程式', '碼', '歡迎', '加入', '我們', '一起', '邁向偉大', '航道', '將先以', '電子', '履歷', '篩選', '增加', '效率', '請', '注意']


# Build Word2Vec

In [55]:
size_feature = 150
window = 10
min_count = 2
# ingnore words with total frequency lower than this value.

model = gensim.models.Word2Vec(size=size_feature,
                              window=window,
                              min_count=min_count,
                              sg=1, # 1:Skip-Gram. 0:BOW
                              workers=2)

model.build_vocab(docs)

# Inspect the Results

In [56]:
print('Size of vocab: {}'.format(len(model.wv.vocab)))
print()
print('The first 100 words:\n{}'.format(list(model.wv.vocab.keys())[:100]))

Size of vocab: 6508

The first 100 words:
['driver', 'development', 'fine', 'tune', 'customized', 'hw', 'accelerator', 'opencv', 'opencl', 'openvx', 'cnn', 'rtos', 'vision', 'process', 'framework', 'maintain', 'sdk', 'bsp', 'integration', 'house', 'design', 'algorithm', 'performance', 'discus', 'work', 'customer', 'related', 'request', 'b', 'degree', 'engineering', 'electrical', 'science', 'year', 'embedded', 'software', 'experience', 'senior', '–', 'familiar', 'c', 'programming', 'language', 'nice', 'assembly', 'knowledge', 'real', 'time', 'concept', 'freertos', '…', 'good', 'understanding', 'cpu', 'e', 'g', 'arm', 'architecture', 'image', 'processing', 'analysis', 'machine', 'learning', 'deep', 'opengl', 'self', 'motivated', 'attitude', 'skilled', 'meeting', 'deadline', 'collaboratively', 'team', 'environment', 'ability', 'pressure', 'tight', 'frame', 'solve', 'technical', 'issue', 'strong', 'communication', 'skill', 'involve', 'interaction', '研究', '市場', '數據', '交易', '並進行', '策略', '研發'

# Train Program

In [62]:
%time results = model.train(sentences=docs, \
                     total_examples=len(docs), \
                     epochs=30, \
                     report_delay=1)

print(results)

CPU times: user 50.4 s, sys: 11.7 ms, total: 50.4 s
Wall time: 26 s
(3421946, 3729720)


# Save Model

In [64]:
filename_model = './models/website_104.word2vec'

model.save(filename_model)

# Restore Model

In [65]:
model = gensim.models.Word2Vec.load(filename_model)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Test Words

In [66]:
word = 'machine'
results = model.wv.most_similar(word)

words = []
for w, prob in results:
    print(w, end=', ')
    
results

learning, deep, scene, bioinformatics, mining, cmm, 或開, intellengent, minimal, vision, 

[('learning', 0.6342628002166748),
 ('deep', 0.4663415551185608),
 ('scene', 0.40456992387771606),
 ('bioinformatics', 0.39270511269569397),
 ('mining', 0.38950565457344055),
 ('cmm', 0.3879614472389221),
 ('或開', 0.3879144787788391),
 ('intellengent', 0.3867086172103882),
 ('minimal', 0.385672003030777),
 ('vision', 0.38516154885292053)]

In [67]:
word = 'learning'
results = model.wv.most_similar(word)

words = []
for w, prob in results:
    print(w, end=', ')
    
results

machine, deep, onnx, supervised, reinforcement, original, unsupervised, dnn, tx, —, 

[('machine', 0.6342628002166748),
 ('deep', 0.6074545383453369),
 ('onnx', 0.4455774128437042),
 ('supervised', 0.4227384924888611),
 ('reinforcement', 0.41758525371551514),
 ('original', 0.4151538610458374),
 ('unsupervised', 0.41345110535621643),
 ('dnn', 0.41133540868759155),
 ('tx', 0.4099203050136566),
 ('—', 0.409529447555542)]

In [68]:
word = 'cnn'
results = model.wv.most_similar(word)

words = []
for w, prob in results:
    print(w, end=', ')
    
results

rnn, lstm, rcnn, resnet, bnn, 實作練習, gan, twn, vggnet, yolo, 

[('rnn', 0.8737841248512268),
 ('lstm', 0.6624916195869446),
 ('rcnn', 0.6513897180557251),
 ('resnet', 0.6389967799186707),
 ('bnn', 0.6236380338668823),
 ('實作練習', 0.6181016564369202),
 ('gan', 0.6104133725166321),
 ('twn', 0.608122706413269),
 ('vggnet', 0.6025672554969788),
 ('yolo', 0.5942688584327698)]

In [77]:
word = 'svm'
results = model.wv.most_similar(word)

words = []
for w, prob in results:
    print(w, end=', ')
    
results

knn, random, forest, multivariable, calculus, boosting, 回歸, natworks, sklearn, 聚類, 

[('knn', 0.729316234588623),
 ('random', 0.7163858413696289),
 ('forest', 0.7103894948959351),
 ('multivariable', 0.6472737789154053),
 ('calculus', 0.6351724863052368),
 ('boosting', 0.6249902248382568),
 ('回歸', 0.6242859959602356),
 ('natworks', 0.6240625977516174),
 ('sklearn', 0.6063175797462463),
 ('聚類', 0.6061213612556458)]

In [80]:
word = '深度'
results = model.wv.most_similar(word)

words = []
for w, prob in results:
    print(w, end=', ')
    
results

recursive, 計學習, 如統, 推理, 貝葉斯, 神經, 常用, bnn, 熱誠與, 並應用, 

[('recursive', 0.7047215104103088),
 ('計學習', 0.7027356028556824),
 ('如統', 0.7012736201286316),
 ('推理', 0.6592531204223633),
 ('貝葉斯', 0.6202261447906494),
 ('神經', 0.6144611239433289),
 ('常用', 0.6061349511146545),
 ('bnn', 0.5997304320335388),
 ('熱誠與', 0.5984153151512146),
 ('並應用', 0.5829022526741028)]

# Display Similar Words using t-SNE

In [81]:
similar_words = []
key_words = ['machine', 'learning', 'cnn', 'svm', '深度']

for key_w in key_words:
    similar_words.append(key_w)
    
    for s_w, _ in model.wv.most_similar(key_w):
        similar_words.append(s_w)
        
print(similar_words)

['machine', 'learning', 'deep', 'scene', 'bioinformatics', 'mining', 'cmm', '或開', 'intellengent', 'minimal', 'vision', 'learning', 'machine', 'deep', 'onnx', 'supervised', 'reinforcement', 'original', 'unsupervised', 'dnn', 'tx', '—', 'cnn', 'rnn', 'lstm', 'rcnn', 'resnet', 'bnn', '實作練習', 'gan', 'twn', 'vggnet', 'yolo', 'svm', 'knn', 'random', 'forest', 'multivariable', 'calculus', 'boosting', '回歸', 'natworks', 'sklearn', '聚類', '深度', 'recursive', '計學習', '如統', '推理', '貝葉斯', '神經', '常用', 'bnn', '熱誠與', '並應用']


In [82]:
total_words = similar_words
features = model.wv.__getitem__(total_words)

print(total_words[:10])
print('Length = {}\nfeature size = {}'.format(features.shape[0], features.shape[1]))
print('feature max: {}, min: {}'.format(features.max(), features.min()))

['machine', 'learning', 'deep', 'scene', 'bioinformatics', 'mining', 'cmm', '或開', 'intellengent', 'minimal']
Length = 55
feature size = 150
feature max: 1.5080113410949707, min: -1.600242018699646


In [83]:
SEED = 0
tsne = TSNE(perplexity=20, n_components=2, random_state=SEED)
X_tsne = tsne.fit_transform(features)

In [84]:
x_tsne = X_tsne[:, 0]
y_tsne = X_tsne[:, 1]
label = total_words
contents = total_words


cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

# labels = ['Topic {}'.format(i) for i in topic_tfidf]
topic_colors = [cluster_colors[index//11] for index, _ in enumerate(total_words)]

settings = dict(x=x_tsne,
               y=y_tsne,
                label=label,
                color=topic_colors,
               content=contents
               )

source = ColumnDataSource(settings)

labels = LabelSet(x='x', y='y', text='label', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', text_font_size='6pt')


title = 'T-SNE visualization of Trump\'s twitts'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',
#                  legend='label', 
                 source=source, 
                 color='color',
                 alpha=0.8, size=10)#'msize', )


plot_lda.add_layout(labels)

hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}
plot_lda.legend.location = "top_left"

show(plot_lda)

You are attemptings to set `plot.legend.location` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with the 'legend' parameter set.

