In [1]:
import json
import re
import jieba

## 1. Define functions for JSON file operations

In [2]:
def open_json_file(CACHE_FNAME):
    '''
    Load a JSON file and return the decoded value.

    input:
        CACHE_FNAME: path of the JSON file to read. The file is decoded as
                     'utf-8-sig', so a leading byte-order mark is tolerated.
    output:
        the decoded JSON value, or an empty dict when the file cannot be
        opened, decoded or parsed (best-effort fallback; a message is printed
        so the failure is not silent).
    '''
    try:
        # `with` guarantees the handle is closed even if reading/parsing fails.
        with open(CACHE_FNAME, 'r', encoding='utf-8-sig') as cache_file:
            # The text is already decoded by open(); json.loads() stopped
            # accepting an `encoding` keyword in Python 3.9, and passing it
            # raised a TypeError that the old bare `except` turned into {}.
            return json.load(cache_file)
    except (OSError, ValueError, TypeError) as err:
        # OSError: missing/unreadable file; ValueError: invalid JSON or bad
        # byte sequence; TypeError: a path argument open() cannot handle.
        print('failed to load JSON file: ', CACHE_FNAME, err)
        return {}
    
def dump_json_file(query_dict, file_name):
    '''
    Serialize a value to JSON and write it to a file.

    input:
        query_dict: any JSON-serializable value (dict, list, ...)
        file_name: path of the output file; it is overwritten if it exists
    output:
        None. A confirmation message is printed after the file is written.
    '''
    # Serialize before opening the file so a serialization error cannot
    # leave a truncated output file behind.
    dumped_json_cache = json.dumps(query_dict)

    # `with` closes the handle even if the write fails; the explicit encoding
    # makes the output independent of the platform's default code page.
    with open(file_name, "w", encoding="utf-8") as fw:
        fw.write(dumped_json_cache)

    print('successfully write down the file: ', file_name)

## 2. Define function for Data Cleaning

In [3]:
def isEnglish(s):
    '''
    Check whether a string consists only of ASCII characters.

    input: a string
    output: True if every character is ASCII (this includes the empty string,
            digits and ASCII punctuation), False otherwise
    '''
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        # Any non-ASCII character yields bytes >= 0x80, which cannot be
        # decoded as ASCII.
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False


def string_clean(jobDescription):
    '''
    Remove punctuation and digits from a job description.

    input: a string (original job description); None or '' is accepted
    output: a string (clean job description). Each '\\r\\n'-separated line is
            cleaned and stripped, empty lines are dropped, and the remaining
            lines are joined with a single space.
    '''
    if not jobDescription:
        return ''

    cleaned_lines = []

    for line in jobDescription.split('\r\n'):
        line = re.sub(r'[^\w\s]', ' ', line)  # punctuation -> space (keeps word chars, incl. '_')
        line = re.sub(r'\d+', '', line)  # drop all digits
        line = line.strip()  # trim surrounding whitespace
        if line:
            cleaned_lines.append(line)

    # Join with a space: concatenating directly would fuse the last word of
    # one line with the first word of the next (e.g. "Python" + "Java").
    return ' '.join(cleaned_lines)

## 3. Define function for 104 Job data integration (職缺資料整合成字串)

In [4]:
def concate_clean_data(data):
    '''
    Merge the text fields of one 104 job posting into a single string.

    input: a dictionary of each job data (from original JSON format).
           Required keys: 'jobDescription', 'other', 'jobName_clean',
           'jobCategory'. Optional keys (each a collection of strings):
           'major_clean', 'language_clean', 'skill_clean',
           'certificate_clean', 'specialty'.
    output: a string (clean job description followed by the job name, the
            job category and the optional column values, space-separated),
            or None when the cleaned description text is ASCII-only
            (all-English postings, and postings with no text, are excluded).
    '''
    # clean job description and the "other" free-text field
    job_words = string_clean(data['jobDescription'])
    job_words_other = string_clean(data['other'])

    # Join with a space so the end of the description is not fused with the
    # start of the "other" text; skip whichever part is empty.
    jobDescription_clean = ' '.join(
        part for part in (job_words, job_words_other) if part)

    # exclude all English job
    if isEnglish(jobDescription_clean):
        return None

    pieces = [
        jobDescription_clean,
        data['jobName_clean'],
        data['jobCategory'].replace('／', ' '),
    ]

    # Add major, language, skill, certificate and specialty data if present.
    # The previous nested helper only rebound its own local parameter, so
    # none of these values ever reached the returned string.
    optional_columns = ('major_clean', 'language_clean', 'skill_clean',
                        'certificate_clean', 'specialty')
    for col_name in optional_columns:
        values = data.get(col_name) or []
        if isinstance(values, str):
            # NOTE(review): columns are assumed to hold lists of strings;
            # a bare string is treated as one value rather than split into
            # characters - confirm against the upstream JSON schema.
            values = [values]
        pieces.extend(values)

    return ' '.join(pieces)


## 4. Use Jieba for tokenization 結巴分詞

In [5]:
def jieba_cut(data, stop_words):
    '''
    Tokenize a string with jieba and filter out unwanted tokens.

    input:
        data : string
        stop_words: collection (list or set) of stopwords
    output:
        lst_seg : list of words after jieba, in order, excluding empty
                  tokens, single non-ASCII characters, all-digit tokens and
                  stopwords
    '''
    # Membership tests against a list are linear; convert once per call so
    # each token lookup is O(1). A set/frozenset passed in is used as-is.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    lst_seg = []

    # jieba.cut yields tokens lazily (accurate mode, cut_all=False)
    for token in jieba.cut(data, cut_all=False):
        token = token.strip()
        if not token:  # skip empty tokens
            continue
        if len(token) == 1 and not isEnglish(token):  # skip single non-ASCII (e.g. Chinese) characters
            continue
        if token.isdigit():  # skip numbers
            continue
        if token in stop_words:  # skip stopwords
            continue
        lst_seg.append(token)

    return lst_seg

In [6]:
def job_data_after_jieba(job_data, stop_words):
    '''
    Build the cleaned, tokenized text for one job posting.

    input:
        job_data: a dictionary of each job data (from original JSON format)
        stop_words: list of stopwords
    output:
        a string of jieba tokens separated by single spaces, or '' when
        concate_clean_data returns None for the posting
    '''
    # merge the posting's text fields into one cleaned string
    merged_text = concate_clean_data(job_data)

    # nothing to tokenize for excluded postings
    if merged_text is None:
        return ''

    # tokenize with jieba and re-join the kept tokens with spaces
    return ' '.join(jieba_cut(merged_text, stop_words))


## 5. Run function 執行程式

In [7]:
# 1. read JSON data -------------------------------------

# Load the job postings. open_json_file falls back to an empty dict when the
# file cannot be read or parsed, so a count of 0 here indicates a failed load.
job_data = open_json_file('json_data/test_NLP_01.json')
print('所有資料數量: ', len(job_data))  # prints the total number of records

所有資料數量:  135384


In [8]:
# 2. Clean job description data with Jieba tokenization -------------------------------------

# Specify the custom dictionary file for jieba
jieba.load_userdict('jieba_data/Jobcontent_dict.txt')

# Specify the stop words file: one entry per '\n'-separated line, with
# surrounding whitespace removed from each entry
with open('jieba_data/Jobcontent_stopwords.txt', 'r', encoding="UTF-8") as file:
    stop_words = [entry.strip() for entry in file.read().split('\n')]


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BIGDAT~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.708 seconds.
Prefix dict has been built successfully.


In [9]:
# Store the tokenized results as a list of dictionaries

# Fields copied unchanged from each source record, in output order
COPIED_FIELDS = (
    'jobURL',
    'jobName',
    'jobName_clean',
    'jobCategory',
    'jobCat_main',
    'addressRegion_clean',
    'edu_clean',
    'appearDate',
    'salary_clean',
    'workExp_clean',
)

lst_jobs_clean = []

for i in job_data:
    # copy the pass-through fields, then add the cleaned + tokenized text
    dict_job_clean = {field: i[field] for field in COPIED_FIELDS}
    dict_job_clean['jobDescription_clean'] = job_data_after_jieba(i, stop_words)
    lst_jobs_clean.append(dict_job_clean)


## 6. 輸出JSON檔

In [10]:
# Write the cleaned and tokenized job records to a new JSON file.
dump_json_file(lst_jobs_clean, 'json_data/test_NLP_02.json')

successfully write down the file:  json_data/test_NLP_02.json
