## Import

In [2]:
import re
import jieba
import string 
from zhon.hanzi import punctuation as ch_punc
from string import punctuation as en_punc

In [3]:
def clean_string(str_processing, flag_en=False, flag_en_punc=False, flag_cn_punc=False, flag_num=False):
    """Remove selected classes of characters from a string.

    Each flag switches on one independent removal pass; when every flag is
    False the input is returned unchanged.

    Args:
        str_processing: Text to clean.
        flag_en: Remove ASCII letters (a-z, A-Z).
        flag_en_punc: Remove every character listed in ``en_punc``
            (``string.punctuation``).
        flag_cn_punc: Remove every character listed in ``ch_punc``
            (``zhon.hanzi.punctuation``).
        flag_num: Remove decimal digit characters.

    Returns:
        The cleaned string.
    """
    # remove English character
    if flag_en:
        str_processing = re.sub(r'[a-zA-Z]', '', str_processing)
    # remove English punctuation
    if flag_en_punc:
        # The punctuation string must be escaped before it is placed inside a
        # character class: it contains the sequence "\]" which would otherwise
        # be read as an escaped "]", leaving the backslash itself un-removed.
        str_processing = re.sub('[{}]'.format(re.escape(en_punc)), '', str_processing)
    # remove Chinese punctuation
    if flag_cn_punc:
        # Escaped for the same reason, so no character can act as class syntax.
        str_processing = re.sub('[{}]'.format(re.escape(ch_punc)), '', str_processing)
    # remove Numeric char
    if flag_num:
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        str_processing = re.sub(r'\d', '', str_processing)
    return str_processing

## Word segmentation by character

In [4]:
# Baseline tokenisation: every single character of the sentence is its own token.
string_one = '男性，36岁。咳痰1月余, 痰中有血丝,偶尔咯血100ml, X射线检查显示无异常, 考虑诊断为:(A)支气管扩张, (B)肺癌, (C)结核, (D)支气管肺炎, (E)肺水肿'
print(*string_one, sep="/")

男/性/，/3/6/岁/。/咳/痰/1/月/余/,/ /痰/中/有/血/丝/,/偶/尔/咯/血/1/0/0/m/l/,/ /X/射/线/检/查/显/示/无/异/常/,/ /考/虑/诊/断/为/:/(/A/)/支/气/管/扩/张/,/ /(/B/)/肺/癌/,/ /(/C/)/结/核/,/ /(/D/)/支/气/管/肺/炎/,/ /(/E/)/肺/水/肿


## Word segmentation with jieba

In [5]:
# Segment the sentence with jieba and show the tokens separated by "/".
string_one = '男性，36岁。咳痰1月余, 痰中有血丝,偶尔咯血100ml, X射线检查显示无异常, 考虑诊断为:(A)支气管扩张, (B)肺癌, (C)结核, (D)支气管肺炎, (E)肺水肿'
print("/".join(jieba.cut(string_one)))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\hp\AppData\Local\Temp\jieba.cache
Loading model cost 0.532 seconds.
Prefix dict has been built successfully.


男性/，/36/岁/。/咳痰/1/月余/,/ /痰/中/有/血丝/,/偶尔/咯血/100ml/,/ /X射线/检查/显示/无/异常/,/ /考虑/诊断/为/:/(/A/)/支气管/扩张/,/ /(/B/)/肺癌/,/ /(/C/)/结核/,/ /(/D/)/支气管/肺炎/,/ /(/E/)/肺水肿


## Word segmentation with jieba and a medical dictionary

### Add medical dictionary

In [6]:
# Register every entry of the THUOCL medical word list with jieba so that
# multi-character medical terms are kept together during segmentation.
with open('../data/THUOCL_medical.txt', 'r', encoding='utf-8') as f:
    # Each non-blank line is split into whitespace-separated fields; blank
    # lines are skipped so they cannot break the two-field unpacking below.
    list_word_freq = [line.split() for line in f if line.strip()]
# Every remaining line must hold exactly "<word> <frequency>".
list_word_freq = [[word, int(freq)] for word, freq in list_word_freq]
for word, freq in list_word_freq:
    # The number from the file is a frequency, so it goes into `freq`;
    # `tag` is jieba's part-of-speech slot and must not receive a count.
    jieba.add_word(word, freq=freq)
    # Let jieba raise the frequency further if that is needed for the word
    # to be emitted as a single token.
    jieba.suggest_freq(word, tune=True)

### Test

In [7]:
# Segment the full question (stem plus answer options) and print the tokens.
string_one = '男性，36岁。咳痰1月余, 痰中有血丝,偶尔咯血100ml, X射线检查显示无异常, 考虑诊断为:(A)支气管扩张, (B)肺癌, (C)结核, (D)支气管肺炎, (E)肺水肿'
print(*jieba.cut(string_one), sep="/")

男性/，/36/岁/。/咳痰/1/月余/,/ /痰/中/有/血丝/,/偶尔/咯血/100ml/,/ /X射线/检查/显示/无/异常/,/ /考虑/诊断/为/:/(/A/)/支气管扩张/,/ /(/B/)/肺癌/,/ /(/C/)/结核/,/ /(/D/)/支气管肺炎/,/ /(/E/)/肺水肿


In [8]:
# Segment only the question stem, written with full-width punctuation.
string_one = '男性，36岁。咳痰1月余，痰中有血丝，偶尔咯血100ml，X射线检查无异常，考虑诊断为：'
print(*jieba.cut(string_one), sep="/")

男性/，/36/岁/。/咳痰/1/月余/，/痰/中/有/血丝/，/偶尔/咯血/100ml/，/X射线/检查/无/异常/，/考虑/诊断/为/：


In [9]:
# Segment only the answer options.
string_one = '(A)支气管扩张, (B)肺癌, (C)结核, (D)支气管肺炎, (E)肺水肿'
print(*jieba.cut(string_one), sep="/")

(/A/)/支气管扩张/,/ /(/B/)/肺癌/,/ /(/C/)/结核/,/ /(/D/)/支气管肺炎/,/ /(/E/)/肺水肿


In [10]:
# Strip letters, punctuation and digits first, then segment the remainder
# and keep only tokens that are not empty or whitespace-only.
string_one = '男性，36岁。咳痰1月余, 痰中有血丝,偶尔咯血100ml, X线检查无异常, 考虑诊断为:(A)支气管扩张, (B)肺癌, (C)结核, (D)支气管肺炎, (E)肺水肿'
string_one = clean_string(
    string_one,
    flag_en=True,
    flag_en_punc=True,
    flag_cn_punc=True,
    flag_num=True,
)
print([token for token in jieba.cut(string_one) if token.strip()])

['男性', '岁', '咳痰', '月余', '痰', '中', '有', '血丝', '偶尔', '咯血', '线', '检查', '无', '异常', '考虑', '诊断', '为', '支气管扩张', '肺癌', '结核', '支气管肺炎', '肺水肿']


## End

In [11]:
# Marker showing that the notebook ran through to the last cell.
print("Done.")

Done.
