## Import

In [1]:
import re
import json
import jieba
import numpy as np
from tqdm import tqdm
from thefuzz import fuzz
from rank_bm25 import BM25Okapi

In [3]:
with open('../../data/THUOCL_medical.txt', 'r', encoding='utf-8') as f:
    list_word_freq = f.readlines()
list_word_freq = [ i.strip().split() for i in list_word_freq]
list_word_freq = [ [word, int(freq)] for word, freq in list_word_freq]
for (word, freq) in list_word_freq:
    jieba.add_word(word.strip(),tag=freq)
    jieba.suggest_freq(word, tune=True)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\hp\AppData\Local\Temp\jieba.cache


Loading model cost 0.684 seconds.
Prefix dict has been built successfully.


## Function

In [4]:
def get_similar_idx(option, n_top, model):
    tokenized_option = list(jieba.cut(option))
    doc_scores = model.get_scores(tokenized_option)
    list_sample_idx = np.argsort(doc_scores, axis=0)[-n_top:][::-1]
    return list_sample_idx

In [5]:
def get_similar_sample(option, n_top, model, list_sample):
    tokenized_option = list(jieba.cut(option))
    doc_scores = model.get_scores(tokenized_option)
    list_sample_idx = np.argsort(doc_scores, axis=0)[-n_top:][::-1]
    list_sample_similar = [list_sample[idx] for idx in list_sample_idx]
    return list_sample_similar

## Loading 

### Loading Data

In [6]:
path_file_data = '../../data/MedQA/Mainland/test.jsonl'

In [7]:
list_dict_test = []
with open(path_file_data, 'r', encoding="utf-8") as f:
    for idx, line in enumerate(f):
        data = json.loads(line)
        data['ID'] = idx
        data['A'], data['B'], data['C'], data['D'], data['E'] = data['options']['A'], data['options']['B'], data['options']['C'], data['options']['D'], data['options']['E']
        del data['options']
        list_dict_test.append(data)

In [8]:
list_dict_test[0]

{'question': '经调查证实出现医院感染流行时，医院应报告当地卫生行政部门的时间是（\u3000\u3000）。',
 'answer': '24小时内',
 'meta_info': '卫生法规',
 'answer_idx': 'E',
 'ID': 0,
 'A': '2小时',
 'B': '4小时内',
 'C': '8小时内',
 'D': '12小时内',
 'E': '24小时内'}

### Loading Question Bank

In [9]:
list_path_file_qb = ['../../data/MedQA/Mainland/train.jsonl', '../../data/MedQA/Mainland/dev.jsonl']
# '../../data/MedQA/Mainland/chinese_qbank.jsonl', 

In [10]:
list_dict_sample_all = []
for path_file_qb in list_path_file_qb:
    with open(path_file_qb, 'r', encoding="utf-8") as f:
        for idx, line in enumerate(f):
            data = json.loads(line)
            data['ID'] = idx
            data['A'], data['B'], data['C'], data['D'], data['E'] = data['options']['A'], data['options']['B'], data['options']['C'], data['options']['D'], data['options']['E']
            del data['options']
            list_dict_sample_all.append(data)
print(f"Loading {len(list_dict_sample_all)} samples")

Loading 30825 samples


In [11]:
data

{'question': '男，25岁，背部刀伤，伤口流血2h，体查：神志尚清楚，诉口渴，皮肤苍白，稍冷，脉搏110/min，血压12/9.33kPA（90/70mmHg），脉压小，表浅静脉塌陷，尿少。1．此病人休克达何种程度？（\u3000\u3000）',
 'answer': '中度',
 'meta_info': '第二部分\u3000模拟试题',
 'answer_idx': 'A',
 'ID': 3424,
 'A': '中度',
 'B': '轻度',
 'C': '重度',
 'D': '晚期',
 'E': '代偿期'}

In [12]:
# write a function to remove the duplicate dict of list_sample_all and keep the order
def remove_duplicate_dict(list_sample_all):
    list_duplicate = []
    list_sample_all_new = [list_sample_all[0]]
    for dict_sample in tqdm(list_sample_all[1:]):
        flag_in = False
        str_option_sample =  dict_sample["A"] + ' ' + dict_sample["B"] + ' ' + dict_sample["C"] + ' ' + dict_sample["D"] + ' ' + dict_sample["E"]
        for dict_in in list_sample_all_new:
            str_option_in =  dict_in["A"] + ' ' + dict_in["B"] + ' ' + dict_in["C"] + ' ' + dict_in["D"] + ' ' + dict_in["E"]
            if fuzz.ratio(dict_in['question'], dict_sample['question'])>50 and fuzz.ratio(str_option_sample, str_option_in)>65:
                flag_in=True
                list_duplicate.append([dict_in['question'], str_option_in, dict_sample['question'], str_option_sample])
                break
        if not flag_in:
            list_sample_all_new.append(dict_sample)
    return list_sample_all_new, list_duplicate

In [13]:
print(f"The number of sample before removing duplicate: {len(list_dict_sample_all)}")
list_dict_sample_all, list_duplicate = remove_duplicate_dict(list_dict_sample_all)
print(f"The number of sample after removing duplicate: {len(list_dict_sample_all)}")

The number of sample before removing duplicate: 30825


100%|██████████| 30824/30824 [10:23<00:00, 49.41it/s]

The number of sample after removing duplicate: 30825





In [24]:
len(list_dict_sample_all)

21382

In [33]:
list_dict_sample = []
for idx, dict_sample_all in enumerate(tqdm(list_dict_sample_all)):
    question, answer = dict_sample_all['question'], dict_sample_all['answer']
    a, b, c, d, e = dict_sample_all['A'], dict_sample_all['B'], dict_sample_all['C'], dict_sample_all['D'], dict_sample_all['E']
    question = re.sub(r'\s*（\s*）。*$', '', question)
    question = re.sub(r'\s*\(\s*\)。*$', '', question)
    dict_sample={'ID':dict_sample_all['ID'], 'question': question, 'A': a, 'B': b, 'C': c, 'D': d, 'E': e, 'answer': answer}
    input = f"问题：{question}: (A){a}, (B){b}, (C){c}, (D){d}, (E){e}\n答案：{answer}\n"
    dict_sample['Input'] = input
    list_dict_sample.append(dict_sample)

100%|██████████| 21382/21382 [00:00<00:00, 145715.44it/s]


## Question Bank

### bm25_question

In [35]:
corpus_question = [ dict_one['question'] + ' ' + dict_one['A'] + ' ' + dict_one['B'] + ' ' + dict_one['C'] + ' ' + dict_one['D'] + ' ' + dict_one['E'] for dict_one in list_dict_sample ]
tokenized_corpus_question = [ list(jieba.cut(doc)) for doc in corpus_question]
bm25_question = BM25Okapi(tokenized_corpus_question)
print(f'We construct a Question Bank with {len(corpus_question)} questions')

We construct a Question Bank with 21382 questions


In [30]:
dict_test = list_dict_test[266]
query = ( dict_test["question"] + ' ' + dict_test["A"] + ' ' + dict_test["B"] + ' ' + dict_test["C"] + ' ' + dict_test["D"] + ' ' + dict_test["E"]
)
print(query)
tokenized_query = list(jieba.cut(query))
print(tokenized_query)

1．Ⅰ期淋巴瘤（　　）。 病变仅限于一个解剖部位 病变累及右侧颈、腋下和腹股沟淋巴结 病变累及左腋下淋巴结及肝脏 病变累及右锁骨上淋巴结和左腋下，并伴有高热 病变累及左颈及纵隔淋巴结
['1', '．', 'Ⅰ', '期', '淋巴瘤', '（', '\u3000', '\u3000', '）', '。', ' ', '病变', '仅限于', '一个', '解剖', '部位', ' ', '病变', '累及', '右侧', '颈', '、', '腋下', '和', '腹股沟淋巴结', ' ', '病变', '累及', '左', '腋下', '淋巴结', '及', '肝脏', ' ', '病变', '累及', '右', '锁骨上淋巴结', '和', '左', '腋下', '，', '并', '伴有', '高热', ' ', '病变', '累及', '左颈及', '纵隔', '淋巴结']


In [36]:
doc_scores = bm25_question.get_scores(tokenized_query)
for idx in np.argsort(doc_scores, axis=0)[-3:][::-1]:
    print(idx, "".join(tokenized_corpus_question[idx]))

296 3．Ⅲ期淋巴瘤 病变仅限于一个解剖部位 病变累及右侧颈、腋下和腹股沟淋巴结 病变累及左腋下淋巴结及肝 病变累及右锁骨上淋巴结和左腋下，并伴有高热 病变累及左颈及纵隔淋巴结
9714 属于子宫内膜癌ⅡA期的是 病变侵犯肌层＞1/2 累及宫颈黏膜腺体 癌累及阴道上1/3段 病变侵犯浆膜和（或）附件 侵犯宫颈间质
16680 女，22岁，左颈下及腋下出现无痛性肿块3月余，体检发现左侧颈部、锁骨上和腋窝等处，有肿大的孤立的无痛性肿大淋巴结，以下哪项发现最有助于提示此患者是霍奇金病，而非非霍奇金淋巴瘤？ 发病早期全身剧烈瘙痒 病变累及口咽和鼻咽部 硬膜外肿瘤压迫脊髓 伴发自身免疫性溶血性贫血 弥漫性组织细胞型淋巴瘤痛


In [37]:
list_sample_idx = get_similar_idx(query, n_top=3, model=bm25_question)
list_sample_idx

array([  296,  9714, 16680], dtype=int64)

In [38]:
list_similar_sample = get_similar_sample(query, n_top=3, model=bm25_question, list_sample=list_dict_sample)
list_similar_sample

[{'ID': 297,
  'question': '3．Ⅲ期淋巴瘤',
  'A': '病变仅限于一个解剖部位',
  'B': '病变累及右侧颈、腋下和腹股沟淋巴结',
  'C': '病变累及左腋下淋巴结及肝',
  'D': '病变累及右锁骨上淋巴结和左腋下，并伴有高热',
  'E': '病变累及左颈及纵隔淋巴结',
  'answer': '病变累及右侧颈、腋下和腹股沟淋巴结',
  'Input': '问题：3．Ⅲ期淋巴瘤: (A)病变仅限于一个解剖部位, (B)病变累及右侧颈、腋下和腹股沟淋巴结, (C)病变累及左腋下淋巴结及肝, (D)病变累及右锁骨上淋巴结和左腋下，并伴有高热, (E)病变累及左颈及纵隔淋巴结\n答案：病变累及右侧颈、腋下和腹股沟淋巴结\n'},
 {'ID': 11388,
  'question': '属于子宫内膜癌ⅡA期的是',
  'A': '病变侵犯肌层＞1/2',
  'B': '累及宫颈黏膜腺体',
  'C': '癌累及阴道上1/3段',
  'D': '病变侵犯浆膜和（或）附件',
  'E': '侵犯宫颈间质',
  'answer': '累及宫颈黏膜腺体',
  'Input': '问题：属于子宫内膜癌ⅡA期的是: (A)病变侵犯肌层＞1/2, (B)累及宫颈黏膜腺体, (C)癌累及阴道上1/3段, (D)病变侵犯浆膜和（或）附件, (E)侵犯宫颈间质\n答案：累及宫颈黏膜腺体\n'},
 {'ID': 22110,
  'question': '女，22岁，左颈下及腋下出现无痛性肿块3月余，体检发现左侧颈部、锁骨上和腋窝等处，有肿大的孤立的无痛性肿大淋巴结，以下哪项发现最有助于提示此患者是霍奇金病，而非非霍奇金淋巴瘤？',
  'A': '发病早期全身剧烈瘙痒',
  'B': '病变累及口咽和鼻咽部',
  'C': '硬膜外肿瘤压迫脊髓',
  'D': '伴发自身免疫性溶血性贫血',
  'E': '弥漫性组织细胞型淋巴瘤痛',
  'answer': '发病早期全身剧烈瘙痒',
  'Input': '问题：女，22岁，左颈下及腋下出现无痛性肿块3月余，体检发现左侧颈部、锁骨上和腋窝等处，有肿大的孤立的无痛性肿大淋巴结，以下哪项发现最有助于提示此患者是霍奇金病，而非非霍奇金淋巴瘤？: (A)发病早

## Search

In [39]:
num_similar = 10

In [42]:
for idx_testing, dict_test in enumerate(tqdm(list_dict_test[:50])):
    question_and_options = dict_test["question"] + ' ' + dict_test["A"] + ' ' + dict_test["B"] + ' ' + dict_test["C"] + ' ' + dict_test["D"] + ' ' + dict_test["E"]
    list_similar_sample = get_similar_sample(question_and_options, n_top=num_similar, model=bm25_question, list_sample=list_dict_sample)
    dict_test['list_similar_sample'] = list_similar_sample

100%|██████████| 50/50 [00:12<00:00,  4.11it/s]


## Save

In [None]:
with open(f'data_fewshot_enhancement.json', 'w', encoding="utf-8") as f:
    f.write(json.dumps(list_dict_test, indent=2, ensure_ascii=False))

## Read

In [24]:
num_similar = 10
num_sample = 10

In [25]:
with open(f'data_fewshot_enhancement.json', 'r') as f:
    list_dict_test_sample = json.load(f)

## End

In [None]:
print('Done.')