HealthCareMagic 数据集的第一轮简单清洗：去除一些问候语，并筛除过短的问/答

In [1]:
import re
def remove_polite_clauses(text): #remove some polite words/irrelavant and the clauses that contain them
    polite_words = r'\b(hi|hello|thanks|thankyou|thank you|welcome|sir|dear)\b'
    punctuations = r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：“”‘’（）《》〈〉【】『』—……·～]'

    pattern = re.compile(rf'({polite_words})(.*?)({punctuations})', flags=re.IGNORECASE)
    
    cleaned = re.sub(pattern, ' ', text)
    return re.sub(r'\s+', ' ', cleaned).strip()


In [2]:
import pandas as pd
HCM_raw = pd.read_parquet('data\\train-00000-of-00001-5e7cb295b9cff0bf.parquet')

In [None]:
HCM_raw.head(10)

In [None]:
HCM_raw = HCM_raw[["input","output"]]
HCM_raw.head(10)

In [None]:
HCM = HCM_raw.map(lambda x: remove_polite_clauses(x))
HCM.head(10)

In [6]:
HCM_filtered = HCM[(HCM["input"].str.len()>150) & (HCM["output"].str.len()>150)]
print(HCM.shape)
print(HCM_filtered.shape)

(112165, 2)
(104426, 2)


HealthCareMagic 数据集的第二轮针对问答质量的筛选：检查在问题和回答中是否分别有至少一个医学相关名词

In [7]:
#词汇表来自 https://github.com/glutanimate/wordlist-medicalterms-en
with open('medical_vocab\wordlist.txt', 'r') as f:
    terms = f.readlines()
terms = [t.strip() for t in terms]
len(terms)

98119

In [8]:
terms_set = set(terms)
def has_medical_term(text):
    tokens = text.split()
    tokens = [token.strip(r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：“”‘’（）《》〈〉【】『』—……·～]')
              for token in tokens]
    return any(token in terms_set for token in tokens)
# head = HCM_filtered.head(10000)
# test = head[head['output'].apply(has_medical_term)]
# test = head[head['input'].apply(has_medical_term)]
# len(test)

In [9]:
HCM_cleaned = HCM_filtered[(HCM_filtered['output'].apply(has_medical_term)) & (HCM_filtered['input'].apply(has_medical_term))]
HCM_cleaned.shape

(104146, 2)

接下来把处理好的 HCM 数据最终调整成训练所需并且能够用 data loader 读取的格式，然后存储。

In [15]:
HCM_cleaned.columns

Index(['input', 'output'], dtype='object')

In [20]:
HCM_cleaned.rename(columns={"input":"question","output":"answer"}, inplace=True)
HCM_cleaned.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HCM_cleaned.rename(columns={"input":"question","output":"answer"}, inplace=True)


Unnamed: 0,question,answer
0,I woke up this morning feeling the whole room ...,The most likely cause for your symptoms is ben...
1,My baby has been pooing 5-6 times a day for a ...,.. It seems your kid is having viral diarrhea....
2,My husband is taking Oxycodone due to a broken...,"and I hope I can help you today.First, there i..."


In [21]:
HCM_cleaned.to_json('data/hcm_en.jsonl', orient='records', lines=True, force_ascii=False) #存储为 jsonl 格式，每行一个字典（orient=‘records’）

处理 meddialog-zh 中文数据

In [None]:
#中文版
def contains_medical_terms_zh(text, medical_terms):
    tokens = jieba.lcut(text)
    return any(token in medical_terms for token in tokens)

下面是在检查英文数据里的医学名词阶段，已经废除的方法：用 CHV vocab，包含一些医学名词的口语化表达和包含多个词的短语，然后再使用 spacy phrase matcher 检查回答内是否包含医学名词。
由于 CHV vocab 最终包含过多日常化和口语化的名词，而 spacy phrase matcher 又不够精确所以废除。

In [None]:
# import pandas as pd
# chv_vocab = pd.read_csv('CHV_vocab\CHV_concepts_terms_flatfile_20110204.tsv',sep='\t')
# chv_vocab.shape

In [None]:
# chv_vocab = chv_vocab[chv_vocab.columns[2]]
# chv_vocab.tail(10)

In [None]:
# chv_vocab_set = chv_vocab.to_list()
# chv_vocab_set = {v for v in chv_vocab_set if isinstance(v, str) and ',' not in v and '/' not in v}
# len(chv_vocab_set)

In [None]:
# import spacy
# from spacy.matcher import PhraseMatcher

# nlp = spacy.blank("en")
# matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# # chv_vocab_set = {v for v in chv_vocab_set if v not in nlp.Defaults.stop_words}
# # terms = list(chv_vocab_set)
# patterns = [nlp.make_doc(text) for text in terms]
# matcher.add("MED_TERM", patterns)


In [None]:
# def has_medical_term(text):
#     doc = nlp(text)
#     matches = matcher(doc)
#     #利用 spacy 的实体标签来过滤 chv 词汇库里的部分非医学专有名词
#     valid_matches = []
    
#     for match_id, start, end in matches:
#         span = doc[start:end]
        
#         if not any(ent.label_ in ["PERSON","NORP","FAC","ORG","GPE","LOC","PRODUCT","WORK_OF_ART","LAW","LANGUAGE","DATE","TIME","PERCENT","MONEY","QUANTITY","ORDINAL","CARDINAL"]
#                    for ent in doc.ents[start:end]):
#             valid_matches.append((match_id, start, end))
    
#     return len(valid_matches) > 0
#     # return len(matches) >0