HealthCareMagic 数据集的第一轮简单清洗：去除一些问候语，并筛除过短的问/答

In [1]:
import re
def remove_polite_clauses(text): #remove some polite words/irrelavant and the clauses that contain them
    polite_words = r'\b(hi|hello|thanks|thankyou|thank you|welcome|sir|dear)\b'
    punctuations = r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：“”‘’（）《》〈〉【】『』—……·～]'

    pattern = re.compile(rf'({polite_words})(.*?)({punctuations})', flags=re.IGNORECASE)
    
    cleaned = re.sub(pattern, ' ', text)
    return re.sub(r'\s+', ' ', cleaned).strip()


In [2]:
import pandas as pd
HCM_raw = pd.read_parquet('data\\train-00000-of-00001-5e7cb295b9cff0bf.parquet')

In [None]:
HCM_raw.head(10)

In [None]:
HCM_raw = HCM_raw[["input","output"]]
HCM_raw.head(10)

In [None]:
HCM = HCM_raw.map(lambda x: remove_polite_clauses(x))
HCM.head(10)

In [6]:
HCM_filtered = HCM[(HCM["input"].str.len()>150) & (HCM["output"].str.len()>150)]
print(HCM.shape)
print(HCM_filtered.shape)

(112165, 2)
(104426, 2)


HealthCareMagic 数据集的第二轮针对问答质量的筛选：检查在问题和回答中是否分别有至少一个医学相关名词

In [7]:
#词汇表来自 https://github.com/glutanimate/wordlist-medicalterms-en
with open('medical_vocab\wordlist.txt', 'r') as f:
    terms = f.readlines()
terms = [t.strip() for t in terms]
len(terms)

98119

In [8]:
terms_set = set(terms)
def has_medical_term(text):
    tokens = text.split()
    tokens = [token.strip(r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~，。！？、；：“”‘’（）《》〈〉【】『』—……·～]')
              for token in tokens]
    return any(token in terms_set for token in tokens)
# head = HCM_filtered.head(10000)
# test = head[head['output'].apply(has_medical_term)]
# test = head[head['input'].apply(has_medical_term)]
# len(test)

In [9]:
HCM_cleaned = HCM_filtered[(HCM_filtered['output'].apply(has_medical_term)) & (HCM_filtered['input'].apply(has_medical_term))]
HCM_cleaned.shape

(104146, 2)

接下来把处理好的 HCM 数据最终调整成训练所需并且能够用 data loader 读取的格式，然后存储。

In [15]:
HCM_cleaned.columns

Index(['input', 'output'], dtype='object')

In [20]:
HCM_cleaned.rename(columns={"input":"question","output":"answer"}, inplace=True)
HCM_cleaned.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HCM_cleaned.rename(columns={"input":"question","output":"answer"}, inplace=True)


Unnamed: 0,question,answer
0,I woke up this morning feeling the whole room ...,The most likely cause for your symptoms is ben...
1,My baby has been pooing 5-6 times a day for a ...,.. It seems your kid is having viral diarrhea....
2,My husband is taking Oxycodone due to a broken...,"and I hope I can help you today.First, there i..."


In [33]:
#为了之后的 multi-turn 数据，统一格式为 {'dialogue_id':hcmxxxxxxx,'turns':[{'role':..., 'utterance':...},{...}]}
HCM_cleaned['dialogue_id'] = 'hcm' + HCM_cleaned.index.astype(str).str.zfill(7)

In [None]:
HCM_cleaned['turns'] = HCM_cleaned.apply(lambda x:[{'role':'patient', 'utterance': x['question']},{'role':'doctor', 'utterance': x['answer']}], axis=1)

In [34]:
HCM_cleaned = HCM_cleaned[['dialogue_id', 'turns']]
HCM_cleaned.tail(3)

Unnamed: 0,dialogue_id,turns
112162,hcm0112162,"[{'role': 'patient', 'utterance': 'My toes on ..."
112163,hcm0112163,"[{'role': 'patient', 'utterance': 'I was diagn..."
112164,hcm0112164,"[{'role': 'patient', 'utterance': 'Within the ..."


In [32]:
HCM_cleaned.to_json('data/hcm_en.jsonl', orient='records', lines=True, force_ascii=False) #存储为 jsonl 格式，每行一个字典（orient=‘records’）

处理 meddialog-zh 中文数据

读取数据，区分单回合和多回合数据，把单回合数据放入 dataframe （比较好处理）。
多回合数据暂时待处理

In [None]:
import json
# 读取
with open('data\meddia_raw\\test_data.json', 'r') as f:
    meddia_raw = json.load(f)
with open('data\meddia_raw\\train_data.json', 'r') as f:
    meddia_raw.append(json.load(f))
with open('data\meddia_raw\\validate_data.json', 'r') as f:
    meddia_raw.append(json.load(f))

In [249]:
#分开 single- 和 multi-turn
meddia_single = [d for d in meddia_raw if len(d)<=2]
meddia_multi = [d for d in meddia_raw if len(d)>2]
print(len(meddia_raw))
print(len(meddia_single))
print(len(meddia_multi))

340757
221615
119142


In [250]:
#把 single-turn 的做成两列的 dataframe
temp_q, temp_a = [d[0] for d in meddia_single], [d[1] for d in meddia_single]
meddia_single = pd.DataFrame({'question':temp_q,'answer':temp_a})
meddia_single.shape

(221615, 2)

修改内容：
1. 删除包含以下特征的短句（并非完整句，而是用标点符号区分）：礼貌用语、涉及预约挂号的词汇、多于 7 个数字的短句（删除一部分电话号、身份证号等信息）
2. 将指代具体医疗机构的名词都替换成 “医院”

In [246]:
#中文分句函数原版来自 https://www.cnblogs.com/ting1/p/16833884.html
#修改为分短句而不是长句
import re
def cut_sent(para):
    punctuations = r'[!"#$%&\'()*+,\-./::;<=>?@[\\\]^_`{|}~，。！？、；：“”‘’（）《》〈〉【】『』—……·～]'
    para = re.sub(rf'({punctuations})([^”’])', r"\1\n\2", para)
    
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # 英文省略号
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # 中文省略号
    para = re.sub('([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)
    # 如果双引号前有终止符，那么双引号才是句子的终点，把分句符\n放到双引号后，注意前面的几句都小心保留了双引号
    para = para.rstrip()  # 段尾如果有多余的\n就去掉它
    # 很多规则中会考虑分号;，但是这里我把它忽略不计，破折号、英文双引号等同样忽略，需要的再做些简单调整即可。
    return para.split("\n")

In [265]:
#删除口语词、常见的不相关词，部分涉及具体预约和挂号的词以及包含他们的短句。
def remove_polite_clauses_zh(text):
    polite_words = re.compile(r'(感谢|预约|挂号|本科室|我科室|办公室|地址|你好|您好|谢谢|对不起|原谅|上传|不客气|抱歉|不好意思|再见|订单|不谢|大夫好|明白了|没问题|提供|早日康复|不用担心|放心吧|电话|嗨|哈喽|哈哈|呵呵|这样啊)')

    clauses = cut_sent(text)
    # print(clauses)
    clauses = [c for c in clauses if not polite_words.search(c)]  #检查短句内是否有至少一个 polite word，如没有就保留，有就删除
    # print(clauses)
    return ''.join(clauses)

In [None]:
# 简单测试
# print(remove_polite_clauses_zh("!这，电饭锅！！不用担心哦哦！啊啊谢谢啊?asd''哦呵大夫呵aaa"))

['!', '这，', '电饭锅！', '！不用担心哦哦！', '啊啊谢谢啊?', "asd'", "'哦呵大夫呵aaa"]
['!', '这，', '电饭锅！', "asd'", "'哦呵大夫呵aaa"]
!这，电饭锅！asd''哦呵大夫呵aaa


In [None]:
#处理 single-turn
meddia_single['question'] = meddia_single['question'].apply(remove_polite_clauses_zh)
meddia_single['answer'] = meddia_single['answer'].apply(remove_polite_clauses_zh)
meddia_single.head(10)

In [None]:
#处理 multi-turn

In [None]:
#把 “我院” 、“本院” 之类的具体指代替换成 “医院”：对模型来说一般不存在“我院”
#可以保留建议就诊的回答（如果它通过了其他的质量清洗），因为模型也需要在合适的时候给出建议就诊的回答。
specific_hospitals = r'我院|本院|贵院|该院|咱院|本中心|我中心|贵中心|咱中心|本机构|我机构|贵机构|该机构|咱机构|本诊所|贵诊所|我诊所|该诊所|咱诊所'
#single-turn
meddia_single['question'] = meddia_single['question'].str.replace(specific_hospitals, '医院', regex=True)
meddia_single['answer'] = meddia_single['answer'].str.replace(specific_hospitals, '医院', regex=True)

In [None]:
#multi-turn

In [258]:
#删除包含多于 7 个 digit 或者不包含中文或英文字符的短句，因为很有可能是电话号
def has_more_than_7_digits(text):
    digits = re.findall(r'\d', text)  # 匹配所有数字字符
    return len(digits) > 7

def has_chn_eng_chars(text):
    return bool(re.search(r'[\u4e00-\u9fffA-Za-z]', text))

def remove_phone_numbers(text):
    clauses = cut_sent(text)
    clauses = [c for c in clauses if (not has_more_than_7_digits(c)) and has_chn_eng_chars(c)] 

    return ''.join(clauses)

#single-turn
meddia_single['question'] = meddia_single['question'].apply(remove_polite_clauses_zh)
meddia_single['answer'] = meddia_single['answer'].apply(remove_polite_clauses_zh)

In [None]:
#multi-turn

筛除数据：
1. 卡长度：问题 20 中文字以上，回答 30 中文字以上 （single-turn）
2. 筛除可能包含人名、邮箱、网址、地址、电话号码、各种编号或识别码的问答对：
    a. 模糊识别并删除包含邮箱或网址的问答对（因为格式不一定标准，所以很难删干净，干脆删除整个问答对）
    b. 通过“号”“码”“址”模糊识别包含各种编号、识别码、地址或网址的问答对。
    c. 匹配并删除包含全国省市名称的问答对
    d. 匹配并删除包含“百家姓+0~3 个汉字+各种医生称谓”的问答对

In [270]:
#卡长度：问题 20 个中文字以上，回答 30 个中文字以上
def count_chinese_chars(text):
    return len(re.findall(r'[\u4e00-\u9fff]', text))

#single-turn
print(meddia_single.shape)
meddia_single_filtered = meddia_single[(meddia_single['question'].apply(count_chinese_chars) > 20) & (meddia_single['answer'].apply(count_chinese_chars) > 30)]
print(meddia_single_filtered.shape)

(221615, 2)
(68297, 2)


In [None]:
#multi-turn

In [None]:
#识别包含网址或邮箱的文字（模糊识别，不要求精确格式）
def contains_sites_or_emails(text):
    pattern = re.compile(r'(www|http|com|cn|edu|org|net|gov|@)')
    return bool(pattern.search(text))

#删除包含网址或邮箱的问答对

#single-turn
meddia_single_filtered = meddia_single_filtered[(~meddia_single['question'].apply(contains_sites_or_emails)) & (~meddia_single['answer'].apply(contains_sites_or_emails))]
meddia_single_filtered.shape

In [None]:
#multi-turn

In [279]:
#删除包含“号”、“码”字的问答对
#虽然可能误识，但可以扫除大部分包含各种识别号的问答对
#同理，同时删除包含“址”的问答对
def contains_codes(text):
    return bool(re.compile(r'[号|码|址]').search(text))

#single-turn
meddia_single_filtered = meddia_single_filtered[(~meddia_single['question'].apply(contains_codes)) & (~meddia_single['answer'].apply(contains_codes))]
meddia_single_filtered.shape

  meddia_single_filtered = meddia_single_filtered[(~meddia_single['question'].apply(contains_codes)) & (~meddia_single['answer'].apply(contains_codes))]


(61761, 2)

In [None]:
#multi-turn

In [296]:
#简易并且保守地识别人名、地名：人名使用百家姓+中间零到三个字+医生/教授/老师/大夫/主任/副主任....，地名使用中国城市名列表
baijiaxing = ["王", "李", "张", "刘", "陈", "杨", "赵", "黄", "周", "吴","徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗","梁", "宋", "郑", "谢", "韩", "唐", "冯", "于", "董", "萧","程", "曹", "袁", "邓", "许", "傅", "沈", "曾", "彭", "吕",
    "苏", "卢", "蒋", "蔡", "贾", "丁", "魏", "薛", "叶", "阎","余", "潘", "杜", "戴", "夏", "钟", "汪", "田", "任", "姜","范", "方", "石", "姚", "谭", "廖", "邹", "熊", "金", "陆","郝", "孔", "白", "崔", "康", "毛", "邱", "秦", "江", "史",
    "顾", "侯", "邵", "孟", "龙", "万", "段", "章", "钱", "汤","尹", "黎", "易", "常", "武", "乔", "贺", "赖", "龚", "文"]

doctor_titles = ["某某某","xxx","某某","某","x","xx","医生", "医师", "大夫", "教授", "主任", "副主任", "博导", "博士","院长", "副院长", "主治医师", "住院医师", "实习医生", "专家", "讲师", "助教", "导师", "老师", "护士长", "技师", "治疗师", "师傅"]

def contains_person_names(text):
    surname_pattern = f"({'|'.join(baijiaxing)})"
    title_pattern = f"({'|'.join(doctor_titles)})"

    # 匹配：姓 + 0~3个汉字 + 职称
    pattern = re.compile(rf"{surname_pattern}[\u4e00-\u9fa5]{{0,3}}{title_pattern}")
    
    return bool(pattern.search(text))

In [None]:
# 删除含有医生名字的问答对

# single-turn
meddia_single_filtered = meddia_single_filtered[(~meddia_single_filtered['answer'].apply(contains_person_names))
                                                & (~meddia_single_filtered['question'].apply(contains_person_names))]
meddia_single_filtered.shape

(57885, 2)

In [None]:
#multi-turn

In [300]:
chn_cities = ["北京", "天津", "上海", "重庆","河北", "山西", "辽宁", "吉林", "黑龙江","江苏", "浙江", "安徽", "福建", "江西", "山东","河南", "湖北", "湖南", "广东", "海南","四川", "贵州", "云南", "陕西", "甘肃", "青海","内蒙古", "广西", "西藏", "宁夏", "新疆"
              "石家庄", "唐山", "秦皇岛", "邯郸", "邢台", "保定", "张家口", "承德", "沧州", "廊坊", "衡水", "太原", "大同", "阳泉", "长治", "晋城", "朔州", "晋中", "运城", "忻州", "临汾", "吕梁", "沈阳", "大连", "鞍山", "抚顺", "本溪", "丹东", "锦州", "营口", "阜新", "辽阳", "盘锦", "铁岭", "朝阳", "葫芦岛",
              "乌鲁木齐", "克拉玛依", "吐鲁番", "哈密", "昌吉", "博尔塔拉", "巴音郭楞", "阿克苏", "克孜勒苏", "喀什", "和田", "伊犁", "塔城", "阿勒泰", "石河子", "阿拉尔", "图木舒克", "五家渠", "北屯", "铁门关", "双河", "可克达拉", "昆玉", "胡杨河",
              "银川", "石嘴山", "吴忠", "固原", "中卫", "拉萨", "日喀则", "昌都", "林芝", "山南", "那曲", "阿里", "呼和浩特", "包头", "乌海", "赤峰", "通辽", "鄂尔多斯", "呼伦贝尔", "巴彦淖尔", "乌兰察布", "兴安", "锡林郭勒", "阿拉善",
              "南宁", "柳州", "桂林", "梧州", "北海", "防城港", "钦州", "贵港", "玉林", "百色", "贺州", "河池", "来宾", "崇左", "西宁", "海东", "海北", "黄南", "海南", "果洛", "玉树", "海西", "昆明", "曲靖", "玉溪", "保山", "昭通", "丽江", "普洱", 
              "临沧", "楚雄", "红河", "文山", "西双版纳", "大理", "德宏", "怒江", "迪庆", "西安", "铜川", "宝鸡", "咸阳", "渭南", "延安", "汉中", "榆林", "安康", "商洛", "兰州", "嘉峪关", "金昌", "白银", "天水", "武威", "张掖", "平凉", "酒泉", "庆阳", "定西", "陇南", "临夏", "甘南",
              "成都", "自贡", "攀枝花", "泸州", "德阳", "绵阳", "广元", "遂宁", "内江", "乐山", "南充", "眉山", "宜宾", "广安", "达州", "雅安", "巴中", "资阳", "阿坝", "甘孜", "凉山", "贵阳", "六盘水", "遵义", "安顺", "毕节", "铜仁", "黔西南", "黔东南", "黔南", "广州", "深圳", "珠海", "汕头", "韶关", "佛山", "江门", "湛江", "茂名", "肇庆", "惠州", "梅州", "汕尾", "河源", "阳江", "清远", "东莞", "中山", "潮州", "揭阳", "云浮", "海口", "三亚", "三沙", "儋州",
              "郑州", "开封", "洛阳", "平顶山", "安阳", "鹤壁", "新乡", "焦作", "濮阳", "许昌", "漯河", "三门峡", "南阳", "商丘", "信阳", "周口", "驻马店", "济源", "武汉", "黄石", "十堰", "宜昌", "襄阳", "鄂州", "荆门", "孝感", "荆州", "黄冈", "咸宁", "随州", "恩施", "长沙", "株洲", "湘潭", "衡阳", "邵阳", "岳阳", "常德", "张家界", "益阳", "郴州", "永州", "怀化", "娄底", "湘西",
              "福州", "厦门", "莆田", "三明", "泉州", "漳州", "南平", "龙岩", "宁德", "南昌", "景德镇", "萍乡", "九江", "新余", "鹰潭", "赣州", "吉安", "宜春", "抚州", "上饶", "济南", "青岛", "淄博", "枣庄", "东营", "烟台", "潍坊", "济宁", "泰安", "威海", "日照", "滨州", "德州", "聊城", "临沂", "菏泽",
              "南京", "无锡", "徐州", "常州", "苏州", "南通", "连云港", "淮安", "盐城", "扬州", "镇江", "泰州", "宿迁", "杭州", "宁波", "温州", "嘉兴", "湖州", "绍兴", "金华", "衢州", "舟山", "台州", "丽水""合肥", "芜湖", "蚌埠", "淮南", "马鞍山", "淮北", "铜陵", "安庆", "黄山", "滁州", "阜阳", "宿州", "六安", "亳州", "池州", "宣城","长春", "吉林", "四平", "辽源", "通化", "白山", "松原", "白城", "延边", "哈尔滨", "齐齐哈尔", "牡丹江", "佳木斯", "大庆", "伊春", "鸡西", "鹤岗", "双鸭山", "七台河", "黑河", "绥化", "大兴安岭"
            ]
# print(len(chn_cities))
def contains_addresses(text):
    city_pattern = re.compile(rf"({'|'.join(chn_cities)})")
    return bool(city_pattern.search(text))

In [None]:
# 删除含有省级或市级地区名的问答对

# single-turn
meddia_single_filtered = meddia_single_filtered[(~meddia_single_filtered['answer'].apply(contains_addresses))
                                                & (~meddia_single_filtered['question'].apply(contains_addresses))]
meddia_single_filtered.shape

(50142, 2)

In [None]:
#multi-turn

质量清洗：用回答内包含的医学名词的个数代表质量，删除回答内包含零个医学名词的问答对，记录名词个数信息并按照名词个数降序排序。

In [328]:
#识别包含医学名词的文字并返回个数
#词汇表来自 https://github.com/chun19920827/medlist
import regex

with open('medical_vocab\word_list.txt', 'r') as f:
    med_words_ch = f.readlines()

med_words_ch = [w.strip() for w in med_words_ch]

#对于列表中所有包含标点符号的名词，添加一个没有标点符号的版本
#escape 列表中的所有 string
temp = []
for s in med_words_ch:
    temp.append(re.escape(s))
    if regex.search(r'\p{P}', s):
        temp.append(re.escape(regex.sub(r'\p{P}+', '', s)))
med_words_ch = [w for w in temp if len(w)>0]

def count_med_words_ch(text):
    terms_pattern = re.compile(rf"({'|'.join(med_words_ch)})")
    # 保存非重叠最长匹配
    matches = []
    last_end = -1

    for match in terms_pattern.finditer(text):
        start, end = match.span()
        # 仅当此匹配不与上一个匹配重叠时保留
        if start >= last_end:
            matches.append(match.group())
            last_end = end  # 更新上一个匹配的结束位置

    return len(matches)

In [None]:
# single-turn
meddia_single_filtered['med_count'] = meddia_single_filtered['answer'].apply(count_med_words_ch)

# 过滤掉没有医学词的行，并按匹配数量降序排序
meddia_single_filtered = meddia_single_filtered[meddia_single_filtered['med_count'] > 0].sort_values(by='med_count', ascending=False).reset_index(drop=True)
meddia_single_filtered.shape

(46378, 3)

In [None]:
# multi-turn

In [None]:
#最终整理格式并存储

#single-turn
meddia_single_cleaned = pd.DataFrame()
meddia_single_cleaned['dialogue_id'] = 'dia' + meddia_single_filtered.index.astype(str).str.zfill(7)
meddia_single_cleaned['med_terms_count'] = meddia_single_filtered['med_count']
meddia_single_cleaned['turns'] = meddia_single_filtered.apply(lambda x:[{'role':'patient', 'utterance': x['question'][3:]},{'role':'doctor', 'utterance': x['answer'][3:]}], axis=1)
# meddia_single_cleaned.head(3)['turns']

In [336]:
meddia_single_cleaned.to_json('data/meddia_ch_single.jsonl', orient='records', lines=True, force_ascii=False)

In [None]:
#multi-turns

下面是在检查英文数据里的医学名词阶段，已经废除的方法：用 CHV vocab，包含一些医学名词的口语化表达和包含多个词的短语，然后再使用 spacy phrase matcher 检查回答内是否包含医学名词。
由于 CHV vocab 最终包含过多日常化和口语化的名词，而 spacy phrase matcher 又不够精确所以废除。

In [None]:
# import pandas as pd
# chv_vocab = pd.read_csv('CHV_vocab\CHV_concepts_terms_flatfile_20110204.tsv',sep='\t')
# chv_vocab.shape

In [None]:
# chv_vocab = chv_vocab[chv_vocab.columns[2]]
# chv_vocab.tail(10)

In [None]:
# chv_vocab_set = chv_vocab.to_list()
# chv_vocab_set = {v for v in chv_vocab_set if isinstance(v, str) and ',' not in v and '/' not in v}
# len(chv_vocab_set)

In [None]:
# import spacy
# from spacy.matcher import PhraseMatcher

# nlp = spacy.blank("en")
# matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# # chv_vocab_set = {v for v in chv_vocab_set if v not in nlp.Defaults.stop_words}
# # terms = list(chv_vocab_set)
# patterns = [nlp.make_doc(text) for text in terms]
# matcher.add("MED_TERM", patterns)


In [None]:
# def has_medical_term(text):
#     doc = nlp(text)
#     matches = matcher(doc)
#     #利用 spacy 的实体标签来过滤 chv 词汇库里的部分非医学专有名词
#     valid_matches = []
    
#     for match_id, start, end in matches:
#         span = doc[start:end]
        
#         if not any(ent.label_ in ["PERSON","NORP","FAC","ORG","GPE","LOC","PRODUCT","WORK_OF_ART","LAW","LANGUAGE","DATE","TIME","PERCENT","MONEY","QUANTITY","ORDINAL","CARDINAL"]
#                    for ent in doc.ents[start:end]):
#             valid_matches.append((match_id, start, end))
    
#     return len(valid_matches) > 0
#     # return len(matches) >0