In [1]:
import pandas as pd
import os
from tqdm import tqdm

In [2]:
file_dir = 'Chinese_Medical_Dialogue_Data'  # path to your data directory
def getFlist(path):
    """Walk ``path``, print every directory visited, and return its top-level file names.

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    list of str
        Names (not full paths) of the files located directly inside ``path``,
        sorted so the order does not depend on the filesystem. An empty list
        is returned when ``path`` does not exist or cannot be walked.
    """
    top_level_files = []
    for depth, (root, dirs, files) in enumerate(os.walk(path)):
        print('root_dir:', root)  # directory currently being visited
        print('sub_dirs:', dirs)   # sub-directories of that directory
        print('files:', files)     # file names in that directory (a list)
        # os.walk is top-down by default, so the first tuple describes `path`
        # itself; files of nested folders must not replace that listing.
        if depth == 0:
            top_level_files = sorted(files)
    return top_level_files
# File names reported by getFlist for file_dir; later cells build the input
# paths and the output CSV names from this list.
file_name = getFlist(file_dir)

root_dir: Chinese_Medical_Dialogue_Data
sub_dirs: []
files: ['ErKe-14000.xlsx', 'FuChanKe-28000.xlsx', 'NanKe-13000.xlsx', 'NeiKe-33000.xlsx', 'WaiKe-14000.xlsx', 'ZhongLiuKe-10000.xlsx']


In [3]:
# Build the path of every workbook. os.path.join picks the separator of the
# host OS, so the notebook also runs outside Windows (on Windows the result
# is the same as the former hard-coded '\\' join).
file_path = [os.path.join(file_dir, item) for item in file_name]
file_path

['Chinese_Medical_Dialogue_Data\\ErKe-14000.xlsx',
 'Chinese_Medical_Dialogue_Data\\FuChanKe-28000.xlsx',
 'Chinese_Medical_Dialogue_Data\\NanKe-13000.xlsx',
 'Chinese_Medical_Dialogue_Data\\NeiKe-33000.xlsx',
 'Chinese_Medical_Dialogue_Data\\WaiKe-14000.xlsx',
 'Chinese_Medical_Dialogue_Data\\ZhongLiuKe-10000.xlsx']

In [4]:


# Read each workbook into a DataFrame and record, in a 'file_name' column,
# the path it was loaded from.
df_all = []
for name in tqdm(file_path):
    df = pd.read_excel(name).assign(file_name=name)
    df_all.append(df)

100%|██████████| 6/6 [01:06<00:00, 11.05s/it]


In [5]:
def regulate_expression(x):
    """Remove newlines from a title and make sure it ends with a question mark.

    Parameters
    ----------
    x : str
        Raw title text.

    Returns
    -------
    str
        The title without ``'\\n'`` characters. A full-width ``'？'`` is
        appended unless the text already ends with ``'？'`` or ``'?'``.
        A title that is empty once newlines are removed is returned as ``''``
        instead of raising ``IndexError``.
    """
    x = x.replace('\n', '')
    if not x:
        # Nothing to punctuate; indexing x[-1] here used to raise IndexError.
        return x
    # Titles ending in '吗' or '原因' need no special case: they do not end
    # with a question mark, so this single check already covers them.
    if not x.endswith(('？', '?')):
        x = x + '？'
    return x


def replace_line(x):
    """Return ``x`` with every newline character removed."""
    # Splitting on '\n' and re-joining with nothing drops each newline.
    return ''.join(x.split('\n'))


def _clean_dialogues(df, max_len=320):
    """Clean one department frame and build its 'sequence' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the 'title', 'ask' and 'answer' columns.
    max_len : int, optional
        Longest allowed 'sequence' (in characters); longer rows are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with a fresh 0..n-1 index.
    """
    # Drop rows whose question is the placeholder '无' and rows with any
    # missing value.
    tdf = df[df['ask'] != '无'].dropna().reset_index(drop=True)

    # Keep only rows whose 'ask' is a string. Testing against str directly
    # (rather than against the type of the first row) stays correct when the
    # first row is itself not a string, and also works on an empty frame.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_text = tdf['ask'].map(lambda value: isinstance(value, str)).astype(bool)
    tdf = tdf[is_text].reset_index(drop=True)

    # Normalise the three text fields, then join them into one input sentence.
    tdf = tdf.assign(
        title=tdf['title'].apply(regulate_expression),
        ask=tdf['ask'].apply(replace_line),
        answer=tdf['answer'].apply(replace_line),
    )
    tdf = tdf.assign(sequence=tdf['title'] + tdf['ask'] + tdf['answer'])

    # Drop sentences longer than max_len characters.
    too_long = tdf['sequence'].map(len) > max_len
    return tdf[~too_long].dropna().reset_index(drop=True)


# One cleaned frame per input frame, in the same order as df_all.
ndf_all = [_clean_dialogues(df) for df in tqdm(df_all)]

100%|██████████| 6/6 [00:08<00:00,  1.48s/it]


In [6]:
# Track the row count of the smallest cleaned frame, printing each count.
# The sentinel starts far above any realistic row count.
min_num = 1e10
for df in tqdm(ndf_all):
    n_rows = len(df)
    if n_rows <= min_num:
        min_num = n_rows
    print(n_rows)
min_num

100%|██████████| 6/6 [00:00<?, ?it/s]

80241
138586
83140
168853
92569
62542





62542

In [7]:
# Fixed seed: without it every run draws different rows, so the exported
# CSVs and the train/valid/test splits built from sdf_all are not reproducible.
SAMPLE_SEED = 42

# Down-sample every frame to min_num rows so all frames have the same size.
sdf_all = []
for df in tqdm(ndf_all):
    sdf = df.sample(n=min_num, random_state=SAMPLE_SEED)
    print(len(sdf))
    sdf_all.append(sdf.reset_index(drop=True))

100%|██████████| 6/6 [00:00<00:00, 56.60it/s]

62542
62542
62542
62542
62542
62542





In [8]:
# Output directory, kept with a trailing separator so plain string
# concatenation with a file name still works. Joining with '' yields
# "Preprocess_Data\\" on Windows and "Preprocess_Data/" elsewhere.
processed_path = os.path.join("Preprocess_Data", "")
# to_csv does not create missing directories, so make sure it exists.
os.makedirs(processed_path, exist_ok=True)

# Write each balanced frame to "<prefix>.csv", where the prefix is the part
# of the source file name before the first '-'. sdf_all and file_name are
# index-aligned.
for idx, df in enumerate(tqdm(sdf_all)):
    f_name = file_name[idx].split('-')[0]
    df.to_csv(os.path.join(processed_path, f_name + '.csv'))

100%|██████████| 6/6 [00:05<00:00,  1.20it/s]


In [9]:
# Rows taken from each frame for the three splits.
train_num = 15000
valid_num = 2500
test_num = 2500

# Row offsets at which the validation and test slices end.
valid_end = train_num + valid_num
test_end = valid_end + test_num

# Cut consecutive, non-overlapping row ranges out of every frame:
# [0, train_num) -> train, [train_num, valid_end) -> valid,
# [valid_end, test_end) -> test.
train_df_all, valid_df_all, test_df_all = [], [], []
for df in tqdm(sdf_all):
    train_df_all.append(df.iloc[:train_num])
    valid_df_all.append(df.iloc[train_num:valid_end])
    test_df_all.append(df.iloc[valid_end:test_end])

100%|██████████| 6/6 [00:00<00:00, 299.99it/s]


In [10]:
# Sanity check: sizes of the train / valid / test slices cut from the first frame.
len(train_df_all[0]), len(valid_df_all[0]), len(test_df_all[0])

(15000, 2500, 2500)

In [11]:
# Stack the per-frame slices of each split into one frame with a fresh index.
train_df, valid_df, test_df = (
    pd.concat(parts, ignore_index=True)
    for parts in (train_df_all, valid_df_all, test_df_all)
)
tuple(map(len, (train_df, valid_df, test_df)))

(90000, 15000, 15000)

In [12]:
# Keep only the assembled 'sequence' text of each split.
train_corpus, valid_corpus, test_corpus = (
    split['sequence'] for split in (train_df, valid_df, test_df)
)

In [13]:
# Write each split to its own UTF-8 text file, one sequence per line.
# Each file is opened only while it is being written, and the `with` block
# closes it even if a write raises, so no handle is left open.
for out_name, corpus in (
    ("train.txt", train_corpus),
    ("valid.txt", valid_corpus),
    ("test.txt", test_corpus),
):
    with open(out_name, "w", encoding='utf-8') as f_out:
        f_out.writelines(item + '\n' for item in corpus)

In [14]:
# Rows taken from each frame for the three "tiny" splits.
tiny_train_num = 1500
tiny_valid_num = 250
tiny_test_num = 250

# Row offsets at which the tiny validation and tiny test slices end.
tiny_valid_end = tiny_train_num + tiny_valid_num
tiny_test_end = tiny_valid_end + tiny_test_num

# NOTE(review): these slices cover rows 0..1999 of each frame in sdf_all, the
# same rows the full split assigns to its training range (rows 0..14999).
# Confirm the tiny valid/test sets are never used to evaluate a model trained
# on the full training set.
tiny_train_df_all, tiny_valid_df_all, tiny_test_df_all = [], [], []
for df in tqdm(sdf_all):
    tiny_train_df_all.append(df.iloc[:tiny_train_num])
    tiny_valid_df_all.append(df.iloc[tiny_train_num:tiny_valid_end])
    tiny_test_df_all.append(df.iloc[tiny_valid_end:tiny_test_end])

100%|██████████| 6/6 [00:00<00:00, 6000.43it/s]


In [15]:
# Sanity check: sizes of the tiny train / valid / test slices cut from the first frame.
len(tiny_train_df_all[0]), len(tiny_valid_df_all[0]), len(tiny_test_df_all[0])

(1500, 250, 250)

In [16]:
# Stack the per-frame tiny slices of each split into one frame with a fresh index.
tiny_train_df, tiny_valid_df, tiny_test_df = (
    pd.concat(parts, ignore_index=True)
    for parts in (tiny_train_df_all, tiny_valid_df_all, tiny_test_df_all)
)
tuple(map(len, (tiny_train_df, tiny_valid_df, tiny_test_df)))

(9000, 1500, 1500)

In [17]:
# Keep only the assembled 'sequence' text of each tiny split.
tiny_train_corpus, tiny_valid_corpus, tiny_test_corpus = (
    split['sequence'] for split in (tiny_train_df, tiny_valid_df, tiny_test_df)
)

In [18]:
# Write each tiny split to its own UTF-8 text file, one sequence per line.
# Each file is opened only while it is being written, and the `with` block
# closes it even if a write raises, so no handle is left open.
for out_name, corpus in (
    ("tiny_train.txt", tiny_train_corpus),
    ("tiny_valid.txt", tiny_valid_corpus),
    ("tiny_test.txt", tiny_test_corpus),
):
    with open(out_name, "w", encoding='utf-8') as f_out:
        f_out.writelines(item + '\n' for item in corpus)