In [264]:
import os
import shutil
import jieba
import random
import imblearn
import numpy as np
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier

## Data Preparation

In [None]:
# Directory holding the raw record files. Kept as a string with a trailing
# slash because the truncation cell builds file paths with `path+filename`.
path = 'data/'
# Names of every entry in that directory (os.listdir returns them in
# arbitrary order).
filelist = os.listdir(path)

In [250]:
def trancate_txt(srgname,trgname):
    """Copy the clinically relevant part of one record file to a new file.

    The source is read as UTF-8 (BOM tolerated). Every line has ASCII spaces
    and full-width spaces (U+3000) removed, then:

    * blank lines are dropped;
    * everything before the first line containing '主诉' is dropped;
    * lines mentioning '既往史', '过敏史', '个人史', '婚育史' or '家族史'
      are dropped;
    * copying stops at the first line containing '体格检查'. That line is
      written, followed by the next line, and also the line after it when
      the next line is shorter than 100 characters.

    Parameters
    ----------
    srgname : str
        Path of the source text file.
    trgname : str
        Path of the file to create or overwrite with the truncated text.

    Notes
    -----
    Lines that would lie past the end of the file are treated as empty, so a
    record ending right after '体格检查' no longer raises IndexError. A
    following line of exactly 100 characters is treated like a long line
    (written on its own) instead of being silently dropped.
    """
    skipped_sections = ('既往史','过敏史','个人史','婚育史','家族史')

    def _strip_spaces(text):
        # Remove ASCII and full-width spaces; newlines are kept.
        return text.replace(' ','').replace('\u3000','')

    with open(srgname,'r',encoding = 'UTF-8-sig') as srg:
        lines = srg.readlines()
    length = len(lines)

    def _line_at(index):
        # Cleaned line at `index`, or '' when the file is shorter than that.
        return _strip_spaces(lines[index]) if index < length else ''

    # The `with` block guarantees the target is closed even if a write fails.
    with open(trgname,'w',encoding = 'UTF-8-sig') as trg:
        started = False
        for i in range(length):
            line = _strip_spaces(lines[i])
            if line == '\n':
                continue
            if '主诉' in line:
                started = True
            if not started:
                continue
            if any(section in line for section in skipped_sections):
                continue
            if '体格检查' in line:
                trg.write(line)
                line1 = _line_at(i+1)
                trg.write(line1)
                # A short follow-up line is assumed to continue on the next
                # line, so that one is copied as well.
                if len(line1)<100:
                    trg.write(_line_at(i+2))
                break
            trg.write(line)

In [None]:
# Truncate every raw record into 'data_trancated/' under the same file name.
# The output directory is created first so that a fresh checkout does not
# fail with FileNotFoundError when trancate_txt opens its target file.
os.makedirs('data_trancated', exist_ok=True)
for filename in filelist:
    trancate_txt(path+filename,'data_trancated/'+filename)

In [None]:
# Keep only the first two lines of each truncated record and store them in
# 'symptoms/' under the same file name.
os.makedirs('symptoms', exist_ok=True)
for filename in os.listdir('data_trancated/'):
    # Context managers close both files even if reading or writing fails.
    with open('data_trancated/'+filename,'r',encoding = 'UTF-8-sig') as src, \
         open('symptoms/'+filename,'w',encoding = 'UTF-8-sig') as trg:
        trg.write(src.readline())
        trg.write(src.readline())

In [2]:
# Characters that are blanked out before tokenisation: ASCII digits and
# letters, ASCII and full-width punctuation, and tab/CR/LF.
# The backslash before ':' is written as '\\' because '\:' is an invalid
# escape sequence (a SyntaxWarning on recent Python versions); the resulting
# string is unchanged.
punctuation = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ⊙∩～~`!@#$%^&*()_-+={}[]|\\:;\"\'<>＜＞≪≫,.?/~·•！@#￥%……&*（）\\\
——-—+=｛【】｝』『「」〖〗|、：；“”‘’《》，。？、/*-+.\t\r\n'
# Translation table mapping each of those characters to a single space.
trans = str.maketrans(punctuation,' '*len(punctuation))

In [3]:
# Names of the symptom files. os.listdir returns entries in arbitrary order,
# so the list is sorted to keep document order (and therefore the order of
# `data` and `labels` built from it) reproducible across runs and machines.
filenamelist = sorted(os.listdir('symptoms/'))

In [4]:
%%time
data = []
for name in filenamelist:
    file = open('symptoms/'+name,'r',encoding = 'UTF-8-sig')
    content = file.read()
    file.close()
    content = content.translate(trans)
    content = jieba.lcut(content)
    data.append(' '.join(content).split())

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Icarus\AppData\Local\Temp\jieba.cache
Loading model cost 0.704 seconds.
Prefix dict has been built succesfully.


Wall time: 3.35 s


In [5]:
# Derive one integer class label (1-based position in `illnessname`) per
# file from the part of the file name before the first underscore.
labels = []
illnessname = ['肾病','酮症','心脏病','眼病','周围神经病','足病']
illness_to_label = {illness: idx + 1 for idx, illness in enumerate(illnessname)}
for name in filenamelist:
    illness = name.split('_')[0]
    if illness not in illness_to_label:
        # Skipping silently would leave `labels` shorter than `data` and
        # shift every later label onto the wrong document.
        raise ValueError('Unknown illness prefix in file name: {!r}'.format(name))
    labels.append(illness_to_label[illness])

In [6]:
# Load the stop-word list: one entry per line of 'stopwords.txt'.
with open('stopwords.txt', mode='r', encoding='UTF-8-sig') as file:
    stopwords = file.read().split('\n')

In [7]:
# Drop every stop word from every tokenised document.
# A set gives constant-time membership tests, and a comprehension avoids the
# repeated list.remove() scans; the resulting token lists are the same as
# removing each stop-word occurrence one by one.
stopword_set = set(stopwords)
data_tmp = data.copy()
data = [[word for word in line if word not in stopword_set] for line in data_tmp]

## Vectorization

In [108]:
# Wrap each token list in a TaggedDocument tagged with its position in `data`.
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(data)
]

In [172]:
%%time
# Train a Doc2Vec model on all tagged documents: 256-dimensional vectors,
# context window of 3, words occurring fewer than 10 times ignored, 8 workers.
# NOTE(review): no seed is passed and several workers are used, so the
# learned vectors presumably differ between runs — confirm against the
# gensim documentation if reproducibility matters.
model = Doc2Vec(documents, vector_size=256, window=3, min_count=10, workers=8)

  "C extension not loaded, training will be slow. "


Wall time: 4min 1s


## Training

In [173]:
def data_summary(datalabel, n_classes=6):
    """Print how many samples carry each class label.

    Parameters
    ----------
    datalabel : array-like of int
        Class labels; classes are expected to be numbered 1..n_classes.
    n_classes : int, optional
        Number of classes to report. Defaults to 6, the number of entries
        this function always reported before the parameter existed.

    Notes
    -----
    The printed text no longer says "in training data": the function is also
    called on validation and full-data labels, where that wording was wrong.
    """
    # Accept plain lists as well as arrays; `list == int` would not broadcast.
    datalabel = np.asarray(datalabel)
    for class_id in range(1, n_classes + 1):
        print("Class {} count:".format(class_id),(datalabel == class_id).sum())

In [174]:
# Infer one Doc2Vec vector per tokenised document and stack them row-wise.
DocData = np.array([
    model.infer_vector(tokens)
    for tokens in data
])

In [175]:
# Convert the label list to a NumPy array so it can be indexed with index
# arrays (labels[train_index], labels[valid_index]) in the cells below.
labels = np.array(labels)

In [434]:
# Hold out 300 randomly chosen documents for validation; the rest is used for
# training. A dedicated, seeded generator makes the split (and every metric
# reported below) reproducible without touching the global `random` state.
VALID_SIZE = 300
SPLIT_SEED = 42
n_docs = DocData.shape[0]
valid_index = np.array(random.Random(SPLIT_SEED).sample(range(n_docs),VALID_SIZE))
train_index = np.setdiff1d(np.arange(n_docs),valid_index)

In [435]:
# Report the per-class sample counts of both splits, training first.
for heading, subset_index in (("In training data before sampling:", train_index),
                              ('In validation data:', valid_index)):
    print(heading)
    data_summary(labels[subset_index])

In training data before sampling:
Class 1 in training data: 373
Class 2 in training data: 230
Class 3 in training data: 8
Class 4 in training data: 110
Class 5 in training data: 254
Class 6 in training data: 77
In validation data:
Class 1 in training data: 94
Class 2 in training data: 69
Class 3 in training data: 7
Class 4 in training data: 32
Class 5 in training data: 75
Class 6 in training data: 23


In [436]:
# Balance the training split with ADASYN oversampling. `fit_resample` is
# used because the older `fit_sample` alias was deprecated and later removed
# from imbalanced-learn.
train_data,train_label = imblearn.over_sampling.ADASYN(n_neighbors=5).fit_resample(DocData[train_index],labels[train_index])

In [437]:
# Per-class counts of the training split after ADASYN oversampling.
print("In training data after sampling:")
data_summary(train_label)

In training data after sampling:
Class 1 in training data: 373
Class 2 in training data: 366
Class 3 in training data: 376
Class 4 in training data: 371
Class 5 in training data: 367
Class 6 in training data: 399


In [438]:
# One-vs-one ensemble of identical three-hidden-layer MLPs (100/50/25 units),
# fitted on the oversampled training split.
base_mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100,50,25,),
    random_state=1,
)
clf = OneVsOneClassifier(base_mlp)
clf.fit(train_data,train_label)

OneVsOneClassifier(estimator=MLPClassifier(activation='relu', alpha=1e-05,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100, 50, 25),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_iter=200, momentum=0.9,
                                           n_iter_no_change=10,
                                           nesterovs_momentum=True, power_t=0.5,
                                           random_state=1, shuffle=True,
                                           solver='lbfgs', tol=0.0001,
                                           validation_fraction=0.1,
                                           verbose=False, warm_start=Fa

In [439]:
# Score the classifier on the oversampled data it was fitted on.
results = clf.predict(train_data)
train_report = classification_report(train_label, results, digits=5)
print(train_report)

              precision    recall  f1-score   support

           1    0.90323   0.67560   0.77301       373
           2    0.81728   0.90437   0.85863       366
           3    0.99208   1.00000   0.99603       376
           4    0.94366   0.72237   0.81832       371
           5    0.69803   0.86921   0.77427       367
           6    0.88393   0.99248   0.93506       399

    accuracy                        0.86234      2252
   macro avg    0.87304   0.86067   0.85922      2252
weighted avg    0.87390   0.86234   0.86054      2252



In [440]:
# Score the classifier on the held-out validation documents.
valid_vectors = DocData[valid_index]
results = clf.predict(valid_vectors)
valid_report = classification_report(labels[valid_index], results, digits=5)
print(valid_report)

              precision    recall  f1-score   support

           1    0.60606   0.42553   0.50000        94
           2    0.50602   0.60870   0.55263        69
           3    0.50000   0.14286   0.22222         7
           4    0.87500   0.65625   0.75000        32
           5    0.44554   0.60000   0.51136        75
           6    0.33333   0.34783   0.34043        23

    accuracy                        0.52333       300
   macro avg    0.54433   0.46353   0.47944       300
weighted avg    0.54823   0.52333   0.52290       300



In [441]:
# Oversample the complete data set for the final model. `fit_resample`
# is used because the older `fit_sample` alias was deprecated and later
# removed from imbalanced-learn.
pract_data,pract_label = imblearn.over_sampling.ADASYN(n_neighbors=5).fit_resample(DocData,labels)

In [442]:
# Per-class counts after oversampling the complete data set.
data_summary(pract_label)

Class 1 in training data: 467
Class 2 in training data: 478
Class 3 in training data: 468
Class 4 in training data: 468
Class 5 in training data: 473
Class 6 in training data: 472


In [443]:
# Final model: same architecture, refitted on the oversampled full data set
# and then scored on the original (non-oversampled) documents it was built
# from.
final_mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100,50,25,),
    random_state=1,
)
clf = OneVsOneClassifier(final_mlp)
clf.fit(pract_data,pract_label)
results = clf.predict(DocData)
full_report = classification_report(labels, results, digits=5)
print(full_report)

              precision    recall  f1-score   support

           1    0.90000   0.65525   0.75836       467
           2    0.72822   0.69900   0.71331       299
           3    0.13115   0.53333   0.21053        15
           4    0.80124   0.90845   0.85149       142
           5    0.71067   0.76900   0.73869       329
           6    0.64626   0.95000   0.76923       100

    accuracy                        0.73964      1352
   macro avg    0.65292   0.75250   0.67360      1352
weighted avg    0.77827   0.73964   0.74812      1352



## Deploy

In [444]:
# Run the same two preparation steps on the unlabeled records in 'test data/':
# truncate each record, then keep only its first two lines.
# Output directories are created up front so the cell works on a fresh checkout.
os.makedirs('test_trancated', exist_ok=True)
os.makedirs('test_symptoms', exist_ok=True)
for filename in os.listdir("test data/"):
    trancate_txt("test data/"+filename,'test_trancated/'+filename)
for filename in os.listdir('test_trancated/'):
    # Context managers close both files even if reading or writing fails.
    with open('test_trancated/'+filename,'r',encoding = 'UTF-8-sig') as src, \
         open('test_symptoms/'+filename,'w',encoding = 'UTF-8-sig') as trg:
        trg.write(src.readline())
        trg.write(src.readline())

In [445]:
# Tokenise every test symptom file exactly like the training documents:
# blank out the characters in `punctuation`, segment with jieba, and keep
# the non-whitespace tokens.
# NOTE: documents are collected in os.listdir order; the cell that copies the
# files into class folders lists the same directory again and relies on
# getting the same order.
test_data = []
for name in os.listdir("test_symptoms/"):
    # `with` closes the file even if reading or decoding fails.
    with open('test_symptoms/'+name,'r',encoding = 'UTF-8-sig') as file:
        content = file.read()
    content = content.translate(trans)
    content = jieba.lcut(content)
    test_data.append(' '.join(content).split())

In [446]:
# Drop every stop word from every tokenised test document.
# A set gives constant-time membership tests, and a comprehension avoids the
# repeated list.remove() scans; the resulting token lists are the same as
# removing each stop-word occurrence one by one.
stopword_set = set(stopwords)
data_tmp = test_data.copy()
test_data = [[word for word in line if word not in stopword_set] for line in data_tmp]

In [447]:
# Infer one Doc2Vec vector per tokenised test document.
# NOTE(review): `test_data` is rebound from token lists to an array of
# vectors here, so re-running this cell alone would pass vectors (not token
# lists) to infer_vector — re-run the tokenisation cells first.
test_data = np.array([model.infer_vector(line) for line in test_data])

In [448]:
# Predict a class label for every test document vector.
results = clf.predict(test_data)

In [449]:
# Copy each original test record into a folder named after its predicted
# class label.
# NOTE: predictions are matched to files by position, which relies on this
# os.listdir call returning the same order as the one used for tokenisation.
namelist = os.listdir("test_symptoms/")
if len(namelist) != len(results):
    # A length mismatch means predictions can no longer be paired with files.
    raise ValueError('Got {} predictions for {} test files'.format(len(results), len(namelist)))
for name, label in zip(namelist, results):
    target_dir = str(label)
    # Create the class folder on first use so copyfile cannot fail on it.
    os.makedirs(target_dir, exist_ok=True)
    shutil.copyfile("test data/"+name,target_dir+'/'+name)