In [1]:
import pandas as pd
import numpy as np
import jieba; jieba.initialize()
from collections import Counter
import re

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/l6/b9fxvmjn0qj4340p4p7mzptw0000gn/T/jieba.cache
Loading model cost 0.661 seconds.
Prefix dict has been built succesfully.


In [2]:
# Load the raw patient table from the Excel workbook.
# No `encoding` argument is passed: an Excel workbook is not a plain-text file,
# so the keyword has no effect on parsing, and recent pandas releases reject it.
df = pd.read_excel("../data/所有患者信息汇总（删除未表明手术日期并存活至今的患者）.xls")
print(df.shape)
df.head()

(104, 3)


Unnamed: 0,主诉,手术（日期+术式）,生存期
0,头痛恶心伴视物模糊2个月。\n,2017-08-08 神经导航引导下，右额开颅胶质瘤切除术,11.5
1,右下肢疼痛1月余，伴左下肢活动不灵半个月。,2016.04.11\n神经导航下右顶枕开颅右侧侧脑室胶质瘤切除术,11.0
2,言语不清10天余。,2016.10.17术中超声引导下 左侧额叶胶质母细胞瘤切除术\n\n,12.0
3,发现右侧额叶占位病变1个月。,2015.12.04\n冠状切口 右侧额叶胶质瘤切除术\n,12.0
4,精神萎靡伴双下肢无力2天。,2018年4月28日 半冠状切口右额开颅右额占位切除术+颅骨修补术,14.0


# 主诉+词频

In [3]:
# Remove trailing '。' and newline characters from every complaint.
# The vectorised .str accessor passes missing cells through as NaN, whereas
# calling x.rstrip(...) inside a lambda raises AttributeError on a float NaN.
df['主诉'] = df['主诉'].str.rstrip('。\n')
df.head()

Unnamed: 0,主诉,手术（日期+术式）,生存期
0,头痛恶心伴视物模糊2个月,2017-08-08 神经导航引导下，右额开颅胶质瘤切除术,11.5
1,右下肢疼痛1月余，伴左下肢活动不灵半个月,2016.04.11\n神经导航下右顶枕开颅右侧侧脑室胶质瘤切除术,11.0
2,言语不清10天余,2016.10.17术中超声引导下 左侧额叶胶质母细胞瘤切除术\n\n,12.0
3,发现右侧额叶占位病变1个月,2015.12.04\n冠状切口 右侧额叶胶质瘤切除术\n,12.0
4,精神萎靡伴双下肢无力2天,2018年4月28日 半冠状切口右额开颅右额占位切除术+颅骨修补术,14.0


In [4]:
# Terms that jieba should keep as single tokens when segmenting.
whole_words = {'一过性','发作性','伴','不自主活动','面部','视物'}
for term in whole_words:
    jieba.suggest_freq(term, tune=True)

# Drop '胶质瘤' from jieba's dictionary.
jieba.del_word('胶质瘤')

In [5]:
# Segment every complaint (cut_all=False) and pool all tokens into one flat list.
words = [
    token
    for complaint in df['主诉']
    for token in jieba.lcut(complaint, cut_all=False)
]

In [6]:
def is_stopword(token: str) -> bool:
    """Tell whether ``token`` should be ignored when counting frequencies.

    A token counts as a stop word when it consists solely of digits or when
    it appears in the fixed list of punctuation marks and filler terms below.

    Args:
        token: A single token produced by the tokenizer.

    Returns:
        True if the token is a stop word, False otherwise.
    """
    ignored = frozenset((
        '，', '。', '、', ',',
        '小时', '天', '月', '周', '年',
        '个', '伴', '余天', '无', '近',
        '左', '右', '左侧', '右侧', '不能',
        '年余', '月余', '天余', '余', '术后',
        '发现', '反复', '加重', '突发', '半个',
        '半月', '半年',
    ))
    return token.isdigit() or token in ignored

In [7]:
c = Counter(words)  # token -> number of occurrences

# Keep the non-stopword tokens that occur at least 3 times,
# ordered from most to least frequent.
high_freq = [
    token
    for token, count in c.most_common()
    if count >= 3 and not is_stopword(token)
]
print(high_freq)

['头痛', '肢体', '头晕', '瘤', '胶质', '意识', '无力', '活动', '呕吐', '发作性', '抽搐', '恶心', '不灵', '占位', '下肢', '言语', '丧失', '颅内', '麻木', '视物', '不清', '母细胞', '模糊', '障碍', '乏力', '记忆力', '病变', '双下肢', '进行性', '颞', '间断', '右额', '上肢', '细胞', '不稳', '走路', '发作']


## 人工调整

In [8]:
# remove tokens that have been covered by others
remove_list = {'肢体', '母细胞', '胶质', '下肢', '不灵',
               '不清', '颅内', '障碍', '发作性', '丧失',
               '上肢', '视物', '进行性', '颞叶','颞',
               '复发', '病变', '双下肢', '间断', '右额',
               '细胞', '走路', '发作'}
high_freq = [x for x in high_freq if x not in remove_list]

# add a token that is probably useful but only appears twice;
# the membership check avoids a duplicate entry when this cell is re-run
if '失禁' not in high_freq:
    high_freq.append('失禁')
print(high_freq)

['头痛', '头晕', '瘤', '意识', '无力', '活动', '呕吐', '抽搐', '恶心', '占位', '言语', '麻木', '模糊', '乏力', '记忆力', '不稳', '失禁']


In [9]:
# Mapping from a feature's proper name (key) to the set of tokens (value)
# whose presence in a complaint should switch that feature on.
kw_dict = {
    '头痛': {'头痛'},
    '头晕': {'头晕'},
    '胶质瘤手术': {'瘤'},
    '意识障碍': {'意识'},
    '肢体无力': {'无力', '乏力'},
    '肢体活动不灵': {'活动'},
    '呕吐': {'呕吐'},
    '肢体抽搐': {'抽搐'},
    '恶心': {'恶心'},
    '占位性病变': {'占位'},
    '言语障碍': {'言语'},
    '肢体麻木': {'麻木'},
    '视物模糊': {'模糊', '重影'},
    '记忆力减退': {'记忆力'},
    '行走不稳': {'不稳'},
    '大小便失禁': {'失禁'},
}

kw_dict

{'头痛': {'头痛'},
 '头晕': {'头晕'},
 '胶质瘤手术': {'瘤'},
 '意识障碍': {'意识'},
 '肢体无力': {'乏力', '无力'},
 '肢体活动不灵': {'活动'},
 '呕吐': {'呕吐'},
 '肢体抽搐': {'抽搐'},
 '恶心': {'恶心'},
 '占位性病变': {'占位'},
 '言语障碍': {'言语'},
 '肢体麻木': {'麻木'},
 '视物模糊': {'模糊', '重影'},
 '记忆力减退': {'记忆力'},
 '行走不稳': {'不稳'},
 '大小便失禁': {'失禁'}}

## 创建特征集

In [10]:
# All-zero matrix with one row per row of df and one column per kw_dict entry.
one_hot_mtx = np.zeros((len(df), len(kw_dict)))  # shape: (len(df), len(kw_dict))
# Reuse df's index so that later label-based writes and the join on '生存期'
# line up with the source rows even if df is not indexed 0..n-1.
df1h = pd.DataFrame(one_hot_mtx, index=df.index, columns=list(kw_dict))
df1h.head()  # one-hot matrix

Unnamed: 0,头痛,头晕,胶质瘤手术,意识障碍,肢体无力,肢体活动不灵,呕吐,肢体抽搐,恶心,占位性病变,言语障碍,肢体麻木,视物模糊,记忆力减退,行走不稳,大小便失禁
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# build one-hot vector for each instance
# The '主诉' column is iterated by label rather than taking the first field of
# each row positionally, so the loop does not depend on column order or on
# integer-position lookups in a label-indexed Series.
for index, cc in df['主诉'].items():  # (row label, chief complaint)
    for key, value in kw_dict.items():

        # if complaint contains any keyword, set the one-hot to 1
        if any(kw in cc for kw in value):
            df1h.loc[index, key] = 1

df1h.head()

Unnamed: 0,头痛,头晕,胶质瘤手术,意识障碍,肢体无力,肢体活动不灵,呕吐,肢体抽搐,恶心,占位性病变,言语障碍,肢体麻木,视物模糊,记忆力减退,行走不稳,大小便失禁
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# append labels to the dataframe
# The '生存期' column of df is joined onto df1h by row index; how='inner' keeps
# only the index labels present in both frames.
# NOTE(review): df1h is rebound to the joined frame, so running this cell a
# second time would try to add '生存期' again — confirm it runs once per kernel.
df1h = df1h.join(df['生存期'], how='inner')
print(df1h.shape)
df1h.head()

(104, 17)


Unnamed: 0,头痛,头晕,胶质瘤手术,意识障碍,肢体无力,肢体活动不灵,呕吐,肢体抽搐,恶心,占位性病变,言语障碍,肢体麻木,视物模糊,记忆力减退,行走不稳,大小便失禁,生存期
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.5
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0


In [13]:
# write to file
# Saves df1h as an Excel sheet without the row index column.
# NOTE(review): the .xls extension selects the legacy Excel format — confirm
# the installed pandas/engine can still write it.
df1h.to_excel("../data/one-hot矩阵.xls", index=False)

# 手术日期/文字 分开

In [13]:
# Shorten the surgery column's name to '手术'.
df = df.rename(columns={'手术（日期+术式）':'手术'})

In [14]:
def date_match(text: str) -> "pd.Timestamp | None":
    """Extract the first calendar date found in ``text``.

    A date is a 4-digit year, a 1-2 digit month and a 1-2 digit day, separated
    by ``-``, ``.`` or the markers 年 / 月 (optionally followed by 日), e.g.
    ``2017-08-08``, ``2016.04.11`` or ``2018年4月28日``.

    Args:
        text: Free text that may contain a date.

    Returns:
        The date as a ``pd.Timestamp``, or ``None`` when ``text`` is not a
        string or contains no date.

    Raises:
        ValueError: If the matched numbers do not form a valid calendar date.
    """
    # Missing cells are not strings; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None
    match = re.search(r'(\d{4})[-.年](\d{1,2})[-.月](\d{1,2})[-.日]?', text)
    if match is None:
        return None
    year, month, day = (int(part) for part in match.groups())
    # Build the timestamp from explicit fields instead of re-parsing a string.
    return pd.Timestamp(year=year, month=month, day=day)

def remove_date(text: str) -> str:
    """Cut the first date, and everything before it, off the front of ``text``.

    The text up to the end of the first date is discarded, together with any
    non-word characters directly following the date; trailing whitespace is
    then stripped from what remains.

    Args:
        text: Free text that may contain a date.

    Returns:
        The remainder of ``text`` after the date, or ``text`` unchanged when
        it contains no date.
    """
    found = re.search(r'(\d{4})[-.年](\d{1,2})[-.月](\d{1,2})[-.日]?\W*', text)
    if found is None:
        return text
    remainder = text[found.end():]
    return remainder.rstrip()

In [15]:
# extract date from '手术' column (date_match yields None when no date is found)
dates = df['手术'].map(date_match)
# store the dates as a new '手术日期' column at position 1
df.insert(1, '手术日期', dates)

In [16]:
# Keep only the text that follows the date in each '手术' entry.
df['手术'] = df['手术'].apply(remove_date)

In [17]:
# Preview the first rows of df.
df.head()

Unnamed: 0,主诉,手术日期,手术,生存期
0,头痛恶心伴视物模糊2个月,2017-08-08,神经导航引导下，右额开颅胶质瘤切除术,11.5
1,右下肢疼痛1月余，伴左下肢活动不灵半个月,2016-04-11,神经导航下右顶枕开颅右侧侧脑室胶质瘤切除术,11.0
2,言语不清10天余,2016-10-17,术中超声引导下 左侧额叶胶质母细胞瘤切除术,12.0
3,发现右侧额叶占位病变1个月,2015-12-04,冠状切口 右侧额叶胶质瘤切除术,12.0
4,精神萎靡伴双下肢无力2天,2018-04-28,半冠状切口右额开颅右额占位切除术+颅骨修补术,14.0


In [18]:
# Write df through a context-managed ExcelWriter: leaving the `with` block
# closes the writer, which is the step that saves the workbook to disk.
with pd.ExcelWriter("../output/患者信息汇总（手术日期+文字分开）.xls",
                    date_format='YYYY-MM-DD') as excel_writer:
    df.to_excel(excel_writer, index=False)