In [None]:
# Libraries

import pandas as pd
import numpy as np
import pickle
# Disable pandas' chained-assignment warning for this notebook.
pd.set_option('mode.chained_assignment', None)

# Load the data and apply simple preprocessing.
# The description (des) file holds the human-readable English term for each concept.
# The relationship (rel) file holds sourceId -> destinationId edges that form a
# network-like hierarchy; note that it is NOT a simple tree.
df_description = pd.read_csv('sct2_Description_Snapshot-en_INT_20210131.txt', sep='\t')
df_relation = pd.read_csv('sct2_Relationship_Snapshot_INT_20210131.txt', sep='\t')

# Drop the columns that the later cells never read.
df_description = df_description.drop(
    ['effectiveTime', 'moduleId', 'languageCode', 'caseSignificanceId'], axis=1)
df_relation = df_relation.drop(
    ['effectiveTime', 'moduleId', 'relationshipGroup', 'characteristicTypeId', 'modifierId'], axis=1)

# Keep active rows only. `.copy()` makes the filtered frames independent so the
# column assignments below never write into a view of the raw data.
df_description = df_description[df_description['active'] == 1].copy()
df_relation = df_relation[df_relation['active'] == 1].copy()

# Strip bracketed fragments such as "[...]" from the terms (the pattern also
# accepts a backslash as either delimiter), then trailing whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would turn this line into a silent no-op.
df_description['term'] = df_description['term'].str.replace(r"[\\[].*?[\\]]", "", regex=True)
df_description['term'] = df_description['term'].str.rstrip()


In [None]:
def finding(df_relation, list_finding, finding_result):
    """Collect every concept that sits below ``list_finding`` via "is a" edges.

    Starting from the ids in ``list_finding``, repeatedly looks up the rows of
    ``df_relation`` whose ``destinationId`` is in the current level and whose
    ``typeId`` is 116680003 ("is a"), and appends the ``sourceId`` values found
    at each level to ``finding_result``.

    The walk is iterative and expands every concept at most once, so a concept
    reachable through several parents is not re-expanded once per path, and a
    cyclic relationship table cannot cause unbounded recursion.

    Args:
        df_relation: DataFrame with ``sourceId``, ``destinationId`` and
            ``typeId`` columns.
        list_finding: array-like of concept ids to start from.
        finding_result: list that receives one array of ids per level. It is
            modified in place.

    Returns:
        The same ``finding_result`` list. The arrays may overlap; callers that
        need distinct ids should de-duplicate (e.g. ``np.unique``).
    """
    # Filter the "is a" rows once instead of once per level.
    is_a = df_relation.loc[df_relation['typeId'] == 116680003, ['sourceId', 'destinationId']]

    frontier = np.unique(np.asarray(list_finding))
    expanded = set(frontier.tolist())

    while len(frontier) != 0:
        children = np.unique(
            is_a.loc[is_a['destinationId'].isin(frontier), 'sourceId'].values)
        if len(children) == 0:
            break
        finding_result.append(children)

        # Only concepts that have not been expanded yet go to the next level.
        frontier = np.array(
            [child for child in children.tolist() if child not in expanded],
            dtype=children.dtype)
        expanded.update(frontier.tolist())

    return finding_result
    
# Concept ids used by preprocessor().
_IS_A_TYPE_ID = 116680003           # "is a" relationship type
_FINDING_SITE_TYPE_ID = 363698007   # "Finding site (attribute)"
_ROOT_CONCEPT_ID = 138875005        # "SNOMED CT Concept", the top node
_SYNONYM_TYPE_ID = 900000000000013009
_FSN_TYPE_ID = 900000000000003001
_MAX_DEPTH = 30                     # number of depthN / depthNId columns

# Root concept id of the hierarchy selected by each supported `kind`.
_KIND_ROOTS = {
    'body_structure': 123037004,
    'clinical_finding': 404684003,
    'procedure': 71388002,
    'drug1': 410942007,
    'drug2': 373873005,
}


def _group_values(keys, values):
    """Map each key to the list of its values, keeping the input order."""
    grouped = {}
    for key, value in zip(keys, values):
        grouped.setdefault(key, []).append(value)
    return grouped


def _first_value(keys, values):
    """Map each key to the first value that appears for it."""
    first = {}
    for key, value in zip(keys, values):
        first.setdefault(key, value)
    return first


def _ancestor_chain(concept_id, parents_by_concept, max_depth=_MAX_DEPTH):
    """Return up to ``max_depth`` ancestor ids, nearest first, excluding the root.

    At every step the first listed parent is followed.
    """
    chain = []
    current = concept_id
    while len(chain) < max_depth:
        parents = parents_by_concept.get(current)
        if not parents:
            break
        current = parents[0]
        if current == _ROOT_CONCEPT_ID:
            break
        chain.append(current)
    return chain


def preprocessor(df_description, df_relation, kind):
    """Build the flat term table for one top-level hierarchy.

    Selects every concept below the root chosen by ``kind`` (using
    ``finding`` over "is a" relationships), splits its descriptions into FSN
    rows and short synonym rows, attaches a finding site (or, when the concept
    has none, its first parent) and writes the chain of ancestors into
    ``depth1`` .. ``depth30`` (``depth1`` is the ancestor closest to the root).

    Args:
        df_description: description table with at least ``conceptId``,
            ``typeId``, ``term`` and ``active`` columns.
        df_relation: relationship table with ``sourceId``, ``destinationId``
            and ``typeId`` columns.
        kind: one of ``'body_structure'``, ``'clinical_finding'``,
            ``'procedure'``, ``'drug1'``, ``'drug2'``.

    Returns:
        DataFrame with the description columns plus ``disorder``, ``synonym``,
        ``word``, ``isFSN``, ``finding site``, ``finding``, ``depth1``..
        ``depth30``, ``depth`` and ``depth1Id``..``depth30Id``. Missing values
        are filled with the string ``'0'``.

    Raises:
        ValueError: if ``kind`` is not one of the supported names.
        KeyError: if an ancestor concept has no term in ``df_description``.
    """
    if kind not in _KIND_ROOTS:
        # Previously an unknown kind left `kind_id` unbound and failed later
        # with an unrelated-looking error.
        raise ValueError(
            f"unknown kind {kind!r}; expected one of {sorted(_KIND_ROOTS)}")
    kind_id = [_KIND_ROOTS[kind]]

    ## 1. Keep only descriptions whose concept is not itself used as a
    ##    relationship typeId, then collect every concept below the root.
    rel_unique_list = df_relation['typeId'].unique()
    df_description = df_description[~df_description['conceptId'].isin(rel_unique_list)]
    is_a = df_relation[df_relation['typeId'] == _IS_A_TYPE_ID]
    sourceId = is_a.loc[is_a['destinationId'].isin(kind_id), 'sourceId'].values
    total_finding = finding(df_relation, sourceId, [sourceId])
    # finding() can return the same id at several levels, so de-duplicate.
    descendant_ids = np.unique(np.concatenate(total_finding, axis=0))

    ## 2. Descriptions of the selected concepts only.
    df_des_symptom = df_description[df_description['conceptId'].isin(descendant_ids)].copy()
    df_des_symptom['disorder'] = [int('(disorder)' in term) for term in df_des_symptom['term']]

    ## 3. Clean up the terms and label the description types.
    # regex=True is required: pandas >= 2.0 defaults to literal matching.
    df_des_symptom['term'] = df_des_symptom['term'].str.replace(r"[\\[].*?[\\]]", "", regex=True)
    df_des_symptom['term'] = df_des_symptom['term'].str.rstrip()
    df_des_symptom = df_des_symptom.drop(['active'], axis=1)
    # Assign the result back instead of a chained inplace replace, which is
    # not guaranteed to modify the frame.
    df_des_symptom['typeId'] = df_des_symptom['typeId'].replace(
        [_SYNONYM_TYPE_ID, _FSN_TYPE_ID], ['Synonym', 'FSN'])
    df_synonym = df_des_symptom[df_des_symptom['typeId'] == 'Synonym'].copy()
    df_synonym['word'] = [len(term.split(' ')) for term in df_synonym['term']]
    df_synonym = df_synonym[df_synonym['word'] < 7].copy()

    ## 4-5. FSN rows: attach the concept's synonyms (minus the FSN term itself).
    df_fsn = df_des_symptom[df_des_symptom['typeId'] == 'FSN'].copy()
    # One pass over the synonyms instead of one full scan per FSN row.
    synonyms_by_concept = _group_values(
        df_synonym['conceptId'].tolist(), df_synonym['term'].tolist())
    term_parts = [term.split('???') for term in df_fsn['term']]
    df_fsn['synonym'] = [
        set(synonyms_by_concept.get(concept_id, [])) - set(parts)
        for concept_id, parts in zip(df_fsn['conceptId'].tolist(), term_parts)
    ]
    df_fsn['word'] = [len(term.split(' ')) for term in df_fsn['term']]
    df_fsn['term'] = [' '.join(parts) for parts in term_parts]

    ## 6. Stack FSN rows and synonym rows; the first row per term wins.
    df_fsn['isFSN'] = '1'
    df_synonym['isFSN'] = '0'
    df_preprocessed_symptom = pd.concat([df_fsn, df_synonym], axis=0)
    df_preprocessed_symptom = df_preprocessed_symptom.drop_duplicates(['term'])

    ## 7-9. Finding site per concept, falling back to the "is a" parents for
    ##      concepts that have no finding-site relationship.
    finding_site_rel = df_relation[df_relation['typeId'] == _FINDING_SITE_TYPE_ID]
    sites_by_concept = _group_values(
        finding_site_rel['sourceId'].tolist(), finding_site_rel['destinationId'].tolist())
    parents_by_concept = _group_values(
        is_a['sourceId'].tolist(), is_a['destinationId'].tolist())
    first_term = _first_value(
        df_description['conceptId'].tolist(), df_description['term'].tolist())

    concept_ids = df_preprocessed_symptom['conceptId'].tolist()
    # The membership test is made per concept id. The earlier code tested the
    # whole id list, which was never a member, so finding sites were ignored.
    site_lists = [
        sites_by_concept[concept_id] if concept_id in sites_by_concept
        else parents_by_concept.get(concept_id, [])
        for concept_id in concept_ids
    ]

    ## 12. Only the first candidate is kept as the finding site.
    first_sites = [sites[0] if len(sites) != 0 else None for sites in site_lists]
    df_preprocessed_symptom['finding site'] = first_sites
    # First term in file order (deterministic); None when there is no site or
    # the site has no term in df_description.
    df_preprocessed_symptom['finding'] = [
        first_term.get(site) if site is not None else None for site in first_sites
    ]

    ## 13-14. Ancestor chain -> depth1..depth30 (depth1 = closest to the root).
    # The chain follows "is a" relationships only; following the first
    # relationship of any type could wander into attribute targets.
    chains = [_ancestor_chain(concept_id, parents_by_concept) for concept_id in concept_ids]
    term_chains = [[first_term[ancestor] for ancestor in chain] for chain in chains]
    depth_cols = ['depth' + str(level) for level in range(1, _MAX_DEPTH + 1)]
    for level, column in enumerate(depth_cols, start=1):
        # chain[0] is the direct parent, chain[-1] the ancestor just below root.
        df_preprocessed_symptom[column] = [
            terms[-level] if level <= len(terms) else None for terms in term_chains
        ]

    ## 15. Number of filled depth columns.
    df_preprocessed_symptom['depth'] = df_preprocessed_symptom[depth_cols].count(axis=1)

    df_preprocessed_symptom = df_preprocessed_symptom.fillna('0')

    # Ancestor ids come straight from the chain; mapping the term back to an
    # id is lossy because different concepts can share a term.
    for level in range(1, _MAX_DEPTH + 1):
        df_preprocessed_symptom['depth' + str(level) + 'Id'] = [
            chain[-level] if level <= len(chain) else '0' for chain in chains
        ]

    return df_preprocessed_symptom

In [None]:
%%time
# Takes about 5 minutes.
drug = preprocessor(df_description, df_relation, kind='drug1')
drug.to_csv("drug.csv")
del drug  # release the frame before the next cell builds its own

In [None]:
%%time

# Build the body-structure table, write it to CSV, then release the frame.
body_structure = preprocessor(df_description, df_relation, kind='body_structure')
body_structure.to_csv("body_structure.csv")
del body_structure

In [None]:
%%time
# Takes about 1 hour.
procedure = preprocessor(df_description, df_relation, kind='procedure')
procedure.to_csv("procedure.csv")
del procedure  # release the frame before the next cell builds its own

In [None]:
%%time

# Build the clinical-finding table, write it to CSV, then release the frame.
clinical_finding = preprocessor(df_description, df_relation, kind='clinical_finding')
clinical_finding.to_csv("clinical_finding.csv")
del clinical_finding