In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
# Load the diabetes dataset whose diag_1..diag_3 columns are processed below.
df = pd.read_csv("../data/diabetic_data_no_na_diag.csv")

# ICD-9 reference data. Only the length of idc9_chapters is inspected in this
# notebook; its structure is not otherwise relied upon here.
with open("../data/icd9Chapters.json", "r", encoding="utf-8") as f:
    idc9_chapters = json.load(f)
    
# idc9_hierarchy is used below as a list of dicts, one per ICD-9 code, with
# keys such as 'icd9', 'threedigit', 'major', 'descLong' and 'chapter'.
with open("../data/icd9Hierarchy.json", "r", encoding="utf-8") as f:
    idc9_hierarchy = json.load(f)

In [None]:
# Exploration cell. Requires the third-party `icd-mappings` package; the
# install command is kept commented out on the next line.
#!pip install icd-mappings
from icdmappings import Mapper
mapper = Mapper()
# Map the ICD-9 code '250' to its chapter and print the result.
code = mapper.map('250',source='icd9', target='chapter')
print(code)
# NOTE(review): a notebook cell only displays its last expression, so the two
# lines below produce no visible output; wrap them in display() if they matter.
len(idc9_chapters)
type(idc9_hierarchy)
idc9_hierarchy[0]

In [None]:
# Example of a single idc9_hierarchy entry, kept for reference (apparently the
# output of `idc9_hierarchy[0]` — TODO confirm).
'''
{'icd9': '0010',
 'descLong': 'Cholera due to vibrio cholerae',
 'descShort': 'Cholera d/t vib cholerae',
 'threedigit': '001',
 'major': 'Cholera',
 'subchapter': 'Intestinal Infectious Diseases',
 'chapter': 'Infectious And Parasitic Diseases'}
 '''

In [3]:
# Replace missing values in each diagnosis column with the sentinel code '0',
# which find_diag() reports as 'No diagnosis'.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df[diag_col] = df[diag_col].apply(
        lambda value: '0' if pd.isna(value) else value
    )

In [None]:
from sklearn.preprocessing import LabelEncoder
import json

# Collect every distinct diagnosis code across the three diagnosis columns.
diag_columns = ['diag_1', 'diag_2', 'diag_3']
unique_codes = pd.concat([df[col] for col in diag_columns]).unique()
print(unique_codes.shape)

# One encoder shared by all three columns, so a given code always maps to the
# same integer label regardless of the column it appears in.
le = LabelEncoder()
le.fit(unique_codes)

# Original code -> integer label lookup, reused later to key the embeddings.
code_to_label = dict(zip(le.classes_, le.transform(le.classes_)))

# NOTE(review): this overwrites the diag_* columns in place, so re-running the
# cell without reloading df would fit the encoder on already-encoded labels.
for col in diag_columns:
    df[col] = le.transform(df[col])

(916,)


In [34]:
import json

# Cast every label to a built-in int before serializing (the values come from
# LabelEncoder.transform — presumably NumPy integers, which json cannot dump).
code_to_label = {
    diag_code: int(label) for diag_code, label in code_to_label.items()
}

# Persist the code -> label mapping next to the other data artifacts.
with open("../data/diag_label_encoder.json", "w") as encoder_file:
    json.dump(code_to_label, encoder_file)

In [5]:
def _diag_description(entry, specific_key):
    '''Build the {'specific', 'general'} description dict for one hierarchy entry.'''
    general = entry['chapter']
    # The subchapter is optional; when present it prefixes the chapter name.
    if 'subchapter' in entry:
        general = entry['subchapter'] + '. ' + general
    return {'specific': entry[specific_key], 'general': general}


def find_diag(idc9_hierarchy,code):
    '''
    Look up the textual description of an ICD-9 code.

    Parameters
    ----------
    idc9_hierarchy : iterable of dict
        Hierarchy entries. Each entry must provide the keys 'icd9',
        'threedigit', 'major', 'descLong' and 'chapter'; 'subchapter' is
        optional.
    code : str or scalar
        Diagnosis code as it appears in the dataset, e.g. '250', '250.83',
        '8' or 'V27'. Missing values and the sentinel '0' mean "no diagnosis".

    Returns
    -------
    dict or str
        {'specific': ..., 'general': ...} when the code is found. For codes
        with a decimal part, 'specific' is the entry's 'descLong'; for codes
        without one it is the entry's 'major'. 'general' is
        '<subchapter>. <chapter>' (or just the chapter when the entry has no
        subchapter).
        'No diagnosis' for missing values or the sentinel code '0'.
        'Not found' (after printing the normalized code) when nothing matches.
    '''
    # Test for missing values before any comparison so NA-like scalars never
    # reach the string handling below.
    if pd.isna(code):
        return 'No diagnosis'

    code = str(code)
    if code == '0':
        return 'No diagnosis'

    # Zero-pad only the integer part to three characters. Padding based on the
    # length of the whole string would leave e.g. '38.9' unpadded and turn it
    # into '389' instead of '0389'.
    whole, _, fraction = code.partition('.')
    whole = whole.rjust(3, '0')

    if '.' in code:
        # Full code without the dot; the hierarchy may store it with an extra
        # trailing zero, so accept both spellings.
        lookup = whole + fraction.replace('.', '')
        candidates = (lookup, lookup + '0')
        for entry in idc9_hierarchy:
            if entry['icd9'] in candidates:
                return _diag_description(entry, 'descLong')
    else:
        # No decimal part: match on the category and report its major name.
        lookup = whole
        for entry in idc9_hierarchy:
            if entry['threedigit'] == lookup:
                return _diag_description(entry, 'major')

    print(lookup, 'Not found')
    return 'Not found'

In [6]:
# Quick check of find_diag() on a three-digit code without a decimal part.
find_diag(idc9_hierarchy,'250')

{'specific': 'Diabetes mellitus',
 'general': 'Diseases Of Other Endocrine Glands. Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders'}

In [7]:
import spacy
# Load the "en_core_sci_md" pipeline used to embed diagnosis descriptions.
# NOTE(review): this model is not bundled with spaCy (presumably a scispaCy
# model) and must be installed separately — confirm the install source.
nlp = spacy.load("en_core_sci_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [8]:
# Embed one sample description to confirm the type and size of the document
# vector returned by the pipeline.
doc = nlp("Diabetes mellitus")
embedding = doc.vector
print(type(embedding),embedding.shape)  # (200,)

<class 'numpy.ndarray'> (200,)


In [9]:
def create_embedding(nlp,desc,general_weight=0.3,specific_weight=0.7):
    '''
    Turn a diagnosis description into a single embedding vector.

    Parameters
    ----------
    nlp : callable
        Text pipeline; nlp(text) must return an object with a `.vector`
        attribute.
    desc : dict or str
        Either {'specific': str, 'general': str} as returned by find_diag()
        for a known code, or a plain string such as 'No diagnosis' or
        'Not found'.
    general_weight : float, optional
        Weight of the 'general' description vector (default 0.3).
    specific_weight : float, optional
        Weight of the 'specific' description vector (default 0.7).

    Returns
    -------
    The weighted sum of the two description vectors for a dict, or the vector
    of the string itself otherwise.
    '''
    # isinstance() also accepts dict subclasses (e.g. OrderedDict), which an
    # exact type comparison would wrongly pass to nlp() as if they were text.
    if isinstance(desc, dict):
        emb_specific = nlp(desc['specific']).vector
        emb_general = nlp(desc['general']).vector
        return general_weight * emb_general + specific_weight * emb_specific

    return nlp(desc).vector

In [10]:
# Map each integer label to the embedding of its diagnosis description:
# code -> description (find_diag) -> vector (create_embedding).
label_to_embed = {
    code_to_label[diag_code]: create_embedding(
        nlp, find_diag(idc9_hierarchy, diag_code)
    )
    for diag_code in unique_codes
}

In [27]:
# Stack the embeddings into one matrix with rows ordered by ascending label.
# A single vstack avoids re-copying the growing matrix on every iteration and
# no longer hard-codes the embedding width. The float64 cast keeps the dtype
# produced by the previous np.empty-based accumulation.
unique_emb = np.vstack(
    [label_to_embed[label] for label in sorted(label_to_embed)]
).astype(np.float64)

In [28]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Reduce the description embeddings to 8 dimensions with t-SNE.
# method='exact' is needed because the default Barnes-Hut approximation only
# supports fewer than 4 output components.
# random_state is fixed because t-SNE is stochastic: without a seed every run
# would write a different embedding matrix to disk.
# PCA stays imported as an alternative reducer but is not used here.
tsne = TSNE(n_components=8,method='exact',random_state=42)

red_emb = tsne.fit_transform(unique_emb)

In [30]:
# Save the reduced embedding matrix:
np.save("../data/diag_embeddings.npy", red_emb)
# To read the matrix back:
# red_emb = np.load("../data/diag_embeddings.npy")

In [31]:
# Write the dataframe (diag_1..diag_3 now hold integer labels when the
# encoding cell above has been run) without the index column.
df.to_csv('../data/diabetic_data_no_na_diag_cod.csv', index=False)