In [2]:
import pandas as pd
import json
import numpy as np

In [2]:
# Encounter-level diabetes dataset used throughout the notebook.
df = pd.read_csv(
    "IDSS-for-Diabetes-Readmission-Prediction/data/diabetic_data_no_na_2.csv"
)

# ICD-9 reference tables: the chapter list and the per-code hierarchy
# (each hierarchy entry is a dict, see the inspection cell below).
with open("icd9Chapters.json", mode="r", encoding="utf-8") as chapters_file:
    idc9_chapters = json.load(chapters_file)

with open("icd9Hierarchy.json", mode="r", encoding="utf-8") as hierarchy_file:
    idc9_hierarchy = json.load(hierarchy_file)

In [3]:
#!pip install icd-mappings
from icdmappings import Mapper

In [4]:
# Quick sanity check of the icd-mappings package: ask for the chapter
# of ICD-9 code 250 and print whatever the mapper returns.
mapper = Mapper()
code = mapper.map('250',source='icd9', target='chapter')
print(code)

3


In [5]:
# Exploratory peek at the reference tables. Only the last expression of a
# cell is displayed, so the first two lines produce no visible output.
len(idc9_chapters)
type(idc9_hierarchy)
idc9_hierarchy[0]

{'icd9': '0010',
 'descLong': 'Cholera due to vibrio cholerae',
 'descShort': 'Cholera d/t vib cholerae',
 'threedigit': '001',
 'major': 'Cholera',
 'subchapter': 'Intestinal Infectious Diseases',
 'chapter': 'Infectious And Parasitic Diseases'}

In [7]:
def _diag_description(entry, specific_key):
    """Build the {'specific', 'general'} description dict for one hierarchy entry."""
    general = entry['chapter']
    # Some entries have no subchapter; then the chapter alone is the general part.
    if 'subchapter' in entry:
        general = entry['subchapter'] + '. ' + general
    return {'specific': entry[specific_key], 'general': general}


def find_diag(idc9_hierarchy,code):
    """Look up the textual description of an ICD-9 diagnosis code.

    Parameters
    ----------
    idc9_hierarchy : iterable of dict
        Hierarchy entries with the keys 'icd9', 'descLong', 'threedigit',
        'major', 'chapter' and, optionally, 'subchapter'.
    code : str, number or missing value
        Diagnosis code as it appears in the dataset, e.g. '250', '250.1', '8'.

    Returns
    -------
    dict or str
        * ``{'specific': ..., 'general': ...}`` for the first matching entry.
          Codes with a decimal part are matched on the full 'icd9' code
          (also trying one trailing '0') and use 'descLong' as the specific
          text; codes without a decimal part are matched on 'threedigit' and
          use 'major'.
        * ``'No diag'`` when ``code`` is missing (``pd.isna``).
        * ``'Not found'`` when nothing matches (the code is also printed).
    """
    if pd.isna(code):
        return 'No diag'

    code = str(code)
    head, dot, tail = code.partition('.')

    # Left-pad the integer part to three characters. Padding only the part
    # before the dot matters: '38.1' must become '0381', not '381', which
    # would otherwise be confused with the unrelated code '381.0'.
    if 0 < len(head) < 3:
        head = head.rjust(3, '0')

    if dot:
        # Hierarchy codes carry no dot; also try one trailing zero so that
        # '250.1' can match the entry stored as '25010'.
        code = head + tail.replace('.', '')
        candidates = (code, code + '0')
        for entry in idc9_hierarchy:
            if entry['icd9'] in candidates:
                return _diag_description(entry, 'descLong')
    else:
        code = head
        for entry in idc9_hierarchy:
            if entry['threedigit'] == code:
                return _diag_description(entry, 'major')

    print(code, 'Not found')
    return 'Not found'

In [8]:
# Smoke test: a code with a decimal part is resolved through the full
# 'icd9' code and returns the long description.
find_diag(idc9_hierarchy,'250.1')

{'specific': 'Diabetes with ketoacidosis, type II or unspecified type, not stated as uncontrolled',
 'general': 'Diseases Of Other Endocrine Glands. Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders'}

In [9]:
# Attach a description (dict, 'No diag' or 'Not found') to each of the
# three diagnosis columns: diag_N -> desc_diag_N.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df['desc_' + diag_col] = df[diag_col].apply(
        lambda raw_code: find_diag(idc9_hierarchy, raw_code)
    )

In [10]:
# Spot-check the description produced for the row labelled 2.
df.loc[2, 'desc_diag_1']

{'specific': 'Secondary malignant neoplasm of respiratory and digestive systems',
 'general': 'Malignant Neoplasm Of Other And Unspecified Sites. Neoplasms'}

In [None]:
import spacy
# Load the spaCy pipeline used to embed diagnosis descriptions.
# NOTE(review): "en_core_sci_md" is presumably the scispaCy biomedical model
# and must be installed separately from spaCy itself -- confirm in the
# environment setup.
nlp = spacy.load("en_core_sci_md")

In [23]:
# Embed a sample phrase and check the type and dimensionality of the
# document vector produced by the pipeline.
doc = nlp("Diabetes mellitus")
embedding = doc.vector
print(type(embedding),embedding.shape)  # (200,)

<class 'numpy.ndarray'> (200,)


In [28]:
def create_embedding(nlp,desc,w_general=0.3,w_specific=0.7):
    '''
    Turn a diagnosis description into a single embedding vector.

    Parameters
    ----------
    nlp : callable
        Maps a string to an object exposing a ``.vector`` attribute
        (e.g. a loaded spaCy pipeline).
    desc : dict or str
        Either ``{'specific': str, 'general': str}`` or a plain string
        such as 'No diag'.
    w_general, w_specific : float, optional
        Weights of the general and specific embeddings in the blend.
        The defaults (0.3 / 0.7) reproduce the original hard-coded mix.

    Returns
    -------
    The weighted sum ``w_general * general + w_specific * specific`` for a
    dict description, otherwise the vector of the string itself.
    '''
    # isinstance (rather than an exact type comparison) so that dict
    # subclasses such as OrderedDict are also treated as structured input.
    if isinstance(desc, dict):
        emb_specific = nlp(desc['specific']).vector
        emb_general = nlp(desc['general']).vector
        return w_general * emb_general + w_specific * emb_specific

    # Plain strings (placeholder labels) are embedded as ordinary text.
    return nlp(desc).vector

In [29]:
# Embed the description of each diagnosis slot: desc_diag_N -> emb_daig_N.
# The 'daig' spelling of the output columns is kept on purpose, because
# later cells refer to these exact names.
for slot in (1, 2, 3):
    df[f'emb_daig_{slot}'] = df[f'desc_diag_{slot}'].apply(
        lambda description: create_embedding(nlp, description)
    )

In [30]:
# Persist the frame including the full-size embedding columns.
# NOTE(review): the description and embedding columns hold Python objects
# (dicts / arrays); CSV presumably stores them as their text representation,
# so they will need parsing when read back -- confirm this is acceptable.
df.to_csv('diabetic_data_embed_200.csv', index=False)

In [65]:
# Turn each embedding column into a 2-D matrix (one row per record), stack
# the three matrices on top of each other, then keep only distinct rows.
emb_blocks = [
    np.vstack(df[emb_col].values)
    for emb_col in ('emb_daig_1', 'emb_daig_2', 'emb_daig_3')
]
all_emb = np.vstack(emb_blocks)
unique_emb = np.unique(all_emb, axis=0)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Collect every diagnosis embedding from the three columns and keep each
# distinct vector once, so t-SNE is fitted on unique points only.
all_emb = np.vstack([np.vstack(df['emb_daig_1'].values), np.vstack(df['emb_daig_2'].values), np.vstack(df['emb_daig_3'].values)])
unique_emb = np.unique(all_emb, axis=0)

# Reduce the embeddings to 8 dimensions.
# - method='exact' is needed because the default Barnes-Hut approximation
#   only supports fewer than 4 output components.
# - random_state is fixed because t-SNE is stochastic; without a seed the
#   reduced coordinates change on every run.
tsne = TSNE(n_components=8, method='exact', random_state=42)

red_emb = tsne.fit_transform(unique_emb)

In [71]:
# Lookup table from a full-size embedding (as a hashable tuple) to its
# reduced counterpart; rows of unique_emb and red_emb are aligned by position.
map_emb = {
    tuple(full_vec): reduced_vec
    for full_vec, reduced_vec in zip(unique_emb, red_emb)
}

In [72]:
# Replace each full-size embedding by its reduced version through the
# lookup table: emb_daig_N -> red_emb_daig_N.
for slot in (1, 2, 3):
    df[f'red_emb_daig_{slot}'] = df[f'emb_daig_{slot}'].apply(tuple).map(map_emb)

In [73]:
# Persist the frame with both the full-size and the reduced embedding columns.
df.to_csv('diabetic_data_embed_200_and_8.csv', index=False)

In [77]:
# Keep only the reduced embeddings; the full-size columns are no longer needed.
df = df.drop(columns=['emb_daig_1', 'emb_daig_2', 'emb_daig_3'])

In [78]:
# Persist the frame that now carries only the reduced embedding columns.
df.to_csv('diabetic_data_embed_8.csv', index=False)