# ETL Medical Records

Tokenize medical records.

Template Notebook using kardiasclean.

## Part 1: Split data

1. Load data
2. Split long strings into list of strings
3. Spread each list of strings into multiple rows, repeating the patient id on every row (new DataFrame)

In [1]:
import pandas as pd
from pathlib import Path
from getpass import getpass

import kardiasclean

# Read the cleaned patient export and key every row by its patient id.
records_path = Path("../database/data_clean1.csv")
df = pd.read_csv(records_path).set_index("patient_id")
df.head()

Unnamed: 0_level_0,gender,state,municipality,altitude,age,weight_kg,height_cm,appearance,diagnosis_general,cx_previous,diagnosis_main,date_birth,date_procedure,procedure,rachs,stay_days,expired
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,Estado de México,Huixquilucan,2726,3942,35.0,134.0,Normal,Ninguno,0,"Comunicación interauricular, secundum",2001-08-22,2012-04-08,"Reparación de CIA, parche",1.0,2.0,0
1,1,Estado de México,Timilpan,2741,3202,18.0,117.0,Desnutrido,Ninguno,0,"Comunicación interauricular, secundum",2003-09-19,2012-11-08,"Reparación CIA, parche",1.0,2.0,0
2,0,Ciudad de México,Coyoacán,2240,3147,22.0,120.0,Normal,Ninguno,0,"Comunicación interauricular, secundum",2003-11-21,2012-08-18,"Reparación CIA, parche",1.0,2.0,0
3,0,Estado de México,Nezahualcoyotl,2220,4005,42.0,147.0,Normal,Ninguno,0,"Comunicación interauricular, secundum",2001-10-07,2012-08-25,"Reparación CIA, parche",1.0,2.0,0
4,0,Ciudad de México,Alvaro Obregón,2373,5289,40.0,157.0,Normal,Ninguno,0,"Comunicación Interauricular, Secundum",1997-12-22,2012-01-09,"Reparación CIA, parche",1.0,3.0,0


In [2]:
# Turn each free-text diagnosis string into a list of individual diagnoses.
# The column is overwritten in place, so `df` now carries lists here.
diagnosis_lists = kardiasclean.split_string(df['diagnosis_main'])
df['diagnosis_main'] = diagnosis_lists
df['diagnosis_main']

patient_id
0                 [Comunicación interauricular, secundum]
1                 [Comunicación interauricular, secundum]
2                 [Comunicación interauricular, secundum]
3                 [Comunicación interauricular, secundum]
4                 [Comunicación Interauricular, Secundum]
                              ...                        
1032    [Transposición de grandes vasos, Dextrocardia,...
1033    [Comunicación interauricular tipo ostium secun...
1034    [Tetralogía de Fallot con síndrome de agenesia...
1035    [Ventrículo izquierdo borderline, Estenosis su...
1037    [Tetralogía de Fallot. (TOF) -Síndrome de válv...
Name: diagnosis_main, Length: 1003, dtype: object

In [3]:
# Spread the diagnosis lists into a new frame in which a patient id repeats
# once per diagnosis, then report how many distinct values each column has.
diagnosis_series = df['diagnosis_main']
spread_df = kardiasclean.spread_column(diagnosis_series)
unique_stats = kardiasclean.get_unique_stats(spread_df)
print(unique_stats)
spread_df.head()

                   patient_id  diagnosis_main
unique_count      1003.000000     1006.000000
percent_of_total     0.501751        0.503252
avg_per_record       1.993021        1.987078


Unnamed: 0,patient_id,diagnosis_main
0,0,"Comunicación interauricular, secundum"
1,1,"Comunicación interauricular, secundum"
2,2,"Comunicación interauricular, secundum"
3,3,"Comunicación interauricular, secundum"
4,4,"Comunicación Interauricular, Secundum"


## Part 2: Clean and Tokenize Strings

1. Remove accents
2. Remove Symbols with regex
3. Remove stopwords
4. Tokenize with soundex

In [4]:
# Remove accents so spellings that differ only by diacritics collapse
# into the same value (the unique count printed below should shrink).
accent_free = kardiasclean.clean_accents(spread_df['diagnosis_main'])
spread_df['diagnosis_main'] = accent_free
unique_stats = kardiasclean.get_unique_stats(spread_df)
print(unique_stats)
spread_df.head()

                   patient_id  diagnosis_main
unique_count      1003.000000      984.000000
percent_of_total     0.501751        0.492246
avg_per_record       1.993021        2.031504


Unnamed: 0,patient_id,diagnosis_main
0,0,"Comunicacion interauricular, secundum"
1,1,"Comunicacion interauricular, secundum"
2,2,"Comunicacion interauricular, secundum"
3,3,"Comunicacion interauricular, secundum"
4,4,"Comunicacion Interauricular, Secundum"


In [5]:
# Strip symbols (e.g. the commas visible in the previous output) from the
# diagnosis text, overwriting the column, and re-check the unique counts.
symbol_free = kardiasclean.clean_symbols(spread_df['diagnosis_main'])
spread_df['diagnosis_main'] = symbol_free
unique_stats = kardiasclean.get_unique_stats(spread_df)
print(unique_stats)
spread_df.head()

                   patient_id  diagnosis_main
unique_count      1003.000000      956.000000
percent_of_total     0.501751        0.478239
avg_per_record       1.993021        2.091004


Unnamed: 0,patient_id,diagnosis_main
0,0,Comunicacion interauricular secundum
1,1,Comunicacion interauricular secundum
2,2,Comunicacion interauricular secundum
3,3,Comunicacion interauricular secundum
4,4,Comunicacion Interauricular Secundum


In [6]:
# Store the stopword-filtered text in a separate `keywords` column;
# `diagnosis_main` itself is left as it was.
keyword_text = kardiasclean.clean_stopwords(spread_df['diagnosis_main'])
spread_df['keywords'] = keyword_text
unique_stats = kardiasclean.get_unique_stats(spread_df)
print(unique_stats)
spread_df.head()

                   patient_id  diagnosis_main    keywords
unique_count      1003.000000      956.000000  924.000000
percent_of_total     0.501751        0.478239    0.462231
avg_per_record       1.993021        2.091004    2.163420


Unnamed: 0,patient_id,diagnosis_main,keywords
0,0,Comunicacion interauricular secundum,Comunicacion interauricular secundum
1,1,Comunicacion interauricular secundum,Comunicacion interauricular secundum
2,2,Comunicacion interauricular secundum,Comunicacion interauricular secundum
3,3,Comunicacion interauricular secundum,Comunicacion interauricular secundum
4,4,Comunicacion Interauricular Secundum,Comunicacion Interauricular Secundum


In [7]:
# Encode the keywords into a token column; differently spelled variants of
# the same diagnosis are expected to end up with the same token.
token_codes = kardiasclean.clean_tokenize(spread_df['keywords'])
spread_df['token'] = token_codes
unique_stats = kardiasclean.get_unique_stats(spread_df)
print(unique_stats)
spread_df.head()

                   patient_id  diagnosis_main    keywords       token
unique_count      1003.000000      956.000000  924.000000  867.000000
percent_of_total     0.501751        0.478239    0.462231    0.433717
avg_per_record       1.993021        2.091004    2.163420    2.305652


Unnamed: 0,patient_id,diagnosis_main,keywords,token
0,0,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
1,1,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
2,2,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
3,3,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
4,4,Comunicacion Interauricular Secundum,Comunicacion Interauricular Secundum,KMNKSNNTRRKLRSKNTM


## Part 3: Get Unique List

1. Get Unique Values from the spread dataframe
2. Normalize the spread dataframe with the new unique list

In [8]:
# Build the list of unique tokens from the spread frame, then drop the
# patient_id and index columns, which are not part of the lookup table.
unique_rows = kardiasclean.create_unique_list(spread_df, spread_df['token'])
list_df = unique_rows.drop(columns=["patient_id", "index"])
list_df.head()

Unnamed: 0,diagnosis_main,keywords,token
0,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
1,Comunicacion interventricular Tipo 2 perimembr...,2 Comunicacion Tipo interventricular perimembr...,KMNKSNTPNTRFNTRKLRPRMMPRNS
2,Estenosis aortica subvalvular,Estenosis aortica subvalvular,ASTNSSRTKSPFLFLR
3,Comunicacion interventricular Tipo 1 infundibular,1 Comunicacion Tipo infundibular interventricular,KMNKSNTPNFNTPLRNTRFNTRKLR
4,Comunicacion interauricular Secundum,Comunicacion Secundum interauricular,KMNKSNSKNTMNTRRKLR


In [9]:
# Rewrite every diagnosis using the spelling that list_df holds for its
# token, so rows sharing a token also share one diagnosis_main value.
spread_df['diagnosis_main'] = kardiasclean.normalize_from_tokens(
    spread_df['token'], list_df['token'], list_df['diagnosis_main']
)
# Promote patient_id to the index only while it is still a column: the
# original unconditional set_index raised KeyError when the cell was run a
# second time, because patient_id had already been moved into the index.
if "patient_id" in spread_df.columns:
    spread_df = spread_df.set_index("patient_id")
spread_df.head()

Unnamed: 0_level_0,diagnosis_main,keywords,token
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
1,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
2,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
3,Comunicacion interauricular secundum,Comunicacion interauricular secundum,KMNKSNNTRRKLRSKNTM
4,Comunicacion interauricular secundum,Comunicacion Interauricular Secundum,KMNKSNNTRRKLRSKNTM


## Part 4: Store in SQL

1. NOTE: Create a database in Postgres first!
2. Rename columns if necessary.
3. Write the tables with pandas, replacing any existing table; no explicit schema (`CREATE TABLE ...`) is needed.

In [10]:

# Connection settings for the Postgres instance. The password is prompted
# for interactively so it is never saved in the notebook.
host = "kardias-test.cvj7xeynbmtt.us-east-1.rds.amazonaws.com"
password = getpass('Enter database password')
pgm = kardiasclean.PostgresManager(
    "kardias",
    password,
    host,
)

In [11]:
# STORE MAIN DATA
# Send the patient frame to the "patients" table and show the per-column
# counts of what came back.
# NOTE(review): diagnosis_main holds lists at this point (see Part 1) -
# confirm that is the intended storage format.
patients_written = pgm.create_table("patients", df)
patients_written.count()

patient_id           1003
gender               1003
state                1003
municipality         1003
altitude             1003
age                  1003
weight_kg            1003
height_cm            1003
appearance           1003
diagnosis_general    1003
cx_previous          1003
diagnosis_main       1003
date_birth           1003
date_procedure       1003
procedure            1003
rachs                1003
stay_days            1003
expired              1003
dtype: int64

In [15]:
# STORE LIST DATA
# Send the unique-token lookup to the "diagnosis_main" table.
# NOTE(review): a `list_df.set_index("token")` step used to sit here,
# commented out. The saved output lists `token` first, which suggests the
# frame was indexed by token when this cell last ran - confirm whether that
# step is still needed.
diagnosis_list_written = pgm.create_table("diagnosis_main", list_df)
diagnosis_list_written.count()

token             867
diagnosis_main    867
keywords          867
dtype: int64

In [16]:
# STORE SPREAD DATA
# Only the patient_id -> token mapping is stored; the readable text lives in
# the "diagnosis_main" lookup table, so those columns are dropped first.
# errors="ignore" keeps the cell re-runnable: spread_df is rebound here, so
# on a second run the columns are already gone and a plain drop would raise
# KeyError.
spread_df = spread_df.drop(columns=["diagnosis_main", "keywords"], errors="ignore")
pgm.create_table("diagnosis_main_map", spread_df).count()

patient_id    1999
token         1999
dtype: int64

## DONE!