In [195]:
# import dependencies
import pandas as pd
import kardiasclean

In [196]:
# import csv
df = pd.read_csv("../../database/clean2/db_patient_diagnosis_general.csv")
df

Unnamed: 0,patient_id,diagnosis_general
0,0,Ninguno
1,1,Ninguno
2,2,Ninguno
3,3,Ninguno
4,4,Ninguno
...,...,...
1420,1035,Hidrocele bilateral
1421,1035,Hijo de madre con diabetes gestacional
1422,1035,Hiperbilirrubinemia\r\nmultifactorial remitida
1423,1037,Displasia broncopulmonar


In [197]:
# add column name "categories"
df ["categories_general"] = df ["diagnosis_general"]
df

Unnamed: 0,patient_id,diagnosis_general,categories_general
0,0,Ninguno,Ninguno
1,1,Ninguno,Ninguno
2,2,Ninguno,Ninguno
3,3,Ninguno,Ninguno
4,4,Ninguno,Ninguno
...,...,...,...
1420,1035,Hidrocele bilateral,Hidrocele bilateral
1421,1035,Hijo de madre con diabetes gestacional,Hijo de madre con diabetes gestacional
1422,1035,Hiperbilirrubinemia\r\nmultifactorial remitida,Hiperbilirrubinemia\r\nmultifactorial remitida
1423,1037,Displasia broncopulmonar,Displasia broncopulmonar


In [198]:
# check the number of patients of each general diagnosis
df["diagnosis_general"].value_counts()

Ninguno                             497
Trisomia 21                         109
Sindrome de Down                     44
Hipotiroidismo                       21
Hipotiroidismo congenito             18
                                   ... 
Ictericia secundaria a lactancia      1
Rinon en herradura                    1
PO de craneosinostosis                1
Talasemia menor                       1
Desnutricion leve por P/T             1
Name: diagnosis_general, Length: 578, dtype: int64

In [199]:
# dictionary of categories and regex patterns
reduced_diagnosis = {
    "Trisomia 21" : [
        ".*[Tt]risomia .*",
        ".*[Tt]isomia .*",
        ".*[Dd]own.*"
    ],
    "Hipotiroidismo" : [
        ".*[Hh]ipotiroidismo.*"
      ],
    "Paciente con Sindrome" : [
        ".*^[Ss]indrome [Dd]ismorfologico.*",
        ".*^[Ss]indroma [Dd]ismorfologico.*",
        ".*^[Ss]x [Dd]ismorfologico.*",
        ".*[Ss]indrome.*"
    ],
    "Epilepsia" : [
        ".*[Ee]pilepsia.*"
    ],
    "Ninguno" : [
        ".*[Nn]inguno.*"
    ],
    "Neumonia" : [
        ".*^[Nn]eumonia.*"
    ],
    "Paciente con Antecedentes" : [
        ".*^[Aa]ntecedente.*",
        ".*[Pp]aciente.*",
        ".*Antedecedente.*"
    ],
    "Desnutricion" : [
        ".*[Dd]esnutricion.*",
        ".*[pP]eso [Bb]ajo.*",
        ".*[Bb]ajo [pP]eso.*",
        ".*Peso y Talla baja.*"
    ],
    "Heterotaxia" : [
        ".*Heterotaxia.*"
    ],
    "Displasia" : [
        ".*Displasia.*"
    ],
    "Malformacion" : [
        ".*[mM]alformacion.*"
    ],
    "Madre con Antecedentes" : [
        ".*[mM]adre.*"
    ],
    "Hipertension" : [
        ".*[Hh]ipertension.*",
        ".*[hH][Pp].*",
        "HAP"
    ],
    "Funduplicatura de Nissen" : [
        ".*[nN]issen.*"
    ],
    "Retraso" : [
        ".*[rR]etraso.*"
    ],
    "Escoliosis" : [
        ".*[Ee]scoliosis.*"
    ],
    "Reflujo" :[
        ".*[Rr]eflujo.*",
        ".*ERGE.*"
    ],
    "Dermatitis" : [
        ".*[dD]ermatitis.*"
    ]
    }

In [200]:
# check the name of the rows and change it with the dictionary
for category in reduced_diagnosis :
    for pattern in reduced_diagnosis[category] :
        df['diagnosis_general'] = df['diagnosis_general'].str.replace(pattern, category)

  after removing the cwd from sys.path.


In [201]:
# verify the number of columns
df['diagnosis_general'].value_counts()

Ninguno                                           498
Trisomia 21                                       160
Paciente con Antecedentes                         101
Hipotiroidismo                                     46
Retraso                                            41
                                                 ... 
Restriccion de crecimiento intrauterino             1
Peso adecuado para la edad gestacional              1
Subclavia derecha aberrante                         1
paro cardiorespiratorio 10 min                      1
Hiperbilirrubinemia\r\nmultifactorial remitida      1
Name: diagnosis_general, Length: 389, dtype: int64

In [202]:
# variable to verify the number of rows repeated
check = df['diagnosis_general'].str.findall(".*[Ee]nfermedad.*")
check.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                                                                                                            1419
[Enfermedad de Hirschsprung]                                                                                     2
[Enfermedad renal cronica]                                                                                       1
[A descartar enfermedad fungica invasiva actualmente sin aislamiento en hemocultivos seriados para hongos]       1
Name: diagnosis_general, dtype: int64

In [203]:
# save csv for machine learning
df.to_csv("../../database/clean3/general_diagnosis_categories.csv", index=False)


In [204]:
# check the data and save final csv
THRESHOLD = 0.98
low, high = kardiasclean.perform_frequency_split_quantile(df['categories_general'], THRESHOLD)
print(kardiasclean.evaluate_distribution(high, low))

df['categories_general'] = kardiasclean.perform_binning_quantile(df['categories_general'], THRESHOLD)
df.to_csv("../../database/clean3/other_final_general_diag.csv", index=False)
df

Total data (repeated): 1423
--------------------
Unique high frequency data: 13
Unique low frequency data: 565
--------------------
Total high frequency data: 749
Total low frequency data: 674
--------------------
Percentage of high data: 52.64%
Percentage of low data: 47.36%
--------------------
Summary
--------------------
From 1423 data, 13 account for 52.64% of the total count.



Unnamed: 0,patient_id,diagnosis_general,categories_general
0,0,Ninguno,Ninguno
1,1,Ninguno,Ninguno
2,2,Ninguno,Ninguno
3,3,Ninguno,Ninguno
4,4,Ninguno,Ninguno
...,...,...,...
1420,1035,Hidrocele bilateral,Other
1421,1035,Madre con Antecedentes,Other
1422,1035,Hiperbilirrubinemia\r\nmultifactorial remitida,Other
1423,1037,Displasia,Other
