In [106]:
# import dependencies
import pandas as pd
import kardiasclean

In [107]:
# read the main-diagnosis CSV from the clean2 folder into a DataFrame
diagnosis_csv_path = "../../database/clean2/db_patient_diagnosis_main.csv"
df = pd.read_csv(diagnosis_csv_path)
df

Unnamed: 0,patient_id,diagnosis_main
0,0,Comunicacion interauricular secundum
1,1,Comunicacion interauricular secundum
2,2,Comunicacion interauricular secundum
3,3,Comunicacion interauricular secundum
4,4,Comunicacion interauricular secundum
...,...,...
1994,1035,Conducto arterioso permeable
1995,1037,Tetralogia de Fallot TOF -Sindrome de valvula ...
1996,1037,Estenosis de la arteria pulmonar
1997,1037,Comunicacion interventricular


In [108]:
# seed a "categories" column with the raw diagnosis text; the raw
# "diagnosis_main" column is kept untouched alongside it
df = df.assign(categories=df["diagnosis_main"])
df

Unnamed: 0,patient_id,diagnosis_main,categories
0,0,Comunicacion interauricular secundum,Comunicacion interauricular secundum
1,1,Comunicacion interauricular secundum,Comunicacion interauricular secundum
2,2,Comunicacion interauricular secundum,Comunicacion interauricular secundum
3,3,Comunicacion interauricular secundum,Comunicacion interauricular secundum
4,4,Comunicacion interauricular secundum,Comunicacion interauricular secundum
...,...,...,...
1994,1035,Conducto arterioso permeable,Conducto arterioso permeable
1995,1037,Tetralogia de Fallot TOF -Sindrome de valvula ...,Tetralogia de Fallot TOF -Sindrome de valvula ...
1996,1037,Estenosis de la arteria pulmonar,Estenosis de la arteria pulmonar
1997,1037,Comunicacion interventricular,Comunicacion interventricular


In [109]:
# frequency of every distinct label before grouping
# (value_counts tallies rows, so a patient with several rows is counted once per row)
raw_label_counts = df["categories"].value_counts()
raw_label_counts

Comunicacion interauricular secundum                                            127
Comunicacion interventricular membranosa                                        115
Persistencia de conducto arterioso                                              108
PCA                                                                              76
Tetralogia de Fallot estenosis pulmonar                                          47
                                                                               ... 
Conducto arterioso pequeno permeable                                              1
Antecedente de comunicacion interauricular pequena cerro de forma espontanea      1
Comunicacion interventricular perimembranosa de 3x5 mm                            1
PO de bandaje de AP                                                               1
Estenosis de la arteria pulmonar                                                  1
Name: categories, Length: 827, dtype: int64

In [110]:
# Mapping of reduced category label -> list of regex patterns.
#
# Each pattern is meant to be used as the search expression of a regex
# replace whose replacement text is the dictionary key.
#
# Notes on the pattern style:
#   * In ".*^X.*" the leading ".*" can only match the empty string before the
#     "^" anchor (no MULTILINE flag is involved), so these patterns behave as
#     "starts with X" and consume the rest of the line.
#   * Patterns without "^" (e.g. ".*[Cc]onducto.*") match X anywhere in a line.
#   * Patterns without any ".*" (e.g. "coartacion") only cover the matched
#     substring, not the whole line.
#   * Dictionaries keep insertion order, so when the categories are applied in
#     sequence an earlier category gets the first chance to claim a value.
reduced_diagnosis = {
    "PCA" : [
        ".*^[Pp]ersistencia[d]* de[l]* [cC]onducto [Aa]rterio[sr]o.*",
        ".*^Persistencia del ductus arterioso muy pequeno.*",
        ".*^PCA.*",
        ".*[Cc]onducto.*"
    ],
    "Tetralogia de Fallot" : [
        ".*^Tetralogia.*",
        ".*^Fallot.*",
        ".*^TOF.*",
        ".*Historia.*?Fallot"
    ],
    "CIV" : [
        ".*^[cC]omunicacion interventricular.*",
        ".*^[cC]omunicacion ventricular.*",
        ".*^[cC]omunicacion inte*rventricular.*",
        ".*^[cC]omun.*interv.*",
        ".*^CIV.*"
    ],
    "CIA" : [
        ".*^[cC]omunicacion interauricular.*",
        ".*^[cC]omunicacion auricular.*",
        ".*^[cC]omunicacion intera.*",
        ".*^[cC]omun.* intera.*",
        ".*^CIA.*"
    ],
    "CAV" :[
        ".*^Canal AV.*",
        ".*^Canal auriculo ventricular.*",
        ".*^Canal [aA].+r.*"
    ],
    "Corazon Univentricular" :[
        ".*Corazon.+[uU]niventricular.*",
        ".*^[vV]entriculo unico.*"
    ],
    "Estenosis" : [
        ".*^[eE]stenosis.*"
    ],
    "Coartacion Aortica" : [
        ".*^[cC]oartacion.*",
        "coartacion aortica",
        "coartacion"
    ],
    "Hipoplasia" : [
        ".*^Hipoplasia.*",
        ".*hipoplasico.*"
    ],
    "Atresia" : [
        ".*^[aA]tresia.*"
    ],
    "Conexion Anomala" : [
        ".*^[Cc]onexion.*"
    ],
    "Doble Salida VD" : [
        ".*^[dD]oble [Ss]alida.*",
        ".*^[dD]oble [Vv]ia de [Ss]alida.*"
    ],
    "Post-Surgical Procedure" : [
        # raw string: "\-" is not a valid Python string escape, so the plain
        # literal triggers an invalid-escape warning; the regex is unchanged
        r".*post\-.*",
        ".*[Hh]istoria.*"
    ]
}

In [111]:
# Rewrite each free-text diagnosis in "categories" to its reduced label.
#
# Every pattern of every category is applied in dictionary order, so a value
# already rewritten by an earlier category is only touched again if a later
# pattern happens to match the new text.
#
# regex=True is passed explicitly: older pandas versions defaulted to regex
# matching (with a FutureWarning), while pandas >= 2.0 defaults to literal
# matching, under which none of these patterns would ever match.
for category, patterns in reduced_diagnosis.items():
    for pattern in patterns:
        df['categories'] = df['categories'].str.replace(pattern, category, regex=True)

  after removing the cwd from sys.path.


In [112]:
# number of rows per label after the regex grouping, most frequent first
reduced_label_counts = df["categories"].value_counts()
reduced_label_counts

CIV                                                321
CIA                                                246
PCA                                                241
Estenosis                                          114
Tetralogia de Fallot                               100
                                                  ... 
Cardiomegalia                                        1
Insuficiencia pulmonar severa                        1
Tetralogia de Fallot con parche  no transanular      1
Aorta ascendente con hipoplasia moderada             1
Hipoplasia\nCoartacion Aortica                       1
Name: categories, Length: 345, dtype: int64

In [113]:
# Sanity check: list any category values that still mention "Historia" /
# "historia", i.e. rows the replacement patterns failed to absorb.
#
# The filter works on the strings themselves (str.contains) instead of the
# lists returned by str.findall: lists are unhashable, so value_counts() on
# them raises as soon as at least one row matches.
# na=False treats missing values as "no match".
historia_mask = df['categories'].str.contains("[Hh]istoria", regex=True, na=False)
check = df.loc[historia_mask, 'categories']
check.value_counts()

Series([], Name: categories, dtype: int64)

In [114]:
# drop rows with any missing value, then persist the categorised table
# as-is (low-frequency labels are NOT merged into an "others" bucket here)
df = df.dropna()
df.to_csv("../../database/clean3/main_diagnosis_categories.csv", index=False)

In [115]:
# Inspect how the category labels are distributed and preview a binned copy.
# The kardiasclean helpers are opaque from this notebook; the descriptions
# below are read off their names and the printed summary — TODO confirm
# against the kardiasclean documentation.

# quantile cut-off handed to both kardiasclean helpers below
THRESHOLD = 0.975
# presumably splits the labels into a low-frequency and a high-frequency group
low, high = kardiasclean.perform_frequency_split_quantile(df['categories'], THRESHOLD)
# NOTE(review): the split is unpacked as (low, high) but passed on as
# (high, low) — verify this matches evaluate_distribution's parameter order
print(kardiasclean.evaluate_distribution(high, low))

# work on a deep copy so the frame stored in `df` is not modified by the binning
check_df = df.copy(deep=True)
# presumably collapses low-frequency labels into a single bin — TODO confirm
check_df['categories'] = kardiasclean.perform_binning_quantile(check_df['categories'], THRESHOLD)
# export of the binned frame is currently disabled:
# check_df.to_csv("../../database/clean3/other_final_main_diag.csv", index=False)
check_df.head()

Total data (repeated): 1997
--------------------
Unique high frequency data: 9
Unique low frequency data: 336
--------------------
Total high frequency data: 1331
Total low frequency data: 666
--------------------
Percentage of high data: 66.65%
Percentage of low data: 33.35%
--------------------
Summary
--------------------
From 1997 data, 9 account for 66.65% of the total count.



Unnamed: 0,patient_id,diagnosis_main,categories
0,0,Comunicacion interauricular secundum,CIA
1,1,Comunicacion interauricular secundum,CIA
2,2,Comunicacion interauricular secundum,CIA
3,3,Comunicacion interauricular secundum,CIA
4,4,Comunicacion interauricular secundum,CIA
