### Importación de librerías necesarias

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Lectura y procesamiento de datos del dataset

In [2]:
# Load the dataset from the CSV file and preview its first five rows.
DATASET_PATH = "gdm_first_trimester_ml_dataset.csv"
df_data = pd.read_csv(DATASET_PATH)
df_data.head()

Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,triglycerides_mmol_l,hdl_mmol_l,parity,family_history_t2d,previous_gdm,pcos,smoking_first_trimester,physical_activity_level,diet_score_0_100,label_gdm
0,26.8,26.9,119.0,57.0,77.7,8.7,4.1,5.33,7.8,1.42,1.37,1.36,0,0,1,0,1,2.0,62.0,0
1,22.6,27.3,,69.0,80.8,9.0,3.53,5.73,7.2,1.12,1.13,1.6,0,1,0,1,0,1.0,42.0,0
2,29.9,33.1,103.0,84.0,89.9,10.3,4.45,,4.3,0.86,1.27,1.16,0,1,0,0,0,1.0,72.0,0
3,26.3,23.6,112.0,69.0,83.3,12.2,4.79,4.69,13.7,2.91,1.22,1.73,0,0,0,0,0,0.0,81.0,0
4,31.9,31.0,117.0,69.0,84.9,8.9,,5.12,9.5,2.23,2.45,1.5,1,0,0,0,0,2.0,53.0,0


In [None]:
df_data.shape  # Read as: 1500 examples; 20 columns = 19 descriptors + 1 label column.

(1500, 20)

In [None]:
df_data["label_gdm"].value_counts()  # The classes are imbalanced: label 0 is far more frequent than label 1.

label_gdm
0    1239
1     261
Name: count, dtype: int64

In [9]:
# Show the dtype pandas inferred for every column.
df_data.dtypes

age_years                  float64
bmi_prepreg_kg_m2          float64
systolic_bp_mmHg           float64
diastolic_bp_mmHg          float64
map_mmHg                   float64
gestational_weeks          float64
fpg_mmol_l                 float64
hba1c_percent              float64
insulin_uIU_ml             float64
homa_ir                    float64
triglycerides_mmol_l       float64
hdl_mmol_l                 float64
parity                       int64
family_history_t2d           int64
previous_gdm                 int64
pcos                         int64
smoking_first_trimester      int64
physical_activity_level    float64
diet_score_0_100           float64
label_gdm                    int64
dtype: object

In [14]:
# 0/1 indicator matrix: 1 where the value is null (missing), 0 where it is present.
df_nulls = df_data.isna().astype(int)

# Summing each indicator column yields the number of nulls per descriptor in a
# single vectorized pass, so no per-column value_counts() lookup is needed and
# columns without any nulls naturally report 0.
null_counts = df_nulls.sum()
total_rows = len(df_nulls)

# One [descriptor, null count, non-null count] row per column, in column order.
null_data = [
    [column, n_null, total_rows - n_null]
    for column, n_null in null_counts.items()
]

df_summary_null = pd.DataFrame(data=null_data, columns=["descriptor", "count_null", "count_no_null"])
df_summary_null

Unnamed: 0,descriptor,count_null,count_no_null
0,age_years,0,1500
1,bmi_prepreg_kg_m2,0,1500
2,systolic_bp_mmHg,45,1455
3,diastolic_bp_mmHg,36,1464
4,map_mmHg,0,1500
5,gestational_weeks,0,1500
6,fpg_mmol_l,109,1391
7,hba1c_percent,162,1338
8,insulin_uIU_ml,108,1392
9,homa_ir,0,1500


### Estadística descriptiva (se excluyen las variables categóricas y la etiqueta)

In [17]:
# Columns skipped by the descriptive-statistics and outlier loops below:
# the integer-coded features and the target label.
columns_to_ignore = ["parity", "family_history_t2d", "previous_gdm", "pcos", "smoking_first_trimester", "label_gdm"]

def get_range_outlier(q1, q3, IQR, factor_value:float=1.5):
    """Return the (lower, upper) fences that delimit non-outlier values.

    The lower fence lies ``IQR * factor_value`` below ``q1`` and the upper
    fence lies the same distance above ``q3``.

    Args:
        q1: Lower anchor of the range (callers in this notebook pass the
            25th percentile).
        q3: Upper anchor of the range (callers in this notebook pass the
            75th percentile).
        IQR: Spread used to size the margin. It is taken as given; the
            function does not check that it equals ``q3 - q1``.
        factor_value: Multiplier applied to ``IQR``. Defaults to 1.5.

    Returns:
        A ``(min_value, max_value)`` tuple with the lower and upper fences.
    """
    margin = IQR * factor_value
    return q1 - margin, q3 + margin

In [20]:
# Collect one summary record per non-ignored column: mean, std, median,
# quartiles, IQR and the outlier fences derived from them.
statistical_descriptors = []

for column in df_data.columns:
    if column in columns_to_ignore:
        continue

    descriptive_values = df_data[column].describe()

    q1, q3 = descriptive_values["25%"], descriptive_values["75%"]
    IQR = q3 - q1
    min_value, max_value = get_range_outlier(q1, q3, IQR)

    statistical_descriptors.append(
        {
            "descriptor": column,
            "mean": descriptive_values["mean"],
            "std": descriptive_values["std"],
            "median": descriptive_values["50%"],
            "IQR": IQR,
            "25%": q1,
            "75%": q3,
            "min_value_for_outlier": min_value,
            "max_value_for_outlier": max_value,
        }
    )

In [21]:
# Turn the per-column summary records into a table (one row per descriptor).
df_statistical = pd.DataFrame(data=statistical_descriptors)
df_statistical

Unnamed: 0,descriptor,mean,std,median,IQR,25%,75%,min_value_for_outlier,max_value_for_outlier
0,age_years,29.1296,4.988332,29.1,6.925,25.6,32.525,15.2125,42.9125
1,bmi_prepreg_kg_m2,26.284738,5.00536,26.35,6.5,22.9,29.4,13.15,39.15
2,systolic_bp_mmHg,112.029553,12.254459,112.0,16.0,104.0,120.0,80.0,144.0
3,diastolic_bp_mmHg,70.535519,8.516566,70.0,11.0,65.0,76.0,48.5,92.5
4,map_mmHg,84.398244,7.215258,84.3,9.2,79.7,88.9,65.9,102.7
5,gestational_weeks,10.811667,1.611753,10.8,2.8,9.4,12.2,5.2,16.4
6,fpg_mmol_l,4.801589,0.977795,4.71,0.85,4.3,5.15,3.025,6.425
7,hba1c_percent,5.218923,0.376506,5.2,0.4,5.0,5.4,4.4,6.0
8,insulin_uIU_ml,10.99052,4.487567,10.2,4.825,8.075,12.9,0.8375,20.1375
9,homa_ir,2.394468,1.683579,2.12,1.08,1.64,2.72,0.02,4.34


#### Detección de outliers

In [23]:
def check_is_outlier(value, min_value, max_value):
    """Tell whether ``value`` falls strictly outside ``[min_value, max_value]``.

    Values equal to either bound are not outliers. A NaN ``value`` compares
    False against both bounds, so it is reported as not being an outlier.

    Args:
        value: The observation to test.
        min_value: Lower bound of the accepted range.
        max_value: Upper bound of the accepted range.

    Returns:
        True if ``value`` is below ``min_value`` or above ``max_value``,
        otherwise False.
    """
    return bool(value < min_value or value > max_value)

In [None]:
# Outlier fences computed earlier, indexed by descriptor name so each column's
# bounds can be looked up directly instead of filtering df_statistical and
# mutating the filtered copy with reset_index(inplace=True) on every iteration.
outlier_bounds = df_statistical.set_index("descriptor")

df_outliers = pd.DataFrame()

for column in df_data.columns:
    if column in columns_to_ignore:
        continue

    min_value = outlier_bounds.at[column, "min_value_for_outlier"]
    max_value = outlier_bounds.at[column, "max_value_for_outlier"]

    # Vectorized form of the per-value check: True where the value lies strictly
    # outside [min_value, max_value]. NaN compares False on both sides, so missing
    # values are flagged as non-outliers, exactly as in the element-wise check.
    df_outliers[column] = (df_data[column] < min_value) | (df_data[column] > max_value)

In [None]:
# Cast the outlier flags to integers so each cell holds 0 or 1.
outliers = df_outliers.astype(int)

def generate_df_counts(df, labels):
    # NOTE(review): as visible here the body only creates an empty DataFrame;
    # `df` and `labels` are never used and nothing is returned. The
    # implementation looks unfinished or truncated — confirm against the notebook.
    data = pd.DataFrame()



Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,triglycerides_mmol_l,hdl_mmol_l,physical_activity_level,diet_score_0_100
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0,1,1,0,0,0,1,1,1,1,1,0,0,0
1496,0,0,0,1,1,0,0,0,0,0,0,0,0,0
1497,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1498,0,0,0,0,0,0,0,0,0,0,1,0,0,0
