In [None]:
import pandas as pd

# Read the cleaned, exam-level dataset.
df_clean = pd.read_csv("../../cl-fixed-exclusion-nan-drop.csv")

# Encode Chorionicity as a binary label: DCDA -> 0, MCDA -> 1.
# Any other value in the column is left exactly as it is.
for label, code in (("DCDA", 0), ("MCDA", 1)):
    df_clean.loc[df_clean["Chorionicity"] == label, "Chorionicity"] = code

# Encode the sex of both twins: M -> 0, F -> 1, Unknown -> 2.
# NOTE(review): a row is marked Unknown (2) in BOTH columns as soon as EITHER
# column is missing, so a recorded sex is overwritten when only the other
# twin's sex is blank -- confirm this is intended.
sex_columns = ["Sex", "Sex2"]
mask_blank = df_clean["Sex"].isnull() | df_clean["Sex2"].isnull()
for sex_col in sex_columns:
    df_clean.loc[mask_blank, sex_col] = 2
for sex_col in sex_columns:
    for label, code in (("M", 0), ("F", 1)):
        df_clean.loc[df_clean[sex_col] == label, sex_col] = code

# Columns removed before feature engineering.
drop_list = [
    "MRN-ANON", "ExamNoInPreg", "NUS-ANON", "MRNYEAR-ANON",
    "CongenitalAnomaliesNirEmpty0", "Any_IUFD", "IsCerclage",
    "Exam Started Date", "Year", "Nobtv", "DeliveryDate", "CS",
    "BirthWeight", "HadlockCentileLess10p", "Apgar5Less7",
    "isResuscitation", "IsNICU",
    "BirthWeight2", "Apgar5Less72", "isResuscitation2", "IsNICU2",
    "GAatDelivery", "BMICurrent",
]
df_clean = df_clean.drop(labels=drop_list, axis="columns")

def nan_in_ds(df):
    """Print the number of missing values in every column of ``df``.

    The report is a header line followed by one ``<column>:<count>`` line per
    column, in column order.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The report is written to stdout only.
    """
    print("Number of NaN in each Column")
    # Count all columns in one vectorised pass. Formatting with an f-string
    # (rather than `label + ":"`) also works for non-string column labels, and
    # iterating the counts handles duplicated labels one line at a time.
    for col_name, n_missing in df.isna().sum().items():
        print(f"{col_name}:{n_missing}")
    return

# Report which Python types occur in each column; useful for spotting columns
# that mix types (e.g. strings alongside integer codes).
for col in df_clean.columns:
    type_counts = df_clean[col].apply(type).value_counts()
    print(type_counts)


# The lookup is hoisted out of the f-string: reusing the enclosing double quote
# inside a replacement field is a SyntaxError on Python versions before 3.12.
n_pregnancies = df_clean["PREGNANCY-ID-ANON"].nunique()
print(f"Number of Pregnancies in NaN drop Dataset: {n_pregnancies}, Number of Exams: {len(df_clean)}")

nan_in_ds(df_clean)




In [None]:
# EFW processing

import numpy as np

def estimated_fetal_weight(features, twin):
    """Estimate fetal weight with the Hadlock (1985) four-parameter formula.

        log10(EFW) = 1.3596 - 0.00386*AC*FL + 0.0064*HC
                     + 0.00061*BPD*AC + 0.0424*AC + 0.174*FL

    Args:
        features: object indexable by column name (for example a DataFrame row
            or a dict) that holds "BPD<twin>", "HC<twin>", "AC<twin>" and
            "FL<twin>".
        twin: suffix string selecting the fetus, e.g. "1" or "2".

    Returns:
        int: the estimate, truncated toward zero.

    NOTE(review): the published formula takes measurements in cm and yields
    grams -- confirm the dataset uses the same units.
    """
    bpd = features["BPD" + twin]
    hc = features["HC" + twin]
    ac = features["AC" + twin]
    fl = features["FL" + twin]
    # The AC*FL coefficient is 0.00386 in the published formula; the previous
    # value of 0.00396 under-estimated the weight.
    log10_efw = (
        1.3596
        - (0.00386 * ac * fl)
        + (0.0064 * hc)
        + (0.00061 * bpd * ac)
        + (0.0424 * ac)
        + (0.174 * fl)
    )
    return int(10 ** log10_efw)


# Work on an independent copy so the edits below do not also change df_clean.
df = df_clean.copy()

# Back-fill a missing EFW, but only on rows where all four biometry inputs of
# that twin are present.
for twin in ("1", "2"):
    efw_col = "EFW" + twin
    biometry_cols = [name + twin for name in ("BPD", "HC", "AC", "FL")]
    mask = df[efw_col].isna() & df[biometry_cols].notna().all(axis=1)
    # Skip when no row qualifies: a row-wise apply on an empty selection
    # returns a DataFrame rather than a Series, which cannot be assigned to a
    # single column.
    if mask.any():
        df.loc[mask, efw_col] = df.loc[mask].apply(
            lambda row: estimated_fetal_weight(row, twin), axis=1
        )

# Drop the exams that have no CL measurement (the original note counts 42).
# dropna returns a new frame, so the result has to be assigned back; the
# previous bare call discarded it and dropped nothing.
df = df.dropna(subset=["CL"])


nan_in_ds(df)



In [None]:
# Aggregate Data
#
# Derive per-pregnancy CL summary features and broadcast them onto every exam
# row of that pregnancy, so the later group-by can collapse them. The values
# are computed with group-wise transforms rather than a Python loop over
# pregnancies, which rebuilt full-length boolean masks for every pregnancy.
# Assumes every exam row has a non-null PREGNANCY-ID-ANON and that the row
# index is unique -- TODO confirm.

preg_id = df["PREGNANCY-ID-ANON"]
exam_cl = df["CL"]
exam_ga = df["GAatExam"]

# --- Min/Max dataset -------------------------------------------------------
cl_by_preg = exam_cl.groupby(preg_id)
preg_cl_max = cl_by_preg.transform("max")
preg_cl_min = cl_by_preg.transform("min")

# GA of the exam at which the extreme CL was measured. When several exams
# share the extreme CL, the latest GA is used for the max and the earliest GA
# for the min.
preg_ga_at_cl_max = exam_ga.where(exam_cl == preg_cl_max).groupby(preg_id).transform("max")
preg_ga_at_cl_min = exam_ga.where(exam_cl == preg_cl_min).groupby(preg_id).transform("min")

preg_cl_diff = preg_cl_max - preg_cl_min
preg_ga_diff = preg_ga_at_cl_max - preg_ga_at_cl_min

# Number of exam rows belonging to each row's pregnancy.
exams_per_preg = preg_id.map(preg_id.value_counts())

df["CL_max"] = preg_cl_max
df["CL_min"] = preg_cl_min
df["GA_max"] = preg_ga_at_cl_max
df["GA_min"] = preg_ga_at_cl_min
df["Min_Max_CL_Diff"] = preg_cl_diff
df["Min_Max_GA_Diff"] = preg_ga_diff
df["Single_Measurement"] = (exams_per_preg == 1).astype("int64")  # 0 if no 1 if yes

# Slope between the min-CL and max-CL exams. It is 0.0 for single-exam
# pregnancies and when both extremes fall on the same GA (avoids dividing by
# zero). A missing GA difference propagates as NaN.
has_slope = (exams_per_preg > 1) & (preg_ga_diff != 0)
df["CL_slope"] = (preg_cl_diff / preg_ga_diff).where(has_slope, 0.0)

# --- Binning dataset -------------------------------------------------------
# Each bin is open on the left and closed on the right, so "cl-ga<16" also
# includes exams at exactly 16.
cl_bins = {
    "cl-ga<16": exam_ga <= 16,
    "cl-ga16-18": (exam_ga > 16) & (exam_ga <= 18),
    "cl-ga18-20": (exam_ga > 18) & (exam_ga <= 20),
    "cl-ga20-22": (exam_ga > 20) & (exam_ga <= 22),
    "cl-ga22-24": (exam_ga > 22) & (exam_ga <= 24),
    "cl-ga24-28": (exam_ga > 24) & (exam_ga <= 28),
    "cl-ga>28": exam_ga > 28,
}
for bin_col, in_bin in cl_bins.items():
    # Mean CL of the pregnancy's exams that fall inside the bin. Rows outside
    # the bin keep the 0.0 placeholder.
    bin_mean = exam_cl.where(in_bin).groupby(preg_id).transform("mean")
    df[bin_col] = bin_mean.where(in_bin, 0.0)

    



In [None]:
# Index the exam table by pregnancy id; the group-by below collapses it to one
# row per pregnancy.
df.set_index("PREGNANCY-ID-ANON", inplace=True)

# One entry per output column, in output order:
# (column, aggregation across the pregnancy's exams, dtype after aggregation).
feature_spec = [
    ("CL", "mean", "float64"),
    ("CLwithFundalPressure", "mean", "float64"),
    ("CL_max", "mean", "float64"),
    ("CL_min", "mean", "float64"),
    ("GAatExam", "mean", "float64"),
    ("GA_max", "mean", "float64"),
    ("GA_min", "mean", "float64"),
    ("Min_Max_CL_Diff", "mean", "float64"),
    ("Min_Max_GA_Diff", "mean", "float64"),
    ("CL_slope", "mean", "float64"),
    ("EFW1", "mean", "float64"),
    ("EFW2", "mean", "float64"),
    ("BPD1", "mean", "float64"),
    ("BPD2", "mean", "float64"),
    ("HC1", "mean", "float64"),
    ("HC2", "mean", "float64"),
    ("AC1", "mean", "float64"),
    ("AC2", "mean", "float64"),
    ("FL1", "mean", "float64"),
    ("FL2", "mean", "float64"),
    ("Age", "max", "int64"),
    ("Sex", "max", "int64"),
    ("Sex2", "max", "int64"),
    ("BMIprePreg", "mean", "float64"),
    ("Chorionicity", "max", "int64"),
    ("NuliP", "max", "int64"),
    ("PastPTB", "max", "int64"),
    ("PastCSAny", "max", "int64"),
    ("GDM", "max", "int64"),
    ("AnyPGDM", "max", "int64"),
    ("Previa", "max", "int64"),
    ("AnyPETNir", "max", "int64"),
    ("AnyPETPIH", "max", "int64"),
    ("cl-ga<16", "max", "float64"),
    ("cl-ga16-18", "max", "float64"),
    ("cl-ga18-20", "max", "float64"),
    ("cl-ga20-22", "max", "float64"),
    ("cl-ga22-24", "max", "float64"),
    ("cl-ga24-28", "max", "float64"),
    ("cl-ga>28", "max", "float64"),
    ("Single_Measurement", "max", "int64"),
    ("PTB37", "max", "int64"),
    ("PTB34", "max", "int64"),
    ("PTB32", "max", "int64"),
    ("SpontaneousPTB", "max", "int64"),
]

aggregation_criteria = {name: [how] for name, how, _dtype in feature_spec}
dtype_criteria = {name: dtype for name, _how, dtype in feature_spec}

# Collapse to one row per pregnancy, flatten the (column, aggregation) header
# that the list-valued spec produces, then cast every column to its declared
# dtype (this sets all categorical variables to int).
df = (
    df.groupby("PREGNANCY-ID-ANON")
    .agg(aggregation_criteria)
    .droplevel(1, axis=1)
    .astype(dtype_criteria)
)

print(f"Number of Pregnancies in Aggregated Dataset {len(df)}")
nan_in_ds(df)

# Drop fundal pressure
df = df.drop(columns="CLwithFundalPressure")

# Fill missing pre-pregnancy BMI and fetal measurements with the column mean.
# NOTE(review): the means are taken over the whole table, and the output file
# name says this runs before the dataset split -- check for train/test leakage.
mean_imputed_cols = [
    "EFW1", "EFW2",
    "BPD1", "BPD2",
    "HC1", "HC2",
    "AC1", "AC2",
    "FL1", "FL2",
    "BMIprePreg",
]
df.fillna({name: df[name].mean() for name in mean_imputed_cols}, inplace=True)

print(f"Number of Pregnancies in Aggregated Dataset {len(df)}")
nan_in_ds(df)

df.to_csv("./preprocessed_data_before_dataset_split.csv")