In [None]:
import pandas as pd
import numpy as np

# Load the pre-processed dataset and immediately discard the columns listed
# below, so they never reach any frame derived from df_full.
df_full = pd.read_csv("../pre_process/preprocessed_data_before_dataset_split.csv").drop(
    columns=[
        'NuliP', 'PastPTB', 'PastCSAny', 'GDM', 'AnyPGDM',
        'Previa', 'AnyPETNir', 'AnyPETPIH', "Sex", "Sex2",
    ]
)

In [None]:
# Feature name lists

# CL features, one column per GA bin (ordered from the earliest bin to the latest)
cl_bins_features = [
    'cl-ga<16',
    'cl-ga16-18',
    'cl-ga18-20',
    'cl-ga20-22',
    'cl-ga22-24',
    'cl-ga24-28',
    'cl-ga>28',
]

# Exclusion lists, derived from the full list so the bin names are spelled once.
# Every bin except 18-20:
cl_bins_features_except_18_20 = [name for name in cl_bins_features if name != 'cl-ga18-20']
# Only the two outermost bins (<16 and >28):
cl_bins_features_except_ga1628 = [cl_bins_features[0], cl_bins_features[-1]]
# The two earliest and the two latest bins (everything outside 18-24):
cl_bins_features_except_ga1824 = cl_bins_features[:2] + cl_bins_features[-2:]

# Global (non-binned) features
global_features = [
    "CL",
    "CL_max",
    "CL_min",
    "GA_max",
    "GA_min",
    "GAatExam",
    "Min_Max_CL_Diff",
    "Min_Max_GA_Diff",
    "CL_slope",
]

In [None]:
def nan_in_ds(df):
    """Print the number of missing values in every column of ``df``.

    Writes a header line followed by one ``<column>:<count>`` line per column,
    in column order.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The function only prints.
    """
    print("Number of NaN in each Column")
    # Count all columns in one pass. Formatting through an f-string (instead of
    # string concatenation) also works for column labels that are not strings,
    # e.g. integer labels.
    for col_name, n_missing in df.isna().sum().items():
        print(f"{col_name}:{n_missing}")
    return

def dataset_info(dataset):
    """Print a summary of ``dataset``: its columns, outcome counts and NaN counts.

    Printed, in order:
      * the list of column names;
      * how many rows have ``PTB37`` equal to 1 and equal to 0;
      * for each of ``PTB37``, ``PTB34`` and ``PTB32``: how many rows have both
        that column and ``SpontaneousPTB`` equal to 1, and how many rows have
        either of the two equal to 0;
      * the per-column NaN counts (via ``nan_in_ds``).

    Args:
        dataset: pandas DataFrame containing the columns ``PTB37``, ``PTB34``,
            ``PTB32`` and ``SpontaneousPTB``.

    Returns:
        None. The function only prints.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    print(f"Features in Dataset: {list(dataset.columns)}")

    # Column names use single quotes inside the f-string: reusing the outer
    # double quotes is a SyntaxError on Python versions before 3.12.
    print(
        f"Number if PTB < 37 is 1: {len(dataset.loc[dataset['PTB37'] == 1])}, "
        f"is 0: {len(dataset.loc[dataset['PTB37'] == 0])}"
    )

    # Same report for every threshold; only the PTB<weeks> column changes.
    for weeks in (37, 34, 32):
        ptb = dataset[f'PTB{weeks}']
        spontaneous = dataset['SpontaneousPTB']
        n_positive = len(dataset.loc[(ptb == 1) & (spontaneous == 1)])
        n_negative = len(dataset.loc[(ptb == 0) | (spontaneous == 0)])
        print(f"Number if sPTB < {weeks} is 1: {n_positive}, is 0: {n_negative}")

    nan_in_ds(dataset)

    return

In [None]:
# Global dataset: keeps the global features and removes every per-bin CL
# column as well as the Single_Measurement flag. Values are rounded to one decimal.
df_global = (
    df_full
    .drop(columns=cl_bins_features)
    .drop(columns=['Single_Measurement'])
    .round(decimals=1)
)

dataset_info(df_global)

df_global.to_csv("../datasets/Global_Dataset.csv")

In [None]:
# Creating the Single dataset
#
# Only the exam that is most common clinically is kept: a single CL
# measurement in the GA period of 18-20 wks. 'cl-ga18-20' is therefore the
# only CL column that survives; global features and all other bins are removed.
df_single = df_full.copy()

df_single.drop(columns=global_features, inplace=True)
df_single.drop(columns=['Single_Measurement'], inplace=True)
df_single.drop(columns=cl_bins_features_except_18_20, inplace=True)

# A value of 0 in the bin is treated as an empty bin (no measurement).
df_single['cl-ga18-20'] = df_single['cl-ga18-20'].replace(0, np.nan)

print(f"Number of empty bins in cl-ga-18-20: {df_single['cl-ga18-20'].isna().sum()}")

df_single = df_single.round(decimals=1)

# Summary before filtering: still includes the rows with an empty 18-20 bin.
dataset_info(df_single)

# Drop the rows that have no measurement in the 18-20 bin.
df_single.dropna(subset=['cl-ga18-20'], inplace=True)

# Summary after filtering, so the reported counts describe the rows that are
# actually written to disk.
dataset_info(df_single)

df_single.to_csv("../datasets/Single_Dataset.csv")



In [None]:
# Creating the GA 16-28 dataset: rows flagged as Single_Measurement are removed,
# only the five CL bins between GA 16 and 28 are kept, and gaps lying between
# two measured bins are filled by interpolation.
df_ga1628 = df_full.copy()

df_ga1628.drop(columns=global_features, inplace=True)

cl_bins = ['cl-ga16-18', 'cl-ga18-20', 'cl-ga20-22', 'cl-ga22-24', 'cl-ga24-28']

# Drop all single instances, then the flag itself and the bins outside 16-28.
df_ga1628 = (
    df_ga1628[df_ga1628['Single_Measurement'] != 1]
    .drop(columns=['Single_Measurement'])
    .drop(columns=cl_bins_features_except_ga1628)
)

# A value of 0 in a bin is treated as an empty bin.
df_ga1628[cl_bins] = df_ga1628[cl_bins].replace(0, np.nan)

# Keep rows with at least two non-empty bins.
df_ga1628.dropna(subset=cl_bins, thresh=2, inplace=True)

subset = df_ga1628[cl_bins]
print(subset.head())
# Label every bin with the midpoint of its GA range so the interpolation can
# use the real distance between bins (the 24-28 bin is wider than the others).
subset = subset.rename(
    columns={'cl-ga16-18': 17, 'cl-ga18-20': 19, 'cl-ga20-22': 21, 'cl-ga22-24': 23,
             'cl-ga24-28': 26})
# method='index' interpolates against the numeric column labels set above.
# method='linear' would ignore those labels and treat the bins as equally
# spaced, which misplaces values interpolated next to the 24-28 bin.
# limit_area='inside' only fills NaNs surrounded by valid values; leading and
# trailing gaps stay NaN.
subset = subset.interpolate(method='index', limit_area='inside', axis=1)
# Guard against negative lengths.
subset = subset.clip(lower=0)
print(subset.head())
# The column labels of `subset` are midpoints, not bin names, so write the
# values back by position (rows are in the same order as df_ga1628).
df_ga1628[cl_bins] = subset.to_numpy()
df_ga1628 = df_ga1628.round(decimals=1)
print(df_ga1628[cl_bins].head())

# Rows that still have an empty bin after interpolation are discarded.
df_ga1628.dropna(subset=cl_bins, inplace=True)

dataset_info(df_ga1628)

df_ga1628.to_csv("../datasets/GA1628_Dataset.csv")


In [None]:
# Creating the GA 18-24 dataset: rows flagged as Single_Measurement are removed,
# only the three CL bins between GA 18 and 24 are kept, and a gap lying between
# two measured bins is filled by interpolation.
cl_bins = ['cl-ga18-20', 'cl-ga20-22', 'cl-ga22-24']

df_ga1824 = df_full.drop(columns=global_features)

# Drop all single instances, then the flag itself and the bins outside 18-24.
df_ga1824 = (
    df_ga1824.loc[df_ga1824['Single_Measurement'] != 1]
    .drop(columns=['Single_Measurement'])
    .drop(columns=cl_bins_features_except_ga1824)
)

# A value of 0 in a bin is treated as an empty bin.
df_ga1824[cl_bins] = df_ga1824[cl_bins].replace(0, np.nan)
dataset_info(df_ga1824)

# Keep rows with at least two non-empty bins.
df_ga1824 = df_ga1824.dropna(subset=cl_bins, thresh=2)

subset = df_ga1824[cl_bins]
print(subset.head())
# Bins are relabelled with the midpoint of their GA range before interpolating
# along each row. limit_area='inside' only fills NaNs surrounded by valid values.
# NOTE(review): per the pandas docs method='linear' ignores these labels; the
# three midpoints are equally spaced, so that should make no difference here.
# `order` and `s` are passed through as before - presumably unused by
# method='linear'; confirm before removing.
subset = (
    subset
    .rename(columns={'cl-ga18-20': 19, 'cl-ga20-22': 21, 'cl-ga22-24': 23})
    .interpolate(method='linear', order=2, limit_area='inside', axis=1, s=2)
    .mask(lambda frame: frame < 0, 0)  # negative values are set to 0
)
print(subset.head())
df_ga1824[cl_bins] = subset
df_ga1824 = df_ga1824.round(decimals=1)
print(df_ga1824[cl_bins].head())

# Rows that still have an empty bin after interpolation are discarded.
df_ga1824 = df_ga1824.dropna(subset=cl_bins)

dataset_info(df_ga1824)

df_ga1824.to_csv("../datasets/GA1824_Dataset.csv")


