In [890]:
from utils.imputation import *
from utils.bpPercentile import *
from utils.percentilesStatage import *
import re

In [891]:
# Load the adjusted Family Start table; the path is relative to the
# notebook's working directory.
family_start = pd.read_csv(filepath_or_buffer='../data/adj_family_start.csv')

In [892]:
# The year 1-3 columns are named "...length"; rename them to "...height" so
# they share the 'height' stub used when the table is reshaped to long form.
family_start = family_start.rename(
    columns={f'c{year}ylength': f'c{year}yheight' for year in (1, 2, 3)})

In [893]:
# Renumber the rows from zero, then derive a 1-based 'id' column from the
# row position.
family_start = (
    family_start
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index + 1)
)

In [894]:
# Drop three BMI summary columns before reshaping (names suggest AUC
# summaries and a WHO z-score -- TODO confirm). The trailing expression
# displays the resulting (rows, columns) shape.
FS_time = family_start.drop(
    ['c0_3yBMIAUC', 'c0_5yBMIAUC', 'c5yBMIz_who'],
    axis='columns',
)
FS_time.shape

(997, 38)

In [895]:
# Reshape the wide table (one row per id, one column per measure/visit) into
# long form (one row per id and visit time).
df = FS_time

# Identify identifier and time-varying columns
identifier_cols = ['id']
time_varying_cols = [col for col in df.columns if col.startswith('c')]

# Rename columns from 'c<time>y<quantity>' to the 'quantity_time' format that
# wide_to_long expects, e.g. 'c5yheight' -> 'height_5'.
new_col_names = {}
pattern = re.compile(r'c(\d+)y(\w+)')

for col in time_varying_cols:
    match = pattern.match(col)
    if match:
        # group 1 is the visit time (digits), group 2 the measured quantity
        time, quantity = match.groups()
        new_col_names[col] = f"{quantity}_{time}"
    else:
        # Columns that start with 'c' but do not follow the pattern are
        # kept under their original name.
        new_col_names[col] = col

df_renamed = df.rename(columns=new_col_names)

# Use wide_to_long to reshape. `suffix` is a regular expression, so it is
# written as a raw string: '\d' in a normal string literal is an invalid
# escape sequence and raises a warning on current Python versions.
stubnames = ['age', 'height', 'SBP', 'DBP', 'bmi']
df_long = pd.wide_to_long(
    df_renamed,
    stubnames=stubnames,
    i='id',
    j='time',
    sep='_',
    suffix=r'\d+',
)

# Reset index to flatten the DataFrame ('id' and 'time' become columns)
df_long = df_long.reset_index()


In [904]:
# Split the long table into one frame per visit time.
# .copy() makes each subset an independent DataFrame: later cells add columns
# to these frames, and assigning into a boolean-mask slice of df_long would
# otherwise raise SettingWithCopyWarning.
FS_1 = df_long[df_long['time'] == 1].copy()
FS_2 = df_long[df_long['time'] == 2].copy()
FS_3 = df_long[df_long['time'] == 3].copy()
FS_5 = df_long[df_long['time'] == 5].copy()

# Generate Time-Based Datasets

## 1 Year

In [None]:
# Derived 1-year columns, each computed row by row with a helper function.
# The order below is the original order and is kept deliberately: later
# helpers may read columns produced by earlier ones (TODO confirm in utils).
one_year_steps = (
    ("Height Z-Score", findHeightZ),
    ("Expected SBP", expectedSBP),
    ("SBP Z Score", zscoreSBP),
    ("SBP Percentile", percentileSBP),
    ("Expected DBP", expectedDBP),
    ("DBP Z Score", zscoreDBP),
    ("DBP Percentile", percentileDBP),
)
for derived_col, row_func in one_year_steps:
    FS_1[derived_col] = FS_1.apply(row_func, axis=1)



In [None]:
# Feature matrix for the 1-year visit: remove the identifier, the visit time,
# the raw blood-pressure readings and every derived column. The label is
# built from the BP percentiles, so those must not remain as features.
# drop() already returns a new frame, so no separate .copy() is needed.
X1 = FS_1.drop(columns=[
    "id", "time",
    "SBP", "DBP",
    "Height Z-Score",
    "Expected SBP", "SBP Z Score", "SBP Percentile",
    "Expected DBP", "DBP Z Score", "DBP Percentile",
])

In [None]:
# Binary 1-year label: 1 when either the systolic or the diastolic percentile
# is strictly above the threshold, otherwise 0.
# NOTE(review): a missing (NaN) percentile compares as False, so rows without
# a BP percentile are labelled 0 -- confirm this is intended.
hypertension_threshold = 94.5

cond1 = FS_1["SBP Percentile"].gt(hypertension_threshold)
cond2 = FS_1["DBP Percentile"].gt(hypertension_threshold)
either_elevated = cond1 | cond2
Y1 = either_elevated.astype(int)

In [None]:
# Write the 1-year features and labels to CSV without the row index.
X1.to_csv(path_or_buf="../data/X1yr.csv", index=False)
Y1.to_csv(path_or_buf="../data/Y1yr.csv", index=False)

## 5 Year

In [906]:
# Height z-score for the 5-year visit, computed one row at a time.
FS_5["Height Z-Score"] = FS_5.apply(findHeightZ, axis="columns")

In [907]:
# Systolic columns for the 5-year visit. The original order is kept because
# later helpers may read columns produced by earlier ones (TODO confirm in utils).
sbp_steps = (
    ("Expected SBP", expectedSBP),
    ("SBP Z Score", zscoreSBP),
    ("SBP Percentile", percentileSBP),
)
for sbp_col, sbp_func in sbp_steps:
    FS_5[sbp_col] = FS_5.apply(sbp_func, axis=1)

In [908]:
# Diastolic columns for the 5-year visit, in the same order as the systolic
# ones: expected value, then z-score, then percentile.
FS_5["Expected DBP"] = FS_5.apply(expectedDBP, axis="columns")
FS_5["DBP Z Score"] = FS_5.apply(zscoreDBP, axis="columns")
FS_5["DBP Percentile"] = FS_5.apply(percentileDBP, axis="columns")

In [900]:
# Feature matrix for the 5-year visit: remove the identifier, the visit time,
# the raw blood-pressure readings and every derived column. The label is
# built from the BP percentiles, so those must not remain as features.
# drop() already returns a new frame, so no separate .copy() is needed.
X5 = FS_5.drop(columns=[
    "id", "time",
    "SBP", "DBP",
    "Height Z-Score",
    "Expected SBP", "SBP Z Score", "SBP Percentile",
    "Expected DBP", "DBP Z Score", "DBP Percentile",
])


In [901]:
# Binary 5-year label: 1 when either the systolic or the diastolic percentile
# is strictly above the threshold, otherwise 0.
# NOTE(review): a missing (NaN) percentile compares as False, so rows without
# a BP percentile are labelled 0 -- confirm this is intended.
hypertension_threshold = 94.5

cond1 = FS_5["SBP Percentile"].gt(hypertension_threshold)
cond2 = FS_5["DBP Percentile"].gt(hypertension_threshold)
either_elevated = cond1 | cond2
Y5 = either_elevated.astype(int)


### Convert columns to categorical

In [902]:
# Columns to hold as pandas 'category' dtype rather than numbers or strings.
# Note: CSV stores no dtype information, so this typing is not carried into
# a file written with to_csv; a reader of that file has to re-apply it.
columns_to_encode = [
    'Nethnic_mom',
    'csex',
    'everbfed',
    'gdm_report',
    'mblcvd',
    'mblhdis',
    'msmkhist',
    'priordiabp',
]
X5 = X5.astype({name: 'category' for name in columns_to_encode})

### Export Data Files


In [889]:
# Write the 5-year features and labels to CSV without the row index.
X5.to_csv(path_or_buf="../data/X5yr.csv", index=False)
Y5.to_csv(path_or_buf="../data/Y5yr.csv", index=False)