In [1]:
import os

import pandas as pd

DATA_DIR = "../data"

In [2]:
os.listdir(os.path.join(DATA_DIR, "raw"))

['all_files',
 'full',
 'hfpef_confirmed_HF.csv',
 'hfref_confirmed_HF.csv',
 'lvef_pred.csv',
 'mortality_individual.csv',
 'patient_level_dp.csv',
 'surv_LVEF.csv']

In [3]:
df = pd.read_csv(os.path.join(DATA_DIR, "processed", "classification.csv"))

In [4]:
df.dtypes

patid                 int64
summary_Sym_DAR      object
summary_Sym_Ort      object
summary_Sym_TAR      object
summary_Sym_DWSU     object
                     ...   
summary_Blo_EGFR    float64
HF_type              object
death_2_Y           float64
death_5_Y           float64
death_10_Y          float64
Length: 82, dtype: object

In [7]:
healthy_days_in_db = 3000

df["high_risk"] = pd.Series()
df.loc[df["days_to_event"].lt(90) & df["death_patient"].eq(1), "high_risk"] = 1
df.loc[df["days_to_event"].gt(healthy_days_in_db) & df["death_patient"].eq(0),  "high_risk"] = 0


In [8]:
df.groupby("high_risk")["Phy_Age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
high_risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,13476.0,67.793108,10.769262,27.43,60.45,68.79,75.88,100.31
1,7093.0,83.253433,9.355876,46.03,77.8,84.55,89.98,105.73


In [10]:
df['age_bin'] = pd.cut(df['Phy_Age'], bins=[0,40,50, 55, 60, 65,70, 75, 80, 85, 90,100])

In [11]:
df.groupby("age_bin").size()

  df.groupby("age_bin").size()


age_bin
(0, 40]        125
(40, 50]      2607
(50, 55]      4002
(55, 60]      6482
(60, 65]     10346
(65, 70]     15164
(70, 75]     22086
(75, 80]     28806
(80, 85]     30594
(85, 90]     25052
(90, 100]    15448
dtype: int64

In [12]:
from sklearn.utils import resample

balanced = []

for b in df['age_bin'].unique():
    subset = df[df['age_bin'] == b]
    
    if subset['high_risk'].nunique() < 2:
        continue
    
    class0 = subset[subset['high_risk'] == 0]
    class1 = subset[subset['high_risk'] == 1]
    
    n = min(len(class0), len(class1))
    
    balanced.append(resample(class0, n_samples=n, random_state=42))
    balanced.append(resample(class1, n_samples=n, random_state=42))

df_balanced = pd.concat(balanced).drop(columns='age_bin')

In [20]:
df_balanced.groupby("high_risk")["Phy_Sex"].value_counts()

high_risk  Phy_Sex
0          Female     2232
           Male       1730
1          Male       2197
           Female     1765
Name: count, dtype: int64

In [22]:
(
    df_balanced.set_index("patid")
        .to_csv(os.path.join(DATA_DIR, "processed", "high_risk_balanced.csv"))
)