In [1]:
import os
import pandas as pd
import numpy as np
# Folder holding the result CSV files of data order 202501.
path = "/groups/umcg-lifelines/prm03/projects/ov20_0110/dataset_order_202501/results"

# Show which result files are available in this folder.
result_files = os.listdir(path)
print(result_files)


['1c_q_1_results.csv', '1a_v_1_results.csv', '3a_v_1_results.csv', '2a_q_1_results.csv', '3b_q_1_results.csv', '2a_v_1_results.csv', '3a_q_2_results.csv', '3a_q_mini_results.csv', '1b_q_1_results.csv', '1a_q_1_results.csv', 'global_summary.csv', '2b_q_1_results.csv', '2a_q_2_results.csv', '3a_q_1_results.csv', '1a_q_2_results.csv']


In [11]:
def analyze_missing_values_and_uniques(df):
    """Display a value-count table for every column of ``df``.

    For ``project_pseudo_id`` only the number of unique IDs and the number of
    duplicated IDs are printed. Every other column gets a table with the
    columns ``Unique Value``, ``Count`` and ``Percentage`` (NaN is counted as
    its own value), rendered through the notebook's ``display``.

    NOTE: a later cell in this notebook defines a function with the same name
    but a different signature and return value; whichever definition ran last
    is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        The input frame, unchanged.
    """
    # Share of true NaN cells per column. It is computed but not shown: only
    # the header line below is printed.
    n_rows = df.shape[0]
    missing_values_percentage = (df.isna().sum() / n_rows) * 100
    print("Original missing values (%):")

    def _as_percentage(count, total):
        # Format one count as a share of the column total.
        try:
            return f"{(float(count) / total) * 100:.2f}%"
        except (TypeError, ValueError):
            return "N/A"

    for column in df.columns:
        print(f"\n--- Unique values for '{column}' ---")

        if column != 'project_pseudo_id':
            # Frequency table including string codes such as '$7' and NaN.
            table = df[column].value_counts(dropna=False).reset_index()
            table.columns = ['Unique Value', 'Count']
            grand_total = table['Count'].sum()
            table['Percentage'] = [
                _as_percentage(count, grand_total) for count in table['Count']
            ]
            display(table)
            continue

        # ID column: a uniqueness summary is more useful than a value table.
        print(f"Total unique IDs: {df[column].nunique()} (expected 1 per participant)")
        print(f"Duplicates found: {df[column].duplicated().sum()}")

    return df


In [3]:
###CVD OUTCOME 

In [1]:

# Broad CVD outcome derived from the follow-up questionnaire file.
file0 = "2a_q_1_results.csv"

df = pd.read_csv(f"{path}/{file0}")

# Follow-up items that together define the broad CVD outcome.
# Answers are compared as strings below: '1' is treated as "yes" and '$7' as
# a missing code (assumes these columns are read as text -- TODO confirm).
cvd_variables = [
    'cvd_followup_adu_q_1',
    'stroke_followup_adu_q_1',
    'heartattack_followup_adu_q_1',
    'claudication_followup_adu_q_1',
]

# Participant ID plus the CVD items.
variables_to_show = ['project_pseudo_id'] + cvd_variables

# Take an explicit copy: a new column is added below, and writing to a plain
# column slice of `df` triggers pandas' SettingWithCopyWarning.
D0 = df[variables_to_show].copy()

# Row-wise conditions as plain numpy boolean arrays.
has_cvd = (D0[cvd_variables] == '1').any(axis=1).to_numpy()
all_missing = (D0[cvd_variables] == '$7').all(axis=1).to_numpy()

# Object array so the outcome can hold both integers and the '$7' code.
result = np.empty(len(D0), dtype=object)
result[has_cvd] = 1                      # at least one item answered '1'
result[all_missing] = '$7'               # every item carries the '$7' code
result[(~has_cvd) & (~all_missing)] = 2  # everything else

D0['broad_cvd'] = result

# Keep only the participant ID and the derived outcome.
D0 = D0[['project_pseudo_id', 'broad_cvd']]

# Count cases
num_cases = int((D0['broad_cvd'] == 1).sum())
print(f"Number of broad CVD cases: {num_cases}")
D0

# aneurysm_diagnosis_adu_q_1	1	yes	ja
# 1	aneurysm_diagnosis_adu_q_1	2	no	nee
# 2	angioplasty_bypass_adu_q_1	1	yes	ja
# 3	angioplasty_bypass_adu_q_1	2	no	nee
# 4	claudication_followup_adu_q_1	1	yes	ja
# 5	claudication_followup_adu_q_1	2	no	nee

In [2]:
# Physical activity
file1 = "1a_q_2_results.csv"  # result file that holds the activity variable
df1 = pd.read_csv(path + "/" + file1)

# Participant ID plus the single activity variable used later on.
variables_to_show = ['project_pseudo_id', 'squash_perweek_adu_q_15']
D1 = df1.loc[:, variables_to_show]

In [3]:
# Smoking status
file2 = "1a_q_2_results.csv"  # result file that holds the smoking variables
df2 = pd.read_csv(path + "/" + file2)

# Participant ID plus the current-smoker and ex-smoker variables.
variables_to_show = [
    'project_pseudo_id',
    'current_smoker_adu_c_2',
    'ex_smoker_adu_c_2',
]
D2 = df2.loc[:, variables_to_show]

# Summarise missingness and value distributions of the selection.
D2_explained = analyze_missing_values_and_uniques(D2)

In [4]:
# Symptoms of depression and anxiety (MINI questionnaire items)
file3 = "1a_v_1_results.csv"
df3 = pd.read_csv(path + "/" + file3)

# Keep the rows of every MINI variant (v1, v2 and v3).
mini_variants = [f'1a_v_1_mini_18plus_v{version}' for version in (1, 2, 3)]
df3_mini = df3.loc[df3['variant_id'].isin(mini_variants)]

# Depression symptoms
depression_items = [
    # Depressed mood: have you been consistently depressed or down, most of
    # the day, nearly every day, for the past two weeks?
    'mini_a_adu_q_1',
    # Anhedonia: in the past two weeks, have you been much less interested in
    # most things or much less able to enjoy the things you used to enjoy
    # most of the time?
    'mini_a_adu_q_2',
    # Appetite changes: was your appetite decreased or increased nearly every
    # day? did your weight decrease or increase without trying intentionally?
    'mini_a_adu_q_3_a',
    # Sleep problems: did you have trouble sleeping nearly every night
    # (difficulty falling asleep, waking up in the middle of the night, early
    # morning wakening or sleeping excessively)?
    'mini_a_adu_q_3_b',
    # Psychomotor changes: did you talk or move more slowly than normal or
    # were you fidgety, restless or having trouble sitting still almost every
    # day?
    'mini_a_adu_q_3_c',
    # Fatigue: did you feel tired or without energy almost every day?
    'mini_a_adu_q_3_d',
    # Feelings of inadequacy: did you feel worthless or guilty almost every
    # day?
    'mini_a_adu_q_3_e',
    # Cognitive problems: did you have difficulty concentrating or making
    # decisions almost every day?
    'mini_a_adu_q_3_f',
    # Suicidal ideation: did you repeatedly consider hurting yourself, feel
    # suicidal, or wish that you were dead?
    'mini_a_adu_q_3_g',
]

# Anxiety symptoms
anxiety_items = [
    # have you worried excessively or been anxious about several problems of
    # daily life (problems at work, at home or in your close circle) over the
    # past 6 months?
    'mini_o_adu_q_1_a',
    # when you were anxious in the past 6 months, did you, most of the time,
    # feel restless, keyed up or on edge?
    'mini_o_adu_q_3_a',
    # ... feel tense?
    'mini_o_adu_q_3_b',
    # ... feel tired, weak or exhausted easily?
    'mini_o_adu_q_3_c',
    # ... have difficulty concentrating or find your mind going blank?
    'mini_o_adu_q_3_d',
    # ... feel irritable?
    'mini_o_adu_q_3_e',
    # ... have difficulty sleeping (difficulty falling asleep, waking up in
    # the middle of the night, early morning wakening or sleeping
    # excessively)?
    'mini_o_adu_q_3_f',
]

mini_cols = depression_items + anxiety_items

# Keep the participant ID next to the items for reference.
variables_to_show = ['project_pseudo_id', *mini_cols]
D3 = df3_mini[variables_to_show]

def clean_mini_data(df):
    """Return a copy of ``df`` with every ``mini_*`` column as nullable integers.

    Any value whose text starts with ``$`` (e.g. ``$4``, ``$5``, ``$6``,
    ``$7`` or a bare ``$``) is treated as a missing code and becomes ``<NA>``.
    Previously only ``$4``, ``$7`` and ``$`` were blanked, so other codes such
    as ``$6`` had their digit extracted and ended up as the valid answer 6.

    For all remaining values the first run of digits is extracted and
    converted to ``Int64``; values without digits (including NaN) become
    ``<NA>``. Columns whose name does not start with ``mini_`` are left as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing MINI questionnaire columns.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the input frame is not modified.
    """
    df = df.copy()
    mini_cols = [col for col in df.columns if col.startswith('mini_')]

    for col in mini_cols:
        as_text = df[col].astype(str)

        # '$'-prefixed entries are missing codes, not answers.
        is_missing_code = as_text.str.startswith('$', na=False)

        # First run of digits per cell (raw string: '\d' is a regex escape).
        digits = as_text.str.extract(r'(\d+)')[0].mask(is_missing_code)

        df[col] = pd.to_numeric(digits, errors='coerce').astype('Int64')

    return df

def analyze_missing_values_and_uniques(df, max_missing_pct=40):
    """Print and collect per-column missingness and value distributions.

    Columns whose share of NaN values exceeds ``max_missing_pct`` are skipped
    silently. For ``project_pseudo_id`` only the number of unique and
    duplicated IDs is reported. For every other column the value counts
    (NaN included) are printed, limited to values that make up at least 1% of
    the rows or that are one of the special codes ``$4``, ``$7``, ``$``.

    NOTE: this replaces the one-argument function of the same name defined in
    an earlier cell of this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    max_missing_pct : float, default 40
        Columns with a higher percentage of NaN values are left out.

    Returns
    -------
    pandas.DataFrame
        One row per reported column with ``variable``, ``missing_count`` and
        ``missing_pct``; ID rows also carry ``unique_count`` and
        ``duplicates``, other rows carry a ``value_counts`` dict.
    """
    special_codes = ('$4', '$7', '$')
    analysis_results = []

    for column in df.columns:
        series = df[column]

        # Calculate missing values
        missing_count = series.isna().sum()
        total_rows = len(series)
        missing_pct = (missing_count / total_rows) * 100

        # Skip if missingness is too high
        if missing_pct > max_missing_pct:
            continue

        print(f"\n--- {column} ---")
        print(f"Missing: {missing_count} ({missing_pct:.2f}%)")

        # Special handling for ID columns - just show summary
        if column == 'project_pseudo_id':
            num_unique = series.nunique()
            num_duplicates = series.duplicated().sum()
            print(f"Total unique IDs: {num_unique}")
            print(f"Duplicates found: {num_duplicates}")
            analysis_results.append({
                'variable': column,
                'missing_count': missing_count,
                'missing_pct': missing_pct,
                'unique_count': num_unique,
                'duplicates': num_duplicates,
            })
            continue

        # Get value counts for non-ID columns, ordered by value.
        value_counts = series.value_counts(dropna=False)
        try:
            value_counts = value_counts.sort_index()
        except TypeError:
            # Columns that mix numbers and strings (e.g. 1 and '$7') cannot be
            # ordered natively; fall back to ordering by the text form.
            value_counts = value_counts.sort_index(key=lambda idx: idx.astype(str))
        total = value_counts.sum()

        # Only report values that appear at least 1% of the time or are
        # special codes.
        reported = [
            (val, count)
            for val, count in value_counts.items()
            if (count / total) * 100 >= 1 or str(val) in special_codes
        ]

        print("Value distribution:")
        for val, count in reported:
            print(f"  {val}: {count} ({(count / total) * 100:.2f}%)")

        # Store results
        analysis_results.append({
            'variable': column,
            'missing_count': missing_count,
            'missing_pct': missing_pct,
            'value_counts': dict(reported),
        })

    return pd.DataFrame(analysis_results)

# Standardise the MINI answers, then summarise every column that has at most
# 50% missing values.
D3_clean = D3.pipe(clean_mini_data)
mini_stats = analyze_missing_values_and_uniques(D3_clean, 50)

In [5]:
# Medication: ATC code columns (used by a later cell to flag antidepressant use)
file4 = "1a_v_1_results.csv"
df4 = pd.read_csv(path + "/" + file4)

# Keep only the rows of the ATC-code variant.
df4 = df4.loc[df4['variant_id'] == '1a_v_1_atc_18plus_v1']

# atc_code_adu_c_1_01 ... atc_code_adu_c_1_32
atc_code_columns = [f'atc_code_adu_c_1_{slot:02d}' for slot in range(1, 33)]
variables_to_show = ['project_pseudo_id'] + atc_code_columns

# Restrict the frame to the participant ID and the ATC code columns.
D4 = df4[variables_to_show]



In [8]:
# ATC prefixes that count as antidepressant use.
# NOTE(review): the original comments called this list "strict N06A only" and
# labelled N06CA as psychostimulants. The list does include N06CA, and N06CA
# appears to be "antidepressants in combination with psycholeptics" in the
# ATC classification -- confirm, and drop it if only N06A is wanted.
antidepressant_atc_codes = [
    'N06A',
    'N06CA',
]

# All ATC code columns present in D4.
atc_slot_columns = [
    name for name in D4.columns if name.startswith('atc_code_adu_c_1_')
]

# Long format: one row per participant and ATC slot.
df_long = D4.melt(
    id_vars=['project_pseudo_id'],
    value_vars=atc_slot_columns,
    value_name='atc_code',
)

# Participants with at least one code starting with one of the prefixes
# (missing codes count as "no match").
matches_prefix = df_long['atc_code'].str.startswith(
    tuple(antidepressant_atc_codes), na=False
)
antidepressant_users = df_long.loc[matches_prefix, 'project_pseudo_id'].unique()

# One row per D4 row: ID plus a 0/1 antidepressant flag, ATC columns removed.
uses_antidepressant = D4['project_pseudo_id'].isin(antidepressant_users).astype(int)
D4_new = D4.drop(columns=atc_slot_columns).assign(Antidepressant=uses_antidepressant)

# Verify
print(f"Antidepressant users: {D4_new['Antidepressant'].sum()}/{len(D4_new)}")


D4_explained = analyze_missing_values_and_uniques(D4_new)


#MAYBE INCLUDE LIDA TOO?
# #Antidepressant
# file6 = "deaq_q_1_results.csv"  # Change to any file you want to inspect


# # List of variables you want to show
# variables_to_show = ['project_pseudo_id','lidas_treatment_adu_q_03_b', 'lidas_treatment_adu_q_03_a' ]


# # lidas_treatment_adu_q_03_a	1	none of these treatments	
# # 	lidas_treatment_adu_q_03_b	1	antidepressants	


# # Read only the specified columns
# D6 = pd.read_csv(f"{path}/{file6}", usecols=variables_to_show)

# # If you need to see the columns
# print(D6.columns)

# # Convert columns to numeric
# D6["lidas_treatment_adu_q_03_a"] = pd.to_numeric(D6["lidas_treatment_adu_q_03_a"], errors='coerce')
# D6["lidas_treatment_adu_q_03_b"] = pd.to_numeric(D6["lidas_treatment_adu_q_03_b"], errors='coerce')

# # Create the new variable 'Antidepressant'
# D6["Antidepressant"] = D6.apply(
#     lambda row: 1 if row["lidas_treatment_adu_q_03_b"] == 1 else (0 if row["lidas_treatment_adu_q_03_a"] == 1 else None),
#     axis=1
# )




Antidepressant users: 8213/86457

--- project_pseudo_id ---
Missing: 0 (0.00%)
Total unique IDs: 86457
Duplicates found: 0

--- Antidepressant ---
Missing: 0 (0.00%)
Value distribution:
  0: 78244 (90.50%)
  1: 8213 (9.50%)


In [6]:
# Diabetes and hypertension (self-reported presence / treatment variables)
file5 = "1a_q_1_results.csv"
df5 = pd.read_csv(path + "/" + file5)

# Participant ID plus the three comorbidity variables.
variables_to_show = [
    'project_pseudo_id',
    'diabetes_presence_adu_q_1',
    'hypertension_treatment_adu_q_1',
    'hypertension_presence_adu_q_1',
]
D5 = df5.loc[:, variables_to_show]

# Summarise missingness and value distributions of the selection.
D5_explained = analyze_missing_values_and_uniques(D5)

In [7]:
# Blood pressure
file6 = "1a_v_1_results.csv"
df6 = pd.read_csv(path + "/" + file6)

# Keep only the rows of the blood-pressure variant.
df6 = df6.loc[df6['variant_id'] == '1a_v_1_bp_8plus_v1']

# Participant ID plus the average diastolic and systolic values.
variables_to_show = [
    'project_pseudo_id',
    'bpavg_diastolic_all_m_1',
    'bpavg_systolic_all_m_1',
]
D6 = df6[variables_to_show]

In [13]:
# From here on `path` points at data order 202502; cells below read from this
# folder, cells above expect the 202501 folder.
path = "/groups/umcg-lifelines/prm03/projects/ov20_0110/dataset_order_202502/Results"

# Show which result files are available in this folder.
result_files = os.listdir(path)
print(result_files)

['deaq_q_1_results.csv', 'depq_q_1_results.csv', '1b_q_1_results.csv', '1a_v_2_results.csv', 'global_summary.csv', 'salt_v_1_results.csv', '3a_v_2_results.csv', '2b_q_1_results.csv', '2a_v_2_results.csv']


In [14]:
# Family history of CVD
file7 = "1b_q_1_results.csv"
df7 = pd.read_csv(path + "/" + file7)

# Items a-f for father, mother and siblings, in that order.
family_cvd_items = [
    f'cvd_{relative}_fam_q_1_{item}'
    for relative in ('father', 'mother', 'siblings')
    for item in 'abcdef'
]
variables_to_show = ['project_pseudo_id'] + family_cvd_items

# Restrict the frame to the participant ID and the family-history items.
D7 = df7.loc[:, variables_to_show]

In [15]:
# Childhood trauma (CTQ items)
file8 = "2b_q_1_results.csv"
df8 = pd.read_csv(path + "/" + file8)

# CTQ item numbers grouped by the subscale named in the column.
ctq_items_by_subscale = [
    ('emotionalabuse', ('08', '18')),
    ('emotionalneglect', ('07',)),
    ('physicalabuse', ('09', '11', '12', '15', '17')),
    ('physicalneglect', ('26',)),
    ('sexualabuse', ('20', '21', '23', '24', '27')),
]
ctq_columns = [
    f'ctq_{subscale}_adu_q_{number}'
    for subscale, numbers in ctq_items_by_subscale
    for number in numbers
]
variables_to_show = ['project_pseudo_id'] + ctq_columns

# Restrict the frame to the participant ID and the CTQ items.
D8 = df8.loc[:, variables_to_show]

  df8 = pd.read_csv(f"{path}/{file8}")


In [None]:
# # #biomarkers 
# file9 = "1a_v_2_results.csv"  # Change to any file you want to inspect
# df9 = pd.read_csv(f"{path}/{file9}")
# df9['variant_id'].unique()

In [8]:

# Biomarkers: HbA1c (blood cells variants)
file11 = "1a_v_2_results.csv"
df11 = pd.read_csv(path + "/" + file11)

# Keep the rows of both blood-cells variants (v1 and v2).
bloodcells_variants = [f'1a_v_2_bloodcells1_8plus_v{version}' for version in (1, 2)]
df11_ = df11.loc[df11['variant_id'].isin(bloodcells_variants)]

# Participant ID plus the HbA1c result.
variables_to_show = ['project_pseudo_id', 'hba1cconc_result_all_m_1']
D11 = df11_[variables_to_show]

In [9]:
# Biomarkers: lipids and glucose (blood plasma variants), plus age and gender
file9 = "1a_v_2_results.csv"
df9 = pd.read_csv(path + "/" + file9)

# Keep the rows of both blood-plasma variants (v1 and v2).
bloodplasma_variants = [f'1a_v_2_bloodplasma1_8plus_v{version}' for version in (1, 2)]
df9_ = df9.loc[df9['variant_id'].isin(bloodplasma_variants)]

# Participant ID, demographics and the plasma measurements.
variables_to_show = [
    'project_pseudo_id',
    'age',
    'gender',
    'cholesterol_result_all_m_1',
    'hdlchol_result_all_m_1',
    'ldlchol_result_all_m_1',
    'triglyceride_result_all_m_1',
    'glucose_result_all_m_1',
]
D9 = df9_[variables_to_show]

In [10]:
from functools import reduce
import pandas as pd

# Frames to combine, in merge order; each one carries 'project_pseudo_id'.
dfs = [D0, D1, D2, D3_clean, D4_new, D5, D6, D7, D8, D9, D11]

# Outer-join the frames one after another on the participant ID, so that a
# participant present in any frame is kept.
merged_df = dfs[0]
for next_frame in dfs[1:]:
    merged_df = pd.merge(merged_df, next_frame, on='project_pseudo_id', how='outer')

# Step 1: fill missing entries with NaN.
# NOTE(review): filling missing values with NaN looks like a no-op -- confirm
# whether a conversion of <NA> to NaN was actually intended here.
merged_df = merged_df.fillna(np.nan)

# Step 2: turn the listed '$' codes into NaN.
values_to_replace = ['$6', '$7', '$4', '$5']
merged_df = merged_df.replace(values_to_replace, np.nan)


In [27]:
# Keep only rows where gender, age and the broad CVD outcome are all present.
required_columns = ['gender', 'age', 'broad_cvd']
merged_df = merged_df.dropna(subset=required_columns)

In [28]:

# Codebook for the family-history items (shown for cvd_father_fam_q_1_a):
#   1 = yes
#   2 = no
#   3 = i do not know


def calculate_family_history(row):
    """Summarise one relative's CVD items (a-f) into a single flag.

    Answers are compared as strings.

    Parameters
    ----------
    row : pandas.Series
        The six item answers of one participant for one relative.

    Returns
    -------
    int or float
        1 if any item is '1' (yes); 0 if every item is '2' (no); NaN
        otherwise, i.e. when there is no "yes" and at least one item is
        missing or holds any other value such as '3' (I do not know).
    """
    if (row == '1').any():
        return 1
    elif (row == '2').all():
        return 0
    else:
        return np.nan


# Build one summary column per relative. The same rule applies to father,
# mother and siblings, so the function is defined once and reused.
for relative in ('father', 'mother', 'siblings'):
    cvd_columns = [f'cvd_{relative}_fam_q_1_{item}' for item in 'abcdef']
    summary_column = f'cvd_{relative}_family_history'

    merged_df[summary_column] = merged_df[cvd_columns].apply(
        calculate_family_history, axis=1
    )

    # Check the results (the label names the column that is actually shown).
    print(f"Value counts in '{summary_column}':")
    print(merged_df[summary_column].value_counts(dropna=False))



Value counts in 'cvd_father_family_history':
cvd_father_family_history
NaN    66367
1.0    31293
0.0     2498
Name: count, dtype: int64
Value counts in 'cvd_father_family_history':
cvd_mother_family_history
NaN                          79128
1.0                          17822
0.0                           3208
Name: count, dtype: int64
Value counts in 'cvd_siblings_family_history':
cvd_siblings_family_history
NaN                            92551
1.0                             7603
0.0                                4
Name: count, dtype: int64


In [29]:
# Define the function to calculate the final family history variable
def calculate_final_family_history(row):
    """Combine the sibling, mother and father flags into one family-history flag.

    Parameters
    ----------
    row : mapping
        Must provide ``cvd_siblings_family_history``,
        ``cvd_mother_family_history`` and ``cvd_father_family_history``.

    Returns
    -------
    int or float
        1 if any of the three flags equals 1; NaN if all three are missing;
        0 in every other case (all zero, or a mix of zero and missing).
    """
    flags = [
        row["cvd_siblings_family_history"],
        row["cvd_mother_family_history"],
        row["cvd_father_family_history"],
    ]

    # A single positive relative is enough.
    if any(flag == 1 for flag in flags):
        return 1

    # Nothing known about any relative.
    if all(pd.isna(flag) for flag in flags):
        return np.nan

    # No positive relative and at least one explicit "no".
    return 0

# Derive the combined flag from the three per-relative summary columns.
relative_history_columns = [
    "cvd_siblings_family_history",
    "cvd_mother_family_history",
    "cvd_father_family_history",
]
merged_df["family_history"] = merged_df[relative_history_columns].apply(
    calculate_final_family_history, axis=1
)

# Show how many participants fall into each category (NaN included).
family_history_counts = merged_df["family_history"].value_counts(dropna=False)
print("Value counts in 'family_history':")
print(family_history_counts)


Value counts in 'family_history':
family_history
NaN    54936
1.0    43087
0.0     2135
Name: count, dtype: int64


In [11]:
# Helper columns that are no longer needed once `family_history` exists:
# the three per-relative summary flags followed by the raw items a-f of
# father, mother and siblings.
family_relatives = ('father', 'mother', 'siblings')
columns_to_drop = (
    [f'cvd_{relative}_family_history' for relative in family_relatives]
    + [
        f'cvd_{relative}_fam_q_1_{item}'
        for relative in family_relatives
        for item in 'abcdef'
    ]
)

# errors="ignore" prevents a KeyError if a column is already missing, e.g.
# when this cell is run a second time. The original comment described this
# behaviour but the argument was never passed.
merged_df = merged_df.drop(columns=columns_to_drop, errors="ignore")