# Final datasets preparation and Feature Engineering

In [1]:
import pandas as pd

In [2]:
# Source CSV for the analysis. NOTE(review): this is an absolute, machine-specific
# path; the notebook will only run where this exact location exists.
DATA_PATH = r"C:\Users\bedge\Desktop\NHANES\MergedDataR_new.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,id,persWt,examWt,psu,strata,gender,age,race,pregnancy_status,...,diastolicBP2,systolicBP3,diastolicBP3,urine_alb_creat,hdl_cholesterol,triglycerides,total_cholesterol,fasting.glucose,hba1c,hscrp
0,4,109266,7825.646112,8154.968193,2,168,2,29,5,2.0,...,55.0,99.0,52.0,15.28,56.0,,195.0,,5.2,0.72
1,5,109267,26379.991724,0.0,1,156,2,21,2,3.0,...,,,,,,,,,,
2,9,109271,8481.589837,8658.732873,1,167,1,49,3,,...,68.0,111.0,68.0,7.5,33.0,84.0,147.0,103.0,5.6,28.68
3,11,109273,20171.847767,22163.59685,1,155,1,36,3,,...,66.0,115.0,68.0,4.05,42.0,,164.0,,5.1,0.98
4,12,109274,7227.993241,7801.600218,2,167,1,68,5,,...,69.0,132.0,71.0,10.67,29.0,133.0,105.0,154.0,5.7,1.6


In [4]:
# (rows, columns) of the frame as loaded.
data.shape

(9232, 40)

In [5]:
# Summary statistics, transposed so that each described column becomes a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,9232.0,7799.091746,4488.309008,4.0,3919.25,7759.5,11683.25,15560.0
id,9232.0,117061.091746,4488.309008,109266.0,113181.25,117021.5,120945.25,124822.0
persWt,9232.0,26045.222372,30129.287779,1876.867755,9222.482331,15775.584026,29598.676751,338363.600192
examWt,9232.0,26045.222382,32251.59851,0.0,8453.811011,15583.537858,30028.847927,367555.742979
psu,9232.0,1.541919,0.543202,1.0,1.0,2.0,2.0,3.0
strata,9232.0,160.227253,6.958216,149.0,154.0,160.0,166.0,172.0
gender,9232.0,1.51484,0.499807,1.0,1.0,2.0,2.0,2.0
age,9232.0,51.144931,17.686659,20.0,36.0,52.0,65.0,80.0
race,9232.0,3.273289,1.19461,1.0,3.0,3.0,4.0,5.0
pregnancy_status,1874.0,2.051227,0.376202,1.0,2.0,2.0,2.0,3.0


# Feature Engineering 

In [6]:
# We use the National Cholesterol Education Program, Adult Treatment Panel III (NCEP ATP III) criteria.
# Citation: Grundy SM, Becker D. "Detection, evaluation, and treatment of high blood cholesterol in adults (Adult Treatment Panel III)." Circulation. 2002;106(25):3143–421.
# Define one function per criterion; each returns 1 if the criterion is met and 0 otherwise.
#criteria_1: Abdominal_obesity
def abdominal_obesity(row):
    """Flag abdominal obesity using sex-specific waist-circumference cut-offs.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row)
        Must provide 'gender' (1 = male, 2 = female) and 'waist_circumference'.

    Returns
    -------
    int
        1 when waist circumference is > 102 for gender 1 or > 88 for gender 2,
        otherwise 0. A missing (NaN) measurement fails both comparisons, so
        it yields 0 rather than a missing indicator.
    """
    # The whole condition sits inside one pair of parentheses so that it can
    # legally span several lines (a bare trailing `or` is a SyntaxError).
    if (
        (row['gender'] == 1 and row['waist_circumference'] > 102)
        or (row['gender'] == 2 and row['waist_circumference'] > 88)
    ):
        return 1
    else:
        return 0
#criteria_2: Triglycerides_high
def triglycerides_high(row):
    """Return 1 when the row's 'triglycerides' value is at least 150, else 0.

    A missing (NaN) value fails the comparison and therefore yields 0.
    """
    return 1 if row['triglycerides'] >= 150 else 0
#criteria_3: HDL_low
def HDL_low(row):
    """Flag low HDL cholesterol using sex-specific cut-offs.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row)
        Must provide 'gender' (1 = male, 2 = female) and 'hdl_cholesterol'.

    Returns
    -------
    int
        1 when HDL cholesterol is < 40 for gender 1 or < 50 for gender 2,
        otherwise 0. A missing (NaN) measurement fails both comparisons, so
        it yields 0 rather than a missing indicator.
    """
    # The whole condition sits inside one pair of parentheses so that it can
    # legally span several lines (a bare trailing `or` is a SyntaxError).
    if (
        (row['gender'] == 1 and row['hdl_cholesterol'] < 40)
        or (row['gender'] == 2 and row['hdl_cholesterol'] < 50)
    ):
        return 1
    else:
        return 0
#Criteria_4: High_BP
def High_BP(row):
    """Return 1 when the row meets the elevated blood-pressure criterion, else 0.

    The criterion is met when any of the three systolic readings is >= 130,
    any of the three diastolic readings is >= 85, or 'bp_medication' equals 1.
    Missing (NaN) readings fail their comparison and are simply skipped.
    """
    systolic_keys = ('systolicBP1', 'systolicBP2', 'systolicBP3')
    diastolic_keys = ('diastolicBP1', 'diastolicBP2', 'diastolicBP3')

    # Checks run lazily in the order: systolic, diastolic, medication.
    if any(row[key] >= 130 for key in systolic_keys):
        return 1
    if any(row[key] >= 85 for key in diastolic_keys):
        return 1
    return 1 if row['bp_medication'] == 1 else 0
#Criteria_5:Fasting_Glucose >=100
def abnormal_glucose(row):
    """Return 1 when the row meets the abnormal-glucose criterion, else 0.

    The criterion is met when 'fasting.glucose' is >= 100, or when either
    'antidiabetic_pills' or 'take_insulin' equals 1. A missing (NaN) glucose
    value fails its comparison, leaving only the two treatment fields.
    """
    if row['fasting.glucose'] >= 100:
        return 1
    if row['antidiabetic_pills'] == 1:
        return 1
    if row['take_insulin'] == 1:
        return 1
    return 0

# Derive one 0/1 indicator column per criterion by evaluating its function on every row.
# The mapping order fixes the order in which the new columns are appended.
criterion_functions = {
    'Abdominal_obesity': abdominal_obesity,
    'Triglycerides_high': triglycerides_high,
    'HDL_low': HDL_low,
    'High_BP': High_BP,
    'Abnormal_glucose': abnormal_glucose,
}
for column_name, criterion in criterion_functions.items():
    data[column_name] = data.apply(criterion, axis=1)


# Criteria defining metabolic syndrome: at least three of the five indicators above are met

def Metabolic_syndrome(row):
    """Return 1 when at least three of the five criterion indicators are set, else 0.

    Reads the 'Abdominal_obesity', 'Triglycerides_high', 'HDL_low', 'High_BP'
    and 'Abnormal_glucose' fields of the row and sums them.
    """
    indicator_columns = (
        'Abdominal_obesity',
        'Triglycerides_high',
        'HDL_low',
        'High_BP',
        'Abnormal_glucose',
    )
    criteria_met = sum(row[name] for name in indicator_columns)
    return 1 if criteria_met >= 3 else 0
    
# Build the target column: evaluate Metabolic_syndrome on each row (axis=1) of the frame.
data['metabolic_syndrome'] = data.apply(func=Metabolic_syndrome, axis=1)



In [7]:
# Re-check dimensions after feature engineering (five criterion columns plus the target were added).
data.shape

(9232, 46)

In [8]:
# Preview the first rows again, now including the derived indicator columns.
data.head()

Unnamed: 0.1,Unnamed: 0,id,persWt,examWt,psu,strata,gender,age,race,pregnancy_status,...,total_cholesterol,fasting.glucose,hba1c,hscrp,Abdominal_obesity,Triglycerides_high,HDL_low,High_BP,Abnormal_glucose,metabolic_syndrome
0,4,109266,7825.646112,8154.968193,2,168,2,29,5,2.0,...,195.0,,5.2,0.72,1,0,0,0,0,0
1,5,109267,26379.991724,0.0,1,156,2,21,2,3.0,...,,,,,0,0,0,0,0,0
2,9,109271,8481.589837,8658.732873,1,167,1,49,3,,...,147.0,103.0,5.6,28.68,1,0,1,0,1,1
3,11,109273,20171.847767,22163.59685,1,155,1,36,3,,...,164.0,,5.1,0.98,0,0,0,0,0,0
4,12,109274,7227.993241,7801.600218,2,167,1,68,5,,...,105.0,154.0,5.7,1.6,1,0,1,1,1,1


In [9]:
# Summary statistics after feature engineering, transposed so each described column is a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,9232.0,7799.091746,4488.309008,4.0,3919.25,7759.5,11683.25,15560.0
id,9232.0,117061.091746,4488.309008,109266.0,113181.25,117021.5,120945.25,124822.0
persWt,9232.0,26045.222372,30129.287779,1876.867755,9222.482331,15775.584026,29598.676751,338363.600192
examWt,9232.0,26045.222382,32251.59851,0.0,8453.811011,15583.537858,30028.847927,367555.742979
psu,9232.0,1.541919,0.543202,1.0,1.0,2.0,2.0,3.0
strata,9232.0,160.227253,6.958216,149.0,154.0,160.0,166.0,172.0
gender,9232.0,1.51484,0.499807,1.0,1.0,2.0,2.0,2.0
age,9232.0,51.144931,17.686659,20.0,36.0,52.0,65.0,80.0
race,9232.0,3.273289,1.19461,1.0,3.0,3.0,4.0,5.0
pregnancy_status,1874.0,2.051227,0.376202,1.0,2.0,2.0,2.0,3.0


In [10]:
# Count missing entries per column and list them from fewest to most.
missing_values = data.isna().sum()
print(missing_values.sort_values())

Unnamed: 0                           0
High_BP                              0
HDL_low                              0
Triglycerides_high                   0
Abdominal_obesity                    0
Abnormal_glucose                     0
diabetes_question                    0
chol_question                        0
bp_question                          0
health_insurance                     0
marital_status                       0
metabolic_syndrome                   0
id                                   0
race                                 0
persWt                               0
age                                  0
examWt                               0
gender                               0
psu                                  0
strata                               0
education                            0
food_security                      689
weight                             830
height                             837
monthly_poverty_level_category     849
bmi                      

In [11]:
# Dimensions before the pregnancy exclusion is applied.
data.shape

(9232, 46)

In [12]:
# Exclude pregnant participants (pregnancy_status == 1).
# Rows whose status is missing compare as "not equal to 1" and are therefore kept.
data = data.loc[data['pregnancy_status'].ne(1)]

In [13]:
# Dimensions after the pregnancy exclusion.
data.shape

(9145, 46)

In [14]:
# pregnancy_status has served its purpose as an exclusion filter; remove it from the output frame.
df = data.drop(columns='pregnancy_status')

In [15]:
# Drop the first column whose name contains 'Unnamed' (presumably a leftover index
# column from an earlier CSV export; TODO confirm).
# Guarded so that running this cell when no such column remains is a no-op instead of
# an IndexError, and written as a rebinding rather than an in-place mutation.
unnamed_columns = df.columns[df.columns.str.contains('Unnamed')]
if len(unnamed_columns) > 0:
    df = df.drop(columns=unnamed_columns[0])


In [16]:
# Dimensions of the final frame that is written to disk in the next cell.
df.shape

(9145, 44)

In [17]:
# Persist the prepared frame (with the target column) to the working directory.
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='merged_with_target.csv', index=False)
