# Bahamas

In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load data
# data = pd.read_csv(r'C:\Users\georg\Documents\BPC project\steps_dataset\bahamas\bhs2011.csv') # windows george
data = pd.read_csv(r'../../steps_dataset/bahamas/bhs2011.csv') # mac ekaba

In [3]:
data.head()

Unnamed: 0,pid,i1,i3,i4,i6,c1,c2,c3,c4,c5,...,m16a,m16b,m16c,age,agerange,sex,psu,stratum,wstep1,wstep2
0,442,1,700,28-07-11,1,2,28-12-70,,12.0,5.0,...,86.0,91.0,95.0,40,35-44,Women,1,1,20.950375,20.950375
1,1126,1,790,25-06-11,1,2,11-10-73,,18.0,7.0,...,87.0,82.0,81.0,37,35-44,Women,1,1,41.900749,41.900749
2,143,1,2565,29-06-11,1,1,06-11-61,,15.0,7.0,...,75.0,87.0,75.0,49,45-54,Men,1,1,79.712131,79.712131
3,1584,2,4895,16-08-11,1,2,03-07-80,,16.0,7.0,...,77.0,91.0,95.0,31,25-34,Women,2,1,55.900454,55.900454
4,1585,2,4895,16-08-11,1,1,04-03-53,,16.0,7.0,...,75.0,79.0,76.0,58,55-64,Men,2,1,69.493028,69.493028


In [4]:
print('Basic Information:')
print(f'Number of rows: {len(data)}')
print(f'Number of columns: {data.shape[1]}')

Basic Information:
Number of rows: 1643
Number of columns: 161


In [5]:
# create final dataframe - this dataframe is to be appended to as we create the new features
cleaned_data = pd.DataFrame()

In [6]:
# append country
country = ['bahamas'] * data.shape[0]
country = pd.DataFrame({'country': country})
cleaned_data = pd.concat([cleaned_data, country], axis = 1)

In [7]:
# append sex - (category)
cleaned_data = pd.concat([cleaned_data, data['sex']], axis = 1)

In [8]:
# append age - (numeric)
cleaned_data = pd.concat([cleaned_data, data['age']], axis = 1)
cleaned_data['age'] = pd.to_numeric(cleaned_data['age'], errors='coerce')

In [9]:
# append years-at-school - (numeric)
cleaned_data = pd.concat([cleaned_data, data['c4']], axis = 1)
cleaned_data.rename(columns={'c4':'years-at-school'}, inplace=True)
cleaned_data['years-at-school'] = pd.to_numeric(cleaned_data['years-at-school'], errors='coerce')

In [10]:
# append level-of-education - (category)
cleaned_data = pd.concat([cleaned_data, data['c5']], axis = 1)
cleaned_data.rename(columns={'c5':'level-of-education'}, inplace=True)

In [11]:
# append marital-status - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['c7']], axis = 1)
    cleaned_data.rename(columns={'c7':'marital-status'}, inplace=True)
except KeyError:
    cleaned_data['marital-status'] = np.nan

In [12]:
# append work-status - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['c8']], axis = 1)
    cleaned_data.rename(columns={'c8':'work-status'}, inplace=True)
except KeyError:
    cleaned_data['work-status'] = np.nan

In [13]:
# append ppl-in-household - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['c9']], axis = 1)
    cleaned_data.rename(columns={'c9':'ppl-in-household'}, inplace=True)
    cleaned_data['ppl-in-household'] = pd.to_numeric(cleaned_data['ppl-in-household'], errors='coerce')
except KeyError:
    cleaned_data['ppl-in-household'] = np.nan

In [14]:
# append earnings per year - (numeric)
try:
    earnings_per_year = []
    for index, row in data.iterrows():
        if row['c10a'].is_integer():
            earnings_per_year.append(int(row['c10a']) * 52.1429) # weeks to year
        elif row['c10b'].is_integer():
            earnings_per_year.append(int(row['c10b']) * 12) # months to year
        elif row['c10c'].is_integer():
            earnings_per_year.append(int(row['c10c'])) # year
        else:
            earnings_per_year.append(np.nan)
    

    earnings_per_year = pd.DataFrame({'earnings-per-year': earnings_per_year})
    cleaned_data = pd.concat([cleaned_data, earnings_per_year], axis = 1)
    cleaned_data['earnings-per-year'] = pd.to_numeric(cleaned_data['earnings-per-year'], errors='coerce')
except KeyError:
    cleaned_data['earnings-per-year'] = np.nan

In [15]:
# append - Do you currently smoke tobacco products? - (category)
cleaned_data = pd.concat([cleaned_data, data['t1']], axis = 1)
cleaned_data.rename(columns={'t1':'currently-smoke-tobacco'}, inplace=True)

In [16]:
# append - How old were you when you first started smoking? - (numeric)
cleaned_data = pd.concat([cleaned_data, data['t3']], axis = 1)
cleaned_data.rename(columns={'t3':'age-started-smoking'}, inplace=True)
cleaned_data['age-started-smoking'] = pd.to_numeric(cleaned_data['age-started-smoking'], errors='coerce')

In [17]:
# append - Length of time smoking - (numeric)
try:
        length_time_smoking = []
        for index, row in data.iterrows():
            if row['t4c'].is_integer():
                if row['t4c'] == 77: # convert 77 (don't know) to np.nan
                    length_time_smoking.append(np.nan)
                else:
                    length_time_smoking.append(int(row['t4c']) * 52.1429) # weeks to year
            elif row['t4b'].is_integer():
                if row['t4b'] == 77: # convert 77 (don't know) to np.nan
                    length_time_smoking.append(np.nan)
                else:
                    length_time_smoking.append(int(row['t4b']) * 12) # months to year
            elif row['t4a'].is_integer():
                if row['t4a'] == 77: # convert 77 (don't know) to np.nan
                    length_time_smoking.append(np.nan)
                else:
                    length_time_smoking.append(int(row['t4a'])) # year
            else:
                length_time_smoking.append(np.nan)
        
        length_time_smoking = pd.DataFrame({'length-time-smoking': length_time_smoking})
        cleaned_data = pd.concat([cleaned_data, length_time_smoking], axis = 1)
        cleaned_data['length-time-smoking'] = pd.to_numeric(cleaned_data['length-time-smoking'], errors='coerce')
except KeyError:
    cleaned_data['length-time-smoking'] = np.nan

In [18]:
# append - Number of smoke/ smokeless per day (tobacco products) - (numeric)
# append - What type of tobacco do you use? - (category)
number_tobacco = []
type_tobacco = []

for index, row in data.iterrows():
    tobacco = []
    tobacco.append(int(row['t5a'])) if row['t5a'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5aw']) / 7, 0)) if row['t5aw'].is_integer() else  tobacco.append(0)
    tobacco.append(int(row['t5b'])) if row['t5b'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5bw']) / 7, 0)) if row['t5bw'].is_integer() else  tobacco.append(0)
    tobacco.append(int(row['t5c'])) if row['t5c'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5cw']) / 7, 0)) if row['t5cw'].is_integer() else  tobacco.append(0)
    tobacco.append(int(row['t5d'])) if row['t5d'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5dw']) / 7, 0)) if row['t5dw'].is_integer() else  tobacco.append(0)
    tobacco.append(int(row['t5e'])) if row['t5e'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5ew']) / 7, 0)) if row['t5ew'].is_integer() else  tobacco.append(0)
    # tobacco.append(int(row['t5f'])) if row['t5f'].is_integer() else  tobacco.append(0)
    # tobacco.append(round(int(row['t5fw']) / 7, 0)) if row['t5fw'].is_integer() else  tobacco.append(0)
    
    number_tobacco.append(max(tobacco)) if max(tobacco) != 0 else number_tobacco.append(np.nan)
    
    # type of tobacco
    index = tobacco.index(max(tobacco)) if max(tobacco) != 0 else 99
    
    if index == 0 or index == 1:
        type_tobacco.append('manufactured cigarettes')
    elif index == 2 or index == 3:
        type_tobacco.append('hand-rolled cigarettes')
    elif index == 4 or index == 5:
        type_tobacco.append('tobacco pipes')
    elif index == 6 or index == 7:
        type_tobacco.append('cigars, cheroots, cigarillos')
    elif index == 8 or index == 9:
        type_tobacco.append('shisha')
    elif index == 10 or index == 11:
        type_tobacco.append('other tobacco product')
    else:
        type_tobacco.append('none')
        
# append number of tobacco
number_tobacco = pd.DataFrame({'number-tobacco': number_tobacco})
cleaned_data = pd.concat([cleaned_data, number_tobacco], axis = 1)
cleaned_data['number-tobacco'] = pd.to_numeric(cleaned_data['number-tobacco'], errors='coerce')

# append type of robacco
type_tobacco = pd.DataFrame({'type-tobacco': type_tobacco})
cleaned_data = pd.concat([cleaned_data, type_tobacco], axis = 1)


In [19]:
# append - How old when you stopped smoking - (numeric)
age_stopped_smoking = []
try:
    for index, row in data.iterrows():    
        if row['t7'].is_integer():
            age_stopped_smoking.append(int(row['t7'])) if int(row['t7']) != 77 else age_stopped_smoking.append(np.nan)
        else:
            age_stopped_smoking.append(np.nan)
                
            
    age_stopped_smoking = pd.DataFrame({'age-stopped-smoking': age_stopped_smoking})
    cleaned_data = pd.concat([cleaned_data, age_stopped_smoking], axis = 1)
    cleaned_data['age-stopped-smoking'] = pd.to_numeric(cleaned_data['age-stopped-smoking'], errors='coerce')
except KeyError:
    cleaned_data['age-stopped-smoking'] = np.nan

In [20]:
# append - Smoke in home or workplace? - (category)
smoke_home_workplace = []
for index, row in data.iterrows():
    if row['t13'].is_integer() or row['t13'].is_integer():
        if int(row['t13']) == 1 or int(row['t14']) == 1:
            smoke_home_workplace.append(1)
        else:
            smoke_home_workplace.append(2)
    else:
        smoke_home_workplace.append(np.nan)
        
smoke_home_workplace = pd.DataFrame({'smoke-home-workplace': smoke_home_workplace})
cleaned_data = pd.concat([cleaned_data, smoke_home_workplace], axis = 1)

In [21]:
# append - How you ever consumed alcohol? - (category)
consumed_alcohol = []
for index, row in data.iterrows():
    a1a = a1b = 0
    if row['a1a'].is_integer():
        a1a = 0 if int(row['a1a']) == 2 else 1
        if row['a1b'].is_integer():
            a1b = 0 if int(row['a1b']) == 2 else 1
        
        out = int(a1a or a1b)
        consumed_alcohol.append(out)
    else:
        consumed_alcohol.append(np.nan)
        
consumed_alcohol = pd.DataFrame({'consumed-alcohol': consumed_alcohol})
cleaned_data = pd.concat([cleaned_data, consumed_alcohol], axis = 1)

In [22]:
# append - Have you stopped drinking due to health reasons? - (category)
try:
    quit_drinking_for_health = []
    for index, row in data.iterrows():
        if row['a3'].is_integer():
            quit_drinking_for_health.append(int(row['a3']))
        else:
            quit_drinking_for_health.append(np.nan)
            
    quit_drinking_for_health = pd.DataFrame({'quit-drinking-for-health': quit_drinking_for_health})
    cleaned_data = pd.concat([cleaned_data, quit_drinking_for_health], axis = 1)
except KeyError:
    cleaned_data['quit-drinking-for-health'] = np.nan

In [23]:
# append - How many alcoholic drinks do you consume per day? - (category)
number_alcoholic_drinks = []
for index, row in data.iterrows():
    if row['a2'].is_integer():
        number_alcoholic_drinks.append(int(row['a2']))
    else:
        number_alcoholic_drinks.append(np.nan)
        
number_alcoholic_drinks = pd.DataFrame({'number-alcoholic-drinks': number_alcoholic_drinks})
cleaned_data = pd.concat([cleaned_data, number_alcoholic_drinks], axis = 1)

In [24]:
# append - How many fruit/ vegetables do you eat per day? - (numeric)
number_daily_fruit_vegetables = []
for index, row in data.iterrows():
    fruit = 0
    vegetable = 0
    
    # fruit
    if row['d1'].is_integer():
        if row['d2'].is_integer():
            fruit = int(row['d1']) * int(row['d2'])
    else:
            fruit = 0
    
    # vegetables
    if row['d3'].is_integer():
        if row['d4'].is_integer():
            vegetable = int(row['d3']) * int(row['d4'])
    else:
            vegetable = 0
            
    number_daily_fruit_vegetables.append(fruit + vegetable)
        
number_daily_fruit_vegetables = pd.DataFrame({'number-daily-fruit-vegetables': number_daily_fruit_vegetables})
cleaned_data = pd.concat([cleaned_data, number_daily_fruit_vegetables], axis = 1)

In [25]:
# append - How much salt or salty sauce do you think you consume?  - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['d8']], axis = 1)
    cleaned_data.rename(columns={'d8':'salt-consumption'}, inplace=True)
except KeyError:
    cleaned_data['salt-consumption'] = np.nan

In [26]:
# append - How intense is your work [vigorous- intensity, moderate-intensity, none]? - (cateogry)
work_intensity = []
for index, row in data.iterrows():
    if row['p1'].is_integer() or row['p4'].is_integer():
        if row['p1'].is_integer() and int(row['p1']) == 1:
            work_intensity.append('vigorous-intensity')
        elif row['p4'].is_integer() and int(row['p4']) == 1 and int(row['p1']) != 1:
            work_intensity.append('moderate-intensity')
        else:
             work_intensity.append('none')
    else:
        work_intensity.append(np.nan)
        
work_intensity = pd.DataFrame({'work-intensity': work_intensity})
cleaned_data = pd.concat([cleaned_data, work_intensity], axis = 1)

In [27]:
# append - In a typical week, on how many days do you do moderate or vigorous-intensity activities as part of your work? - (numeric)
days_vigorous_exercise = []
cleaned_data = pd.concat([cleaned_data, data['p2']], axis = 1)
cleaned_data.rename(columns={'p2':'days-vigorous-exercise'}, inplace=True)
cleaned_data['days-vigorous-exercise'] = pd.to_numeric(cleaned_data['days-vigorous-exercise'], errors='coerce')

days_moderate_exercise = []
cleaned_data = pd.concat([cleaned_data, data['p5']], axis = 1)
cleaned_data.rename(columns={'p5':'days-moderate-exercise'}, inplace=True)
cleaned_data['days-moderate-exercise'] = pd.to_numeric(cleaned_data['days-moderate-exercise'], errors='coerce')

In [28]:
# append - How much time do you spend walking or bicycling for travel on a typical day? - (numeric)
time_walking_bicycling_minutes = []

for index, row in data.iterrows():
    if row['p9a'].is_integer() or row['p9b'].is_integer():
        hour_to_minute = minute = 0
         
        if row['p9a'].is_integer():
            hour_to_minute = int(row['p9a']) * 60
        elif row['p9b'].is_integer():
            minute = int(row['p9b'])
        
        time_walking_bicycling_minutes.append(hour_to_minute + minute)
    else:
        time_walking_bicycling_minutes.append(np.nan)
        
time_walking_bicycling_minutes = pd.DataFrame({'time-walking-bicycling-minutes': time_walking_bicycling_minutes})
cleaned_data = pd.concat([cleaned_data, time_walking_bicycling_minutes], axis = 1)

In [29]:
# append - How much time do you usually spend sitting or reclining on a typical day? - (numeric)
try:
    time_sedentary = []

    for index, row in data.iterrows():
        if row['p16a'].is_integer() or row['p16b'].is_integer():
            hour_to_minute = minute = 0
            
            if row['p16a'].is_integer():
                hour_to_minute = int(row['p16a']) * 60
            elif row['p16b'].is_integer():
                minute = int(row['p16b'])
            
            time_sedentary.append(hour_to_minute + minute)
        else:
            time_sedentary.append(np.nan)
            
    time_sedentary = pd.DataFrame({'time-sedentary': time_sedentary})
    cleaned_data = pd.concat([cleaned_data, time_sedentary], axis = 1)
except KeyError:
    cleaned_data['time-sedentary'] = np.nan

In [30]:
# append - Have you ever had your blood pressure measured by a doctor or other health worker? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['h2a']], axis = 1)
    cleaned_data.rename(columns={'h2a':'had-blood-pressure-measurement'}, inplace=True)
except KeyError:
    cleaned_data['had-blood-pressure-measurement'] = np.nan

In [31]:
# append - Have you taken any drugs (medication) for raised blood pressure? - (category)
cleaned_data = pd.concat([cleaned_data, data['h3a']], axis = 1)
cleaned_data.rename(columns={'h3a':'taken-drugs-for-raised-bp'}, inplace=True)
cleaned_data['taken-drugs-for-raised-bp'] = pd.to_numeric(cleaned_data['taken-drugs-for-raised-bp'], errors='coerce')

In [32]:
# append - Have you ever had your blood sugar measured by a doctor or other health worker? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['h6']], axis = 1)
    cleaned_data.rename(columns={'h6':'had-blood-sugar-measurement'}, inplace=True)
    cleaned_data['had-blood-sugar-measurement'] = pd.to_numeric(cleaned_data['had-blood-sugar-measurement'], errors='coerce')
except KeyError:
    cleaned_data['had-blood-sugar-measurement'] = np.nan

In [33]:
# append - Have you taken any drugs (medication) for diabetes? - (category)
taken_diabetes_drugs = []
for index, row in data.iterrows():
    h8a = h8b = 0
    if row['h8a'].is_integer():
        h8a = 0 if int(row['h8a']) == 2 else 1
        if row['h8b'].is_integer():
            h8b = 0 if int(row['h8b']) == 2 else 1
        
        out = int(h8a or h8b)
        
        taken_diabetes_drugs.append(out)
    else:
        taken_diabetes_drugs.append(np.nan)
        
taken_diabetes_drugs = pd.DataFrame({'taken-diabetes-drugs': taken_diabetes_drugs})
cleaned_data = pd.concat([cleaned_data, taken_diabetes_drugs], axis = 1)

In [34]:
# append - Have you ever had your cholesterol (fat levels in your blood) measured by a doctor or other health worker? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['h12']], axis = 1)
    cleaned_data.rename(columns={'h12':'had-cholesterol-measurement'}, inplace=True)
    cleaned_data['had-cholesterol-measurement'] = pd.to_numeric(cleaned_data['had-cholesterol-measurement'], errors='coerce')
except KeyError:
    cleaned_data['had-cholesterol-measurement'] = np.nan

In [35]:
# append - Have you taken any drugs (medication) for diabetes? - (category)
had_cholesterol_measurement = []
for index, row in data.iterrows():
    l1a = l2a = 0
    if row['l1a'].is_integer():
        l1a = 0 if int(row['l1a']) == 2 else 1
        if row['l2a'].is_integer():
            l2a = 0 if int(row['l2a']) == 2 else 1
        
        out = int(l1a or l2a)
        
        had_cholesterol_measurement.append(out)
    else:
        had_cholesterol_measurement.append(np.nan)
        
had_cholesterol_measurement = pd.DataFrame({'had-cholesterol-measurement': had_cholesterol_measurement})
cleaned_data = pd.concat([cleaned_data, had_cholesterol_measurement], axis = 1)

In [36]:
# append - Have you taken any oral treatment (medication) for raised total cholesterol? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['l3a']], axis = 1)
    cleaned_data.rename(columns={'l3a':'taken-cholesterol-oral-treatment'}, inplace=True)
    cleaned_data['taken-cholesterol-oral-treatment'] = pd.to_numeric(cleaned_data['taken-cholesterol-oral-treatment'], errors='coerce')
except KeyError:
    cleaned_data['taken-cholesterol-oral-treatment'] = np.nan

In [37]:
# append - Have you ever had a heart attack or chest pain from heart disease (angina) or a stroke (cerebrovascular accident or incident)? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['f1f']], axis = 1)
    cleaned_data.rename(columns={'f1f':'had-heart-attack'}, inplace=True)
    cleaned_data['had-heart-attack'] = pd.to_numeric(cleaned_data['had-heart-attack'], errors='coerce')
except KeyError:
    cleaned_data['had-heart-attack'] = np.nan

In [38]:
# append - Are you currently taking medication to prevent or treat heart disease? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['h18']], axis = 1)
    cleaned_data.rename(columns={'h18':'taking-heart-disease-medication'}, inplace=True)
    cleaned_data['taking-heart-disease-medication'] = pd.to_numeric(cleaned_data['taking-heart-disease-medication'], errors='coerce')
except KeyError:
    cleaned_data['taking-heart-disease-medication'] = np.nan

In [39]:
# append - Reading 1 [Systolic, Diastolic] - (numeric)

# systolic
cleaned_data = pd.concat([cleaned_data, data['m11a']], axis = 1)
cleaned_data.rename(columns={'m11a':'reading1-systolic'}, inplace=True)
cleaned_data['reading1-systolic'] = pd.to_numeric(cleaned_data['reading1-systolic'], errors='coerce')

# diastolic
cleaned_data = pd.concat([cleaned_data, data['m11b']], axis = 1)
cleaned_data.rename(columns={'m11b':'reading1-diastolic'}, inplace=True)
cleaned_data['reading1-diastolic'] = pd.to_numeric(cleaned_data['reading1-diastolic'], errors='coerce')

In [40]:
# append - Reading 2 [Systolic, Diastolic] - (numeric)

# systolic
cleaned_data = pd.concat([cleaned_data, data['m12a']], axis = 1)
cleaned_data.rename(columns={'m12a':'reading2-systolic'}, inplace=True)
cleaned_data['reading2-systolic'] = pd.to_numeric(cleaned_data['reading2-systolic'], errors='coerce')

# diastolic
cleaned_data = pd.concat([cleaned_data, data['m12b']], axis = 1)
cleaned_data.rename(columns={'m12b':'reading2-diastolic'}, inplace=True)
cleaned_data['reading2-diastolic'] = pd.to_numeric(cleaned_data['reading2-diastolic'], errors='coerce')

In [41]:
# append - Reading 3 [Systolic, Diastolic] - (numeric)

# systolic
cleaned_data = pd.concat([cleaned_data, data['m13a']], axis = 1)
cleaned_data.rename(columns={'m13a':'reading3-systolic'}, inplace=True)
cleaned_data['reading3-systolic'] = pd.to_numeric(cleaned_data['reading3-systolic'], errors='coerce')

# diastolic
cleaned_data = pd.concat([cleaned_data, data['m13b']], axis = 1)
cleaned_data.rename(columns={'m13b':'reading3-diastolic'}, inplace=True)
cleaned_data['reading3-diastolic'] = pd.to_numeric(cleaned_data['reading3-diastolic'], errors='coerce')

In [42]:
# append - Have you been treated for raised blood pressure? - (category)
try:
    cleaned_data = pd.concat([cleaned_data, data['m14']], axis = 1)
    cleaned_data.rename(columns={'m14':'treated-for-raised-bp'}, inplace=True)
    cleaned_data['treated-for-raised-bp'] = pd.to_numeric(cleaned_data['treated-for-raised-bp'], errors='coerce')
except KeyError:
    cleaned_data['treated-for-raised-bp'] = np.nan

In [43]:
# append - For women: Are you pregnant? - (category)
cleaned_data = pd.concat([cleaned_data, data['m5']], axis = 1)
cleaned_data.rename(columns={'m5':'are-you-pregnant'}, inplace=True)
cleaned_data['are-you-pregnant'] = pd.to_numeric(cleaned_data['are-you-pregnant'], errors='coerce')

In [44]:
# append - Height - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m3']], axis = 1)
    cleaned_data.rename(columns={'m3':'height'}, inplace=True)
    cleaned_data['height'] = pd.to_numeric(cleaned_data['height'], errors='coerce')
except KeyError:
    cleaned_data['height'] = np.nan

In [45]:
# append - Weight - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m4']], axis = 1)
    cleaned_data.rename(columns={'m4':'weight'}, inplace=True)
    cleaned_data['weight'] = pd.to_numeric(cleaned_data['weight'], errors='coerce')
except KeyError:
    cleaned_data['weight'] = np.nan

In [46]:
# append - Waist circumference - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m7']], axis = 1)
    cleaned_data.rename(columns={'m7':'waist-circumference'}, inplace=True)
    cleaned_data['waist-circumference'] = pd.to_numeric(cleaned_data['waist-circumference'], errors='coerce')
except KeyError:
    cleaned_data['waist-circumference'] = np.nan

In [47]:
# append - hip circumference - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m15']], axis = 1)
    cleaned_data.rename(columns={'m15':'hip-circumference'}, inplace=True)
    cleaned_data['hip-circumference'] = pd.to_numeric(cleaned_data['hip-circumference'], errors='coerce')
except KeyError:
    cleaned_data['hip-circumference'] = np.nan

In [48]:
# append - Reading 1 (beats per minute) - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m16a']], axis = 1)
    cleaned_data.rename(columns={'m16a':'reading1-bpm'}, inplace=True)
    cleaned_data['reading1-bpm'] = pd.to_numeric(cleaned_data['reading1-bpm'], errors='coerce')
except KeyError:
    cleaned_data['reading1-bpm'] = np.nan

In [49]:
# append - Reading 2 (beats per minute) - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m16b']], axis = 1)
    cleaned_data.rename(columns={'m16b':'reading2-bpm'}, inplace=True)
    cleaned_data['reading2-bpm'] = pd.to_numeric(cleaned_data['reading2-bpm'], errors='coerce')
except KeyError:
    cleaned_data['reading2-bpm'] = np.nan

In [50]:
# append - Reading 3 (beats per minute) - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['m16c']], axis = 1)
    cleaned_data.rename(columns={'m16c':'reading3-bpm'}, inplace=True)
    cleaned_data['reading3-bpm'] = pd.to_numeric(cleaned_data['reading3-bpm'], errors='coerce')
except KeyError:
    cleaned_data['reading3-bpm'] = np.nan

In [51]:
# append - Fasting blood glucose - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b5']], axis = 1)
    cleaned_data.rename(columns={'b5':'fasting-blood-glucose'}, inplace=True)
    cleaned_data['fasting-blood-glucose'] = pd.to_numeric(cleaned_data['fasting-blood-glucose'], errors='coerce')
except KeyError:
    cleaned_data['fasting-blood-glucose'] = np.nan

In [52]:
# append - Total cholesterol - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b8']], axis = 1)
    cleaned_data.rename(columns={'b8':'total-cholesterol'}, inplace=True)
    cleaned_data['total-cholesterol'] = pd.to_numeric(cleaned_data['total-cholesterol'], errors='coerce')
except KeyError:
    cleaned_data['total-cholesterol'] = np.nan

In [53]:
# append - Urinary sodium - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b14']], axis = 1)
    cleaned_data.rename(columns={'b14':'urinary-sodium'}, inplace=True)
    cleaned_data['urinary-sodium'] = pd.to_numeric(cleaned_data['urinary-sodium'], errors='coerce')
except KeyError:
    cleaned_data['urinary-sodium'] = np.nan

In [54]:
# append - Urinary creatinine - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b15']], axis = 1)
    cleaned_data.rename(columns={'b15':'urinary-creatinine'}, inplace=True)
    cleaned_data['urinary-creatinine'] = pd.to_numeric(cleaned_data['urinary-creatinine'], errors='coerce')
except KeyError:
    cleaned_data['urinary-creatinine'] = np.nan

In [55]:
# append - Triglycerides - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b16']], axis = 1)
    cleaned_data.rename(columns={'b15':'triglycerides'}, inplace=True)
    cleaned_data['triglycerides'] = pd.to_numeric(cleaned_data['triglycerides'], errors='coerce')
except KeyError:
    cleaned_data['triglycerides'] = np.nan

In [56]:
# append - hdl-cholesterol - (numeric)
try:
    cleaned_data = pd.concat([cleaned_data, data['b17']], axis = 1)
    cleaned_data.rename(columns={'b17':'hdl-cholesterol'}, inplace=True)
    cleaned_data['hdl-cholesterol'] = pd.to_numeric(cleaned_data['hdl-cholesterol'], errors='coerce')
except KeyError:
    cleaned_data['hdl-cholesterol'] = np.nan

In [57]:
cleaned_data.head(20)
# cleaned_data.tail(20)

Unnamed: 0,country,sex,age,years-at-school,level-of-education,marital-status,work-status,ppl-in-household,earnings-per-year,currently-smoke-tobacco,...,hip-circumference,reading1-bpm,reading2-bpm,reading3-bpm,fasting-blood-glucose,total-cholesterol,urinary-sodium,urinary-creatinine,triglycerides,hdl-cholesterol
0,bahamas,Women,40,12.0,5.0,2.0,1.0,2,,2,...,,86.0,91.0,95.0,,,,,,
1,bahamas,Women,37,18.0,7.0,2.0,1.0,2,95000.0,2,...,,87.0,82.0,81.0,,,,,,
2,bahamas,Men,49,15.0,7.0,2.0,2.0,2,64000.0,2,...,,75.0,87.0,75.0,,,,,,
3,bahamas,Women,31,16.0,7.0,1.0,1.0,1,22000.0,2,...,,77.0,91.0,95.0,,,,,,
4,bahamas,Men,58,16.0,7.0,1.0,3.0,1,,2,...,,75.0,79.0,76.0,,,,,,
5,bahamas,Women,36,14.0,7.0,2.0,1.0,2,,2,...,,112.0,81.0,115.0,,,,,,
6,bahamas,Men,37,12.0,5.0,1.0,8.0,1,,1,...,,79.0,82.0,80.0,,,,,,
7,bahamas,Women,36,12.0,5.0,1.0,1.0,4,78214.35,2,...,,64.0,63.0,68.0,,,,,,
8,bahamas,Women,39,12.0,5.0,2.0,3.0,2,,2,...,,84.0,85.0,101.0,,,,,,
9,bahamas,Women,48,12.0,6.0,1.0,1.0,3,,2,...,,72.0,73.0,67.0,,,,,,


In [58]:
cleaned_data.shape

(1643, 56)

In [59]:
cleaned_data.columns

Index(['country', 'sex', 'age', 'years-at-school', 'level-of-education',
       'marital-status', 'work-status', 'ppl-in-household',
       'earnings-per-year', 'currently-smoke-tobacco', 'age-started-smoking',
       'length-time-smoking', 'number-tobacco', 'type-tobacco',
       'age-stopped-smoking', 'smoke-home-workplace', 'consumed-alcohol',
       'quit-drinking-for-health', 'number-alcoholic-drinks',
       'number-daily-fruit-vegetables', 'salt-consumption', 'work-intensity',
       'days-vigorous-exercise', 'days-moderate-exercise',
       'time-walking-bicycling-minutes', 'time-sedentary',
       'had-blood-pressure-measurement', 'taken-drugs-for-raised-bp',
       'had-blood-sugar-measurement', 'taken-diabetes-drugs',
       'had-cholesterol-measurement', 'had-cholesterol-measurement',
       'taken-cholesterol-oral-treatment', 'had-heart-attack',
       'taking-heart-disease-medication', 'reading1-systolic',
       'reading1-diastolic', 'reading2-systolic', 'reading2-diasto

### Export cleaned data as csv

In [60]:
# cleaned_data.to_csv('C:\\Users\\georg\\Documents\\BPC Cleaned data\\Armenia.csv', index=False, encoding="utf-8") # windows george
cleaned_data.to_csv('../../cleaned_data/bahamas.csv', encoding='utf-8', index=False) # mac ekaba

# End