# Afghanistan

In [119]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [120]:
# read the raw survey export into `data`
raw_csv_path = '../../steps_dataset/afghanistan/afg2018.csv'
data = pd.read_csv(raw_csv_path)

In [121]:
# preview the first five raw rows
data.head(n=5)

Unnamed: 0,pid,agerange,wstep1,wstep2,wstep3,psu,stratum,urbanrural,province,district,...,b3,b4,b5,b6,b8,b9,b10,b13,b14,b15
0,57-24-1,18-29,7774.9404296875,7774.9404296875,8553.244140625,26,1,r,Kandahar,Dand,...,57,08:30:00.000+04:30,33,2,100,2,2,06:09:00.000+04:30,1.79769313486232e+308,1.79769313486232e+308
1,57-24-1,18-29,4895.3193359375,4895.3193359375,5385.361328125,151,1,u,Kandahar,KandaharCity,...,57,09:45:00.000+04:30,92,2,108,2,2,15:49:00.000+04:30,1.79769313486232e+308,1.79769313486232e+308
2,57-24-1,18-29,2591.64672851563,2591.64672851563,2851.08154296875,26,1,r,Kandahar,Dand,...,57,08:30:00.000+04:30,72,2,110,2,2,05:58:00.000+04:30,1.79769313486232e+308,1.79769313486232e+308
3,57-24-1,18-29,2591.64672851563,2591.64672851563,2851.08154296875,26,1,r,Kandahar,Dand,...,57,07:00:00.000+04:30,79,2,191,2,2,05:45:00.000+04:30,1.79769313486232e+308,1.79769313486232e+308
4,57-24-1,30-44,17998.78125,17998.78125,19218.142578125,144,1,u,Kandahar,KandaharCity,...,57,09:15:00.000+04:30,87,2,175,2,2,19:50:00.000+04:30,1.79769313486232e+308,1.79769313486232e+308


In [122]:
# report the dimensions of the raw frame
row_count, column_count = data.shape
print('Basic Information:')
print(f'Number of rows: {row_count}')
print(f'Number of columns: {column_count}')

Basic Information:
Number of rows: 3955
Number of columns: 191


In [123]:
# start from an empty frame; every following cell adds one or more derived feature columns to it
cleaned_data = pd.DataFrame(data=None)

In [124]:
# append country - (category)
# One constant label per row of `data`. Direct assignment keeps the cell safe
# to re-run (it overwrites the column instead of appending a duplicate).
cleaned_data['country'] = pd.Series('afghanistan', index=data.index)

In [125]:
# append sex - (category)
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['sex'] = data['sex']

In [126]:
# append age - (numeric)
# non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['age'] = pd.to_numeric(data['age'], errors='coerce')

In [127]:
# append years-at-school - (numeric), taken from column c4
# non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['years-at-school'] = pd.to_numeric(data['c4'], errors='coerce')

In [128]:
# append level-of-education - (category), taken from column c5
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['level-of-education'] = data['c5']

In [129]:
# append marital-status - (category), taken from column c7
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['marital-status'] = data['c7']

In [130]:
# append work-status - (category), taken from column c8
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['work-status'] = data['c8']

In [131]:
# append ppl-in-household - (numeric), taken from column c9
# non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['ppl-in-household'] = pd.to_numeric(data['c9'], errors='coerce')

In [132]:
# append earnings per year - (numeric)
# c10 is annualised according to c10type: 'month' amounts are multiplied by 12
# and 'year' amounts are used as-is. Any other c10type yields NaN.
# NOTE(review): if the dataset also reports amounts for other periods (e.g. weekly),
# they are currently dropped - confirm against the codebook.
periods_per_year = {'month': 12, 'year': 1}
earnings_per_year = [
    int(amount) * periods_per_year[period] if period in periods_per_year else np.nan
    for amount, period in zip(data['c10'], data['c10type'])
]

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['earnings-per-year'] = pd.to_numeric(
    pd.Series(earnings_per_year, index=data.index), errors='coerce'
)

In [133]:
# append - Do you currently smoke tobacco products? - (category), taken from column t1
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['currently-smoke-tobacco'] = data['t1']

In [134]:
# append - How old were you when you first started smoking? - (numeric), taken from column t3
# non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['age-started-smoking'] = pd.to_numeric(data['t3'], errors='coerce')

In [135]:
# append - Length of time smoking - (numeric), taken from column t4
# non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['length-time-smoking'] = pd.to_numeric(data['t4'], errors='coerce')

In [136]:
# append - Number of smoke/ smokeless per day (tobacco products) - (numeric)
# append - What type of tobacco do you use? - (category)
#
# For every respondent the twelve t5* columns are converted to per-day amounts:
# the plain columns (t5a..t5f) are used as-is, the '...w' columns are divided by 7
# and rounded to a whole number. Non-numeric entries count as 0. The largest amount
# becomes 'number-tobacco' (NaN when every amount is 0) and the product it belongs
# to becomes 'type-tobacco' ('none' when every amount is 0). On ties the first
# column in the order below wins.
tobacco_columns = [
    't5a', 't5aw',
    't5b', 't5bw',
    't5c', 't5cw',
    't5d', 't5dw',
    't5e', 't5ew',
    't5f', 't5fw',
]
# one label per (daily, weekly) column pair, in the same order as tobacco_columns
tobacco_labels = [
    'manufactured cigarettes',
    'hand-rolled cigarettes',
    'tobacco pipes',
    'cigars, cheroots, cigarillos',
    'shisha',
    'other tobacco product',
]

number_tobacco = []
type_tobacco = []

for _, row in data.iterrows():
    amounts = []
    for column in tobacco_columns:
        answer = row[column]
        if not answer.isdigit():
            amounts.append(0)
        elif column.endswith('w'):
            # weekly amount -> per-day amount
            amounts.append(round(int(answer) / 7, 0))
        else:
            amounts.append(int(answer))

    highest = max(amounts)
    if highest == 0:
        number_tobacco.append(np.nan)
        type_tobacco.append('none')
    else:
        number_tobacco.append(highest)
        # columns come in (daily, weekly) pairs, so position // 2 selects the product
        type_tobacco.append(tobacco_labels[amounts.index(highest) // 2])

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
# append number of tobacco
cleaned_data['number-tobacco'] = pd.to_numeric(
    pd.Series(number_tobacco, index=data.index), errors='coerce'
)

# append type of tobacco
cleaned_data['type-tobacco'] = pd.Series(type_tobacco, index=data.index)


In [137]:
# append - How old were you when you stopped smoking? - (numeric), taken from column t10
# Non-numeric answers and the value 77 are stored as NaN.
# NOTE(review): 77 is presumably the questionnaire's "don't know" code - confirm.
age_stopped_smoking = [
    int(answer) if answer.isdigit() and int(answer) != 77 else np.nan
    for answer in data['t10']
]

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['age-stopped-smoking'] = pd.to_numeric(
    pd.Series(age_stopped_smoking, index=data.index), errors='coerce'
)

In [138]:
# append - Smoke in home or workplace? - (category)
# Combines t17 and t18:
#   1   -> at least one of the two answers is 1
#   2   -> at least one answer is numeric but neither is 1
#   NaN -> neither answer is numeric
# Both columns are checked (the guard used to test t17 twice), and a non-numeric
# answer in one column no longer raises while the other column is numeric.
smoke_home_workplace = []
for home_answer, work_answer in zip(data['t17'], data['t18']):
    numeric_answers = [int(answer) for answer in (home_answer, work_answer) if answer.isdigit()]
    if not numeric_answers:
        smoke_home_workplace.append(np.nan)
    elif 1 in numeric_answers:
        smoke_home_workplace.append(1)
    else:
        smoke_home_workplace.append(2)

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['smoke-home-workplace'] = pd.Series(smoke_home_workplace, index=data.index)

In [139]:
# append - Have you ever consumed alcohol? - (category), taken from column a1
# all-digit answers are kept as integers, anything else becomes NaN
consumed_alcohol = [int(answer) if answer.isdigit() else np.nan for answer in data['a1']]

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['consumed-alcohol'] = pd.Series(consumed_alcohol, index=data.index)

In [140]:
# append - Have you stopped drinking due to health reasons? - (category), taken from column a3
# all-digit answers are kept as integers, anything else becomes NaN
quit_drinking_for_health = [int(answer) if answer.isdigit() else np.nan for answer in data['a3']]

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['quit-drinking-for-health'] = pd.Series(quit_drinking_for_health, index=data.index)

In [141]:
# append - How many alcoholic drinks do you consume per day? - (category), taken from column a4
# all-digit answers are kept as integers, anything else becomes NaN
number_alcoholic_drinks = [int(answer) if answer.isdigit() else np.nan for answer in data['a4']]

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['number-alcoholic-drinks'] = pd.Series(number_alcoholic_drinks, index=data.index)

In [142]:
# append - How many fruit/ vegetables do you eat per day? - (numeric)
# fruit = d1 * d2 and vegetables = d3 * d4; a pair contributes 0 unless both of
# its answers are all-digit strings. The stored value is fruit + vegetables, so a
# respondent with no usable answers gets 0 (not NaN).
# NOTE(review): if d1/d3 are days per week and d2/d4 servings per day, this is a
# weekly total rather than a daily one - confirm against the codebook.
number_daily_fruit_vegetables = []
for fruit_a, fruit_b, veg_a, veg_b in zip(data['d1'], data['d2'], data['d3'], data['d4']):
    fruit = int(fruit_a) * int(fruit_b) if fruit_a.isdigit() and fruit_b.isdigit() else 0
    vegetable = int(veg_a) * int(veg_b) if veg_a.isdigit() and veg_b.isdigit() else 0
    number_daily_fruit_vegetables.append(fruit + vegetable)

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['number-daily-fruit-vegetables'] = pd.Series(
    number_daily_fruit_vegetables, index=data.index
)

In [143]:
# append - How much salt or salty sauce do you think you consume? - (category), taken from column d8
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['salt-consumption'] = data['d8']

In [144]:
# append - How intense is your work [vigorous-intensity, moderate-intensity, none]? - (category)
# Derived from p1 and p4:
#   'vigorous-intensity' -> p1 is numeric and equals 1
#   'moderate-intensity' -> otherwise, p4 is numeric and equals 1
#   'none'               -> at least one answer is numeric but neither rule matched
#   NaN                  -> neither answer is numeric
# The moderate rule does not convert p1 again: reaching it already means p1 is
# not a numeric 1, and converting a non-numeric p1 would raise ValueError.
work_intensity = []
for vigorous_answer, moderate_answer in zip(data['p1'], data['p4']):
    vigorous_is_numeric = vigorous_answer.isdigit()
    moderate_is_numeric = moderate_answer.isdigit()

    if not (vigorous_is_numeric or moderate_is_numeric):
        work_intensity.append(np.nan)
    elif vigorous_is_numeric and int(vigorous_answer) == 1:
        work_intensity.append('vigorous-intensity')
    elif moderate_is_numeric and int(moderate_answer) == 1:
        work_intensity.append('moderate-intensity')
    else:
        work_intensity.append('none')

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['work-intensity'] = pd.Series(work_intensity, index=data.index)

In [145]:
# append - In a typical week, on how many days do you do vigorous- / moderate-intensity activities as part of your work? - (numeric)
# p2 -> days-vigorous-exercise, p5 -> days-moderate-exercise; non-numeric entries become NaN.
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['days-vigorous-exercise'] = pd.to_numeric(data['p2'], errors='coerce')
cleaned_data['days-moderate-exercise'] = pd.to_numeric(data['p5'], errors='coerce')

In [146]:
# append - How much time do you spend walking or bicycling for travel on a typical day? - (numeric)
# Total minutes = p9a (hours) * 60 + p9b (minutes). Each part is counted only when
# it is an all-digit string; the result is NaN when neither part is.
# The two parts are checked independently so the minutes are not dropped when
# hours are also present.
time_walking_bicycling_minutes = []

for hours_answer, minutes_answer in zip(data['p9a'], data['p9b']):
    if not (hours_answer.isdigit() or minutes_answer.isdigit()):
        time_walking_bicycling_minutes.append(np.nan)
        continue

    total_minutes = 0
    if hours_answer.isdigit():
        total_minutes += int(hours_answer) * 60
    if minutes_answer.isdigit():
        total_minutes += int(minutes_answer)
    time_walking_bicycling_minutes.append(total_minutes)

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['time-walking-bicycling-minutes'] = pd.Series(
    time_walking_bicycling_minutes, index=data.index
)

In [147]:
# append - How much time do you usually spend sitting or reclining on a typical day? - (numeric)
# Total minutes = p16a (hours) * 60 + p16b (minutes). Each part is counted only when
# it is an all-digit string; the result is NaN when neither part is.
# The two parts are checked independently so the minutes are not dropped when
# hours are also present.
time_sedentary = []

for hours_answer, minutes_answer in zip(data['p16a'], data['p16b']):
    if not (hours_answer.isdigit() or minutes_answer.isdigit()):
        time_sedentary.append(np.nan)
        continue

    total_minutes = 0
    if hours_answer.isdigit():
        total_minutes += int(hours_answer) * 60
    if minutes_answer.isdigit():
        total_minutes += int(minutes_answer)
    time_sedentary.append(total_minutes)

# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['time-sedentary'] = pd.Series(time_sedentary, index=data.index)

In [148]:
# append - Have you ever had your blood pressure measured by a doctor or other health worker? - (category), taken from column h1
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['had-blood-pressure-measurement'] = data['h1']

In [149]:
# append - Have you taken any drugs (medication) for raised blood pressure? - (category), taken from column h3
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['taken-drugs-for-raised-bp'] = pd.to_numeric(data['h3'], errors='coerce')

In [150]:
# append - Have you ever had your blood sugar measured by a doctor or other health worker? - (category), taken from column h6
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['had-blood-sugar-measurement'] = pd.to_numeric(data['h6'], errors='coerce')

In [151]:
# append - Have you taken any drugs (medication) for diabetes? - (category), taken from column h8
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['taken-diabetes-drugs'] = pd.to_numeric(data['h8'], errors='coerce')

In [152]:
# append - Have you ever had your cholesterol (fat levels in your blood) measured by a doctor or other health worker? - (category), taken from column h12
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['had-cholesterol-measurement'] = pd.to_numeric(data['h12'], errors='coerce')

In [153]:
# append - Have you taken any oral treatment (medication) for raised total cholesterol? - (category), taken from column h14
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['taken-cholesterol-oral-treatment'] = pd.to_numeric(data['h14'], errors='coerce')

In [154]:
# append - Have you ever had a heart attack or chest pain from heart disease (angina) or a stroke (cerebrovascular accident or incident)? - (category), taken from column h17
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['had-heart-attack'] = pd.to_numeric(data['h17'], errors='coerce')

In [155]:
# append - Are you currently taking medication to prevent or treat heart disease? - (category), taken from column h18
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['taking-heart-disease-medication'] = pd.to_numeric(data['h18'], errors='coerce')

In [156]:
# append - Reading 1 [Systolic, Diastolic] - (numeric)
# m4a -> systolic, m4b -> diastolic; non-numeric entries become NaN.
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['reading1-systolic'] = pd.to_numeric(data['m4a'], errors='coerce')
cleaned_data['reading1-diastolic'] = pd.to_numeric(data['m4b'], errors='coerce')

In [157]:
# append - Reading 2 [Systolic, Diastolic] - (numeric)
# m5a -> systolic, m5b -> diastolic; non-numeric entries become NaN.
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['reading2-systolic'] = pd.to_numeric(data['m5a'], errors='coerce')
cleaned_data['reading2-diastolic'] = pd.to_numeric(data['m5b'], errors='coerce')

In [158]:
# append - Reading 3 [Systolic, Diastolic] - (numeric)
# m6a -> systolic, m6b -> diastolic; non-numeric entries become NaN.
# direct assignment keeps the cell safe to re-run (overwrites, never duplicates)
cleaned_data['reading3-systolic'] = pd.to_numeric(data['m6a'], errors='coerce')
cleaned_data['reading3-diastolic'] = pd.to_numeric(data['m6b'], errors='coerce')

In [159]:
# append - Have you been treated for raised blood pressure? - (category), taken from column m7
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['treated-for-raised-bp'] = pd.to_numeric(data['m7'], errors='coerce')

In [160]:
# append - For women: Are you pregnant? - (category), taken from column m8
# stored as numbers, non-numeric entries become NaN; direct assignment keeps the cell safe to re-run
cleaned_data['are-you-pregnant'] = pd.to_numeric(data['m8'], errors='coerce')

In [161]:
# append - Height - (numeric), taken from column m11
# If the raw data has no m11 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm11' in data.columns:
    cleaned_data['height'] = pd.to_numeric(data['m11'], errors='coerce')
else:
    cleaned_data['height'] = np.nan

In [162]:
# append - Weight - (numeric), taken from column m12
# If the raw data has no m12 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm12' in data.columns:
    cleaned_data['weight'] = pd.to_numeric(data['m12'], errors='coerce')
else:
    cleaned_data['weight'] = np.nan

In [163]:
# append - Waist circumference - (numeric), taken from column m14
# If the raw data has no m14 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm14' in data.columns:
    cleaned_data['waist-circumference'] = pd.to_numeric(data['m14'], errors='coerce')
else:
    cleaned_data['waist-circumference'] = np.nan

In [164]:
# append - Hip circumference - (numeric), taken from column m15
# If the raw data has no m15 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm15' in data.columns:
    cleaned_data['hip-circumference'] = pd.to_numeric(data['m15'], errors='coerce')
else:
    cleaned_data['hip-circumference'] = np.nan

In [165]:
# append - Reading 1 (beats per minute) - (numeric), taken from column m16a
# If the raw data has no m16a column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm16a' in data.columns:
    cleaned_data['reading1-bpm'] = pd.to_numeric(data['m16a'], errors='coerce')
else:
    cleaned_data['reading1-bpm'] = np.nan

In [166]:
# append - Reading 2 (beats per minute) - (numeric), taken from column m16b
# If the raw data has no m16b column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm16b' in data.columns:
    cleaned_data['reading2-bpm'] = pd.to_numeric(data['m16b'], errors='coerce')
else:
    cleaned_data['reading2-bpm'] = np.nan

In [167]:
# append - Reading 3 (beats per minute) - (numeric), taken from column m16c
# If the raw data has no m16c column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'm16c' in data.columns:
    cleaned_data['reading3-bpm'] = pd.to_numeric(data['m16c'], errors='coerce')
else:
    cleaned_data['reading3-bpm'] = np.nan

In [168]:
# append - Fasting blood glucose - (numeric), taken from column b5
# If the raw data has no b5 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b5' in data.columns:
    cleaned_data['fasting-blood-glucose'] = pd.to_numeric(data['b5'], errors='coerce')
else:
    cleaned_data['fasting-blood-glucose'] = np.nan

In [169]:
# append - Total cholesterol - (numeric), taken from column b8
# If the raw data has no b8 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b8' in data.columns:
    cleaned_data['total-cholesterol'] = pd.to_numeric(data['b8'], errors='coerce')
else:
    cleaned_data['total-cholesterol'] = np.nan

In [170]:
# append - Urinary sodium - (numeric), taken from column b14
# If the raw data has no b14 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b14' in data.columns:
    cleaned_data['urinary-sodium'] = pd.to_numeric(data['b14'], errors='coerce')
else:
    cleaned_data['urinary-sodium'] = np.nan

In [171]:
# append - Urinary creatinine - (numeric), taken from column b15
# If the raw data has no b15 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b15' in data.columns:
    cleaned_data['urinary-creatinine'] = pd.to_numeric(data['b15'], errors='coerce')
else:
    cleaned_data['urinary-creatinine'] = np.nan

In [172]:
# append - Triglycerides - (numeric), taken from column b16
# If the raw data has no b16 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b16' in data.columns:
    cleaned_data['triglycerides'] = pd.to_numeric(data['b16'], errors='coerce')
else:
    cleaned_data['triglycerides'] = np.nan

In [173]:
# append - HDL cholesterol - (numeric), taken from column b17
# If the raw data has no b17 column the feature is filled with NaN; non-numeric
# entries become NaN. Direct assignment keeps the cell safe to re-run.
if 'b17' in data.columns:
    cleaned_data['hdl-cholesterol'] = pd.to_numeric(data['b17'], errors='coerce')
else:
    cleaned_data['hdl-cholesterol'] = np.nan

In [174]:
# inspect the first 20 rows of the assembled feature frame
cleaned_data.head(n=20)

Unnamed: 0,country,sex,age,years-at-school,level-of-education,marital-status,work-status,ppl-in-household,earnings-per-year,currently-smoke-tobacco,...,hip-circumference,reading1-bpm,reading2-bpm,reading3-bpm,fasting-blood-glucose,total-cholesterol,urinary-sodium,urinary-creatinine,triglycerides,hdl-cholesterol
0,afghanistan,Women,20.0,0.0,1,2,6,6.0,,2,...,,,,,33.0,100.0,,,,
1,afghanistan,Women,22.0,0.0,1,2,6,2.0,60000.0,2,...,,,,,92.0,108.0,,,,
2,afghanistan,Women,23.0,0.0,1,2,6,10.0,,2,...,,,,,72.0,110.0,,,,
3,afghanistan,Women,24.0,0.0,1,2,6,2.0,168000.0,2,...,,,,,79.0,191.0,,,,
4,afghanistan,Women,38.0,0.0,1,5,6,3.0,120000.0,2,...,,,,,87.0,175.0,,,,
5,afghanistan,Men,25.0,0.0,1,2,3,6.0,120000.0,2,...,,,,,77.0,166.0,,,,
6,afghanistan,Men,19.0,0.0,1,2,3,6.0,180000.0,2,...,,,,,105.0,172.0,,,,
7,afghanistan,Men,28.0,0.0,1,2,3,6.0,20000.0,2,...,,,,,100.0,112.0,,,,
8,afghanistan,Women,40.0,0.0,1,2,2,5.0,108000.0,2,...,,,,,83.0,147.0,,,,
9,afghanistan,Men,21.0,11.0,4,2,3,2.0,120000.0,2,...,,,,,128.0,126.0,,,,


In [175]:
# (number of rows, number of feature columns) of the assembled frame
cleaned_data.shape

(3955, 55)

In [176]:
# list the feature column names in the order they were added
cleaned_data.columns

Index(['country', 'sex', 'age', 'years-at-school', 'level-of-education',
       'marital-status', 'work-status', 'ppl-in-household',
       'earnings-per-year', 'currently-smoke-tobacco', 'age-started-smoking',
       'length-time-smoking', 'number-tobacco', 'type-tobacco',
       'age-stopped-smoking', 'smoke-home-workplace', 'consumed-alcohol',
       'quit-drinking-for-health', 'number-alcoholic-drinks',
       'number-daily-fruit-vegetables', 'salt-consumption', 'work-intensity',
       'days-vigorous-exercise', 'days-moderate-exercise',
       'time-walking-bicycling-minutes', 'time-sedentary',
       'had-blood-pressure-measurement', 'taken-drugs-for-raised-bp',
       'had-blood-sugar-measurement', 'taken-diabetes-drugs',
       'had-cholesterol-measurement', 'taken-cholesterol-oral-treatment',
       'had-heart-attack', 'taking-heart-disease-medication',
       'reading1-systolic', 'reading1-diastolic', 'reading2-systolic',
       'reading2-diastolic', 'reading3-systolic', 'rea

### Export cleaned data as CSV

In [177]:
# write the feature frame to disk as UTF-8 CSV, without the row index
cleaned_csv_path = '../../cleaned_data/afghanistan.csv'
cleaned_data.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

# End