In [1]:
#Coursera - Applied Data Science Capstone Project
#Topic: What determines an adult's physical height?

#import libraries
import numpy as np
import pandas as pd
import urllib.request # import the library we use to open URLs
import requests # library to handle requests
from bs4 import BeautifulSoup # import the BeautifulSoup library so we can parse HTML and XML documents

In [2]:
#First, find the top 10 tallest and top 10 shortest countries via aggregating and averaging multiple data sources - men and women
url= "http://www.averageheight.co/average-male-height-by-country"
page=urllib.request.urlopen(url)
soup= BeautifulSoup(page, "lxml")# parse the HTML from our URL into the BeautifulSoup parse tree format
table= soup.find_all('table')[0] # Grab the first table

In [3]:
def tableDataText(table):    
    """Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)       
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]  
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[0], 'th')
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append(rowgetDataText(tr, 'td') ) # data row       
    return rows

In [4]:
htmltable = soup.find('table', { 'class' : 'display' })
list_table = tableDataText(htmltable)
list_table[:2]
df_AvgHeightCo = pd.DataFrame(list_table[1:], columns=list_table[0]) #dataframe for URL: averageheight.co

In [5]:
df_AvgHeightCo["Height (in)"]="" #add column for height in inches

In [6]:
df_AvgHeightCo["Height (m)"]=df_AvgHeightCo["Height (m)"].astype(float) #convert objects in column to float
df_AvgHeightCo["Height (in)"]=39.37007874*df_AvgHeightCo["Height (m)"] #39.37007874 inches in 1 meter - convert to inches since it is a US study

In [7]:
df_AvgHeightCo.head()

Unnamed: 0,Country,Continent,Height (m),Height (ft),Height (in)
0,Albania,Europe,1.74,"5' 8.5""",68.503937
1,Algeria,Africa,1.722,"5' 7.75""",67.795276
2,Argentina,South America,1.745,"5' 8.5""",68.700787
3,Australia,Oceania,1.756,"5' 9""",69.133858
4,Austria,Europe,1.792,"5'10.5""",70.551181


In [8]:
df_AvgHeightCo=df_AvgHeightCo.drop(['Continent','Height (m)','Height (ft)'], axis=1) #drop columns that provide no valuable insight to the study

In [9]:
decimals = 2    
df_AvgHeightCo['Height (in)'] = df_AvgHeightCo['Height (in)'].apply(lambda x: round(x, decimals)) #round the decimals in height (in)
df_AvgHeightCo.head() #dataframe 1 on average country height complete - now to compile other dataframes

Unnamed: 0,Country,Height (in)
0,Albania,68.5
1,Algeria,67.8
2,Argentina,68.7
3,Australia,69.13
4,Austria,70.55


In [10]:
url2 = "http://www.ncdrisc.org/downloads/height/NCD_RisC_eLife_2016_height_age18_countries.csv"
df=pd.read_csv(url2)

In [11]:
array= ['1996']
df_1996 = df.loc[df['Year of birth'].isin(array)]
df_men = df_1996.loc[df['Sex'] == 'Men']

In [12]:
df2=df_men.drop(['ISO','Mean height lower 95% uncertainty interval (cm)', "Mean height upper 95% uncertainty interval (cm)"], axis=1) 
df3=df2.drop(['Sex','Year of birth'], axis=1)

In [13]:
df3["Height (in)"]=""
df3["Height (in)"]=0.3937007874*df3["Mean height (cm)"]
decimals1 = 2    
df3['Height (in)'] = df3['Height (in)'].apply(lambda x: round(x, decimals1))
df_ncdrisc = df3.drop(['Mean height (cm)'], axis=1)#dataframe 2 on average country height complete - one more to go

In [21]:
#Third data source on average height from World data
from urllib.request import Request, urlopen
url2= "http://www.worlddata.info/average-bodyheight.php#by-population"
req = Request(url2 , headers={'User-Agent': 'Google Chrome/85.0.4183.83'})

webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]><html class="no-js lt-ie10 lt-ie9 lt-ie8 lt-ie7 "> <![endif]-->
<!--[if IE 7]><html class="no-js lt-ie10 lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]><html class="no-js lt-ie10 lt-ie9"> <![endif]-->
<!--[if IE 9]><html class="no-js lt-ie10"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <!-- Basic Page Needs -->
  <meta charset="utf-8"/>
  <!-- Mobile Specific Metas -->
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <!-- Favicons -->
  <link href="http://d28u20f0bzkbnx.cloudfront.net/favicon.ico" rel="shortcut icon"/>
  <link href="assets/avg_hght/slide-menu/slidebars.css" rel="stylesheet"/>
  <link href="assets/avg_hght/slide-menu/style.css" rel="stylesheet"/>
  <link href="http://www.averageheight.co/assets/avg_hght/imgs/avghght_57x57.png" rel="apple-touch-icon"/>
  <script type="text/javascript">
   // RELOADS WEBPAGE WHEN MOBILE ORIENTATION CHANGES	
		    win

In [19]:
table3= soup.find_all('table')[0]
table3

<table cellspacing="0" class="display" id="mens-height" width="100%">
<thead>
<tr>
<th>Country</th>
<th>Continent</th>
<th>Height (m)</th>
<th>Height (ft)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Albania</td>
<td>Europe</td>
<td>1.740</td>
<td>5' 8.5"</td>
</tr>
<tr>
<td>Algeria</td>
<td>Africa</td>
<td>1.722</td>
<td>5' 7.75"</td>
</tr>
<tr>
<td>Argentina</td>
<td>South America</td>
<td>1.745</td>
<td>5' 8.5"</td>
</tr>
<tr>
<td>Australia</td>
<td>Oceania</td>
<td>1.756</td>
<td>5' 9"</td>
</tr>
<tr>
<td>Austria</td>
<td>Europe</td>
<td>1.792</td>
<td>5'10.5"</td>
</tr>
<tr>
<td>Azerbaijan</td>
<td>Asia</td>
<td>1.718</td>
<td>5' 7.75"</td>
</tr>
<tr>
<td>Bahrain</td>
<td>Asia</td>
<td>1.651</td>
<td>5' 5"</td>
</tr>
<tr>
<td>Belgium</td>
<td>Europe</td>
<td>1.786</td>
<td>5'10.25"</td>
</tr>
<tr>
<td>Bolivia</td>
<td>South America</td>
<td>1.600</td>
<td>5' 3"</td>
</tr>
<tr>
<td>Brazil</td>
<td>South America</td>
<td>1.731</td>
<td>5' 8.25"</td>
</tr>
<tr>
<td>Bosnia &amp; Herzegovina</

In [18]:
htmltable3 = soup.find('table3', { 'class' : 'display' })
list_table3 = tableDataText(htmltable3)
df9 = pd.DataFrame(list_table3[1:], columns=list_table3[0]) #dataframe for URL: worlddata.info
df9.sort_values('Country')

Unnamed: 0,Country,Continent,Height (m),Height (ft)
0,Albania,Europe,1.740,"5' 8.5"""
1,Algeria,Africa,1.722,"5' 7.75"""
2,Argentina,South America,1.745,"5' 8.5"""
3,Australia,Oceania,1.756,"5' 9"""
4,Austria,Europe,1.792,"5'10.5"""
5,Azerbaijan,Asia,1.718,"5' 7.75"""
6,Bahrain,Asia,1.651,"5' 5"""
7,Belgium,Europe,1.786,"5'10.25"""
8,Bolivia,South America,1.600,"5' 3"""
10,Bosnia & Herzegovina,Europe,1.839,"6' 0.5"""
