### Import Python Libraries

In [1]:
import requests
import lxml.html as lh
import pandas as pd

In [2]:
# Address of the page to scrape; its HTML table is parsed in the cells below
url = "https://developers.google.com/public-data/docs/canonical/countries_csv"

In [3]:
# Download the page. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() fails right here on an HTTP
# error instead of letting an error page be parsed as if it were the table.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [4]:
# Parse the response body into an lxml HTML element tree
# (lh is lxml.html, imported in the first cell)
doc = lh.fromstring(req.content)

In [5]:
# Select every <tr> (table row) element in the document;
# the '//tr' XPath matches rows at any depth of the tree
tr_elements = doc.xpath('//tr')

### Quality Control

In [6]:
# Sanity check that the rows really form a table: the first ten <tr>
# elements should all report the same number of child cells
list(map(len, tr_elements[:10]))

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

### Save the header row

In [7]:
# Start a fresh list of (column title, column values) pairs
table_data = []

# The first <tr> is the header row; each of its child cells is one column title
for position, header_cell in enumerate(tr_elements[0], start=1):
    # lxml's text_content() returns the text held inside the cell
    title = header_cell.text_content()

    # Optional check: show each title next to its 1-based position
    print(position, title)

    # Pair the title with an empty list that the data rows fill in later
    table_data.append((title, []))

1 country
2 latitude
3 longitude
4 name


### Save the data rows

In [10]:
# The header row (tr_elements[0]) was consumed above, so the data rows start
# at index 1. The text of each cell is appended to the list of the column it
# belongs to in table_data.

# A row belongs to the table only if it has one cell per header collected
# above (4 for this page).
col_nbr = len(table_data)

# Empty the column lists first so that re-running this cell does not append
# every row a second time.
for _title, column_values in table_data:
    column_values.clear()

for T in tr_elements[1:]:
    # A row with a different number of cells is not part of our table:
    # stop reading at that point.
    if len(T) != col_nbr:
        break

    # i is the column index of the cell t within the row
    # (.iterchildren() is lxml's iterator over the row's child elements)
    for i, t in enumerate(T.iterchildren()):
        # lxml's .text_content() returns the text held inside the cell
        data = t.text_content()

        # The first column is always kept as text; in the other columns,
        # text that is a whole number is converted to int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a whole number (a decimal such as "42.546245", or a
                # name): keep the original text unchanged.
                pass

        # Append the value to the list of the i'th column
        table_data[i][1].append(data)

In [11]:
# Sanity check: every column list should have collected the same number of rows
[len(pair[1]) for pair in table_data]

[245, 245, 245, 245]

### Save data into a pandas dataframe

In [12]:
# Turn the (title, values) pairs into a title -> values mapping and let
# pandas build one column per title
Dict = dict(table_data)
df = pd.DataFrame(Dict)

In [13]:
# Preview the first rows to confirm the columns and values look right
df.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [14]:
# Replace the scraped headers with the final column names, listed in the
# same left-to-right order as the existing columns
df.columns = [
    "Country",
    "Lat",
    "Lng",
    "Country_Name",
]

### Save dataframe to csv file

In [15]:
# Write the table to disk as a UTF-8 CSV file, leaving out the row index
df.to_csv(
    "countries_coordinate.csv",
    index=False,
    encoding='utf-8',
)