In [1]:
using HTTP # requests the HTML information from the web server
using Gumbo # parses HTML data
using Cascadia # to select all the elements using selectors
using DataFrames #helps to work with a tabular data
using WorldBankData #provides the global development data, it is used for country codes
using CSV # exporting the dataframe into csv format 

##### https://www.oecd.org/about/members-and-partners/ to collect the 38 member countries

In [2]:
url = "https://www.oecd.org/about/members-and-partners/"

"https://www.oecd.org/about/members-and-partners/"

In [3]:
request = HTTP.get(url)

HTTP.Messages.Response:
"""
HTTP/1.1 200 OK
Content-Type: text/html
Last-Modified: Sat, 09 Oct 2021 09:54:09 GMT
ETag: "80f6be9af3bcd71:0"
Server: Microsoft-IIS/10.0
X-Powered-By: ASP.NET
X-Frame-Options: SAMEORIGIN
X-UA-Compatible: IE=9
Date: Fri, 22 Oct 2021 02:48:46 GMT
Content-Length: 79062

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Our global reach - OECD</title>
    
    <meta name="description" content="Today, our 38 Member countries span the globe, from North and South America to Europe and Asia-Pacific. Find out more about OECD members and partners" />
 <!-- description -->
    <meta name="keywords" content="member countries, oecd, members, oecd members, partners, oecd countries" />
 <!-- keywords -->
      
    <meta property="og:image" content="http://www.oecd.org/media/oecdorg/about/lay

In [4]:
#Gumbo is used to parse the HTML, before parsing it is converted to String
html_doc = parsehtml(String(request.body))

HTML Document:
<!DOCTYPE html>
HTMLElement{:HTML}:<HTML lang="en">
  <head>
    <meta charset="utf-8"/>
    <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
    <meta content="width=device-width, initial-scale=1" name="viewport"/>
    <title>
      Our global reach - OECD
    </title>
    <meta content="Today, our 38 Member countries span the globe, from North and South America to Europe and Asia-Pacific. Find out more about OECD members and partners" name="description"/>
    <meta content="member countries, oecd, members, oecd members, partners, oecd countries" name="keywords"/>
    <meta content="http://www.oecd.org/media/oecdorg/about/layer2pages/where-did-you-know.jpg" property="og:image"/>
    <meta content="OECD member countries and partners" property="og:title"/>
    <meta content="Today, our 38 Member countries span the globe, from North and South America to Europe and Asia-Pacific. Find out more about OECD members and partners" property="og:description"/>
    <meta conte

In [5]:
body = html_doc.root[2]

HTMLElement{:body}:<body>
  <script>
        dataLayer = [{
            'siteName': "oecd.org",
            'siteEnvironment': "live",
            'pageLanguage': "en"
        }];
      </script>
  <script>(function (w, d, s, l, i) {
        w[l] = w[l] || [];
        w[l].push({
            'gtm.start':
                new Date().getTime(), event: 'gtm.js'
        });
        var f = d.getElementsByTagName(s)[0],
            j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
        j.async = true;
        j.src = 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
        f.parentNode.insertBefore(j, f);
    })(window, document, 'script', 'dataLayer', 'GTM-P5JSM4P');  </script>
...


In [6]:
#Country names are in "#members a", which is an id and to extract each matching node
#in the XML document

sel_for_country_name = Selector("#members a")
country_list = eachmatch(sel_for_country_name,body)

38-element Vector{HTMLNode}:
 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/australia">
  Australia
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/austria">
  Austria
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/belgium">
  Belgium
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/canada">
  Canada
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/chile">
  Chile
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/colombia/">
  Colombia
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/costarica/">
  Costa Rica
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/czech/">
  Czech Republic
</a>


 HTMLElement{:a}:<a class="country-list__country" href="https://www.oecd.org/denmark/">
  Denmark
</a>


 HTMLElement{:a}:<a c

In [7]:
country_name = nodeText.(country_list) 

38-element Vector{String}:
 "Australia"
 "Austria"
 "Belgium"
 "Canada"
 "Chile"
 "Colombia"
 "Costa Rica"
 "Czech Republic"
 "Denmark"
 "Estonia"
 "Finland"
 "France"
 "Germany"
 ⋮
 "New Zealand"
 "Norway"
 "Poland"
 "Portugal"
 "Slovak Republic"
 "Slovenia"
 "Spain"
 "Sweden"
 "Switzerland"
 "Turkey"
 "United Kingdom"
 "United States"

In [8]:
#Transforms the data into a Data frame
country_df = DataFrame(name = country_name)

Unnamed: 0_level_0,name
Unnamed: 0_level_1,String
1,Australia
2,Austria
3,Belgium
4,Canada
5,Chile
6,Colombia
7,Costa Rica
8,Czech Republic
9,Denmark
10,Estonia


In [9]:
#To get the elements in "WorldBankData" package
names(WorldBankData)

3-element Vector{Symbol}:
 :WorldBankData
 :search_wdi
 :wdi

In [10]:
#search_wdi() is used along with a regular expression r"." to get all the country names and corresponding iso3c codes
wbd = search_wdi("countries", "iso3c", r".")
wbd_df = select!(wbd,:iso3c,:name)

Unnamed: 0_level_0,iso3c,name
Unnamed: 0_level_1,String,String
1,ABW,Aruba
2,AFE,Africa Eastern and Southern
3,AFG,Afghanistan
4,AFR,Africa
5,AFW,Africa Western and Central
6,AGO,Angola
7,ALB,Albania
8,AND,Andorra
9,ARB,Arab World
10,ARE,United Arab Emirates


In [11]:
#Join country_df and wbd_df by country name  
country_df = leftjoin(country_df, wbd_df, on=:name, matchmissing=:equal, makeunique=true)

Unnamed: 0_level_0,name,iso3c
Unnamed: 0_level_1,String,String?
1,Australia,AUS
2,Austria,AUT
3,Belgium,BEL
4,Canada,CAN
5,Chile,CHL
6,Colombia,COL
7,Costa Rica,CRI
8,Czech Republic,CZE
9,Denmark,DNK
10,Estonia,EST


In [18]:
# Checking missing values
ismissing(country_df)

false

In [13]:
filter(row -> row.name == "Korea",country_df) 

Unnamed: 0_level_0,name,iso3c
Unnamed: 0_level_1,String,String?
1,Korea,missing


In [14]:
#searches for Korea in WorldBankData using regEx where i checks for case sensitivity
search_cname = search_wdi("countries","name",r"Korea"i)
search_cname[!, :name]

2-element Vector{String}:
 "Korea, Rep."
 "Korea, Dem. People's Rep."

In [15]:
search_cname[!, :iso3c]

2-element Vector{String}:
 "KOR"
 "PRK"

In [16]:
#Replacing the missing value with "KOR" for iso3c
country_df[(country_df.name .=="Korea"), :iso3c] .= "KOR"

1-element view(::Vector{Union{Missing, String}}, [21]) with eltype Union{Missing, String}:
 "KOR"

In [17]:
country_df

Unnamed: 0_level_0,name,iso3c
Unnamed: 0_level_1,String,String?
1,Australia,AUS
2,Austria,AUT
3,Belgium,BEL
4,Canada,CAN
5,Chile,CHL
6,Colombia,COL
7,Costa Rica,CRI
8,Czech Republic,CZE
9,Denmark,DNK
10,Estonia,EST


In [18]:
#Export Dataframe into .csv file
CSV.write("OECD_country_codes.csv", country_df)

"OECD_country_codes.csv"