In [1]:
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as BS
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

In [2]:
# Begin timing
start_time = datetime.now()

In [3]:
# Set the executable path and initialize Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [4]:
# Visit the 'International Social Security Association' site
url = 'https://ww1.issa.int/country-profiles'
browser.visit(url)

In [5]:
# Parse the HTML
html = browser.html
html_soup = BS(html, 'html.parser')

In [6]:
# Look at the data - if needed
# print(html_soup.prettify())

In [7]:
# Create variables for the links and text
c_links = []
c_txt = []

# Open the full list of country links
country_row = html_soup.find('div', class_='row')
for url in country_row.find_all('a'):
    pg_link = url.get('href')
    #print(pg_link)
    txt = url.get_text()
    # Replace the segmented country names
    if 'Venezuela' in txt:
        txt = 'Venezuela'
    elif 'Tanzania' in txt:
        txt = 'Tanzania'
    elif 'Maldives' in txt:
        continue
    elif 'Monaco' in txt:
        txt = 'Monaco'
    elif 'Moldova' in txt:
        txt = 'Moldova'
    elif 'Korea' in txt:
        txt = 'South Korea'
    elif 'Hong Kong' in txt:
        txt = 'Hong Kong (China)'
    elif 'Iran' in txt:
        txt = 'Iran'
    elif 'Congo, Democratic' in txt:
        txt = 'Democratic Republic of the Congo'
    elif 'Bolivia' in txt:
        txt = 'Bolivia'
    elif 'Bahrain' in txt:
        txt = 'Bahrain'
        
    if 'country=' in str(pg_link):
        c_links.append(pg_link)
        c_txt.append(txt)
        

In [8]:
# Build profile page urls
base_url = 'https://ww1.issa.int'
data_date = []
ss_type = []
prof_urls = []

for lnk in c_links:
    new_url = base_url + lnk
    prof_urls.append(new_url)
    
    browser.visit(new_url)
    # Parse the HTML
    prof_html = browser.html
    prof_soup = BS(prof_html, 'html.parser')
    
    # Locate the date of last profile update
    prof_update = prof_soup.find('div', class_='profile-update-date')
    data_date.append(prof_update.b.get_text())

    # Locate the type of system
    try:
        prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Sickness and maternity" in tag.text).find_next(lambda tag:tag.name=="strong" and "Type of program:" in tag.text).next_sibling
        # print(f'First try - : {prof_cont}')
    except:
        try:
            prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Old age, invalidity and survivors" in tag.text).find_next(lambda tag:tag.name=="strong" and "Type of program:" in tag.text).next_sibling
            # print(f'Exception try - : {prof_cont}')
        except:
            prof_cont = 'unknown'
            
    # print(prof_cont)
    ss_type.append(prof_cont)


In [9]:
end_time = datetime.now()
print('Data Collection Duration: {}'.format(end_time - start_time))

Data Collection Duration: 0:06:21.651680


In [10]:
len(ss_type)

184

In [11]:
len(c_links)

184

In [12]:
# Convert the lists to DataFrames
data_date_ser = pd.Series(data_date)                                                                                                                             
data_date_df = data_date_ser.to_frame()

ss_type_ser = pd.Series(ss_type)                                                                                                                             
ss_type_df = ss_type_ser.to_frame()

prof_urls_ser = pd.Series(prof_urls)                                                                                                                             
prof_urls_df = prof_urls_ser.to_frame()

c_txt_ser = pd.Series(c_txt)                                                                                                                             
c_txt_df = c_txt_ser.to_frame()


In [13]:
data_date_df

Unnamed: 0,0
0,January 2018
1,January 2019
2,January 2018
3,January 2019
4,July 2019
...,...
179,July 2019
180,July 2018
181,July 2018
182,January 2019


In [14]:
# Merge the DataFrames and rename the columns
country_social_security_df = c_txt_df.merge(ss_type_df, left_index=True, right_index=True)

country_social_security_df  = country_social_security_df.rename({'0_x': 'country', '0_y': 'system_type'}, axis=1)

country_social_security_df

Unnamed: 0,country,system_type
0,Albania,Universal (medical benefits) and social insur...
1,Algeria,Social insurance system.
2,Andorra,Social insurance system.
3,Angola,"Universal (medical benefits), social insuranc..."
4,Antigua and Barbuda,Social insurance system.
...,...,...
179,Venezuela,Universal (birth grant and medical benefits) ...
180,Viet Nam,Social insurance system.
181,Yemen,Universal (medical benefits) and employer-lia...
182,Zambia,Universal (medical benefits) and employer-lia...


In [15]:
# Merge the DataFrames and rename the columns
country_social_security_df = country_social_security_df.merge(data_date_df, left_index=True, right_index=True)

country_social_security_df  = country_social_security_df.rename({'country': 'country', 'system_type': 'system_type', '0': 'profile_data_date'}, axis=1)

country_social_security_df

Unnamed: 0,country,system_type,0
0,Albania,Universal (medical benefits) and social insur...,January 2018
1,Algeria,Social insurance system.,January 2019
2,Andorra,Social insurance system.,January 2018
3,Angola,"Universal (medical benefits), social insuranc...",January 2019
4,Antigua and Barbuda,Social insurance system.,July 2019
...,...,...,...
179,Venezuela,Universal (birth grant and medical benefits) ...,July 2019
180,Viet Nam,Social insurance system.,July 2018
181,Yemen,Universal (medical benefits) and employer-lia...,July 2018
182,Zambia,Universal (medical benefits) and employer-lia...,January 2019


In [16]:
country_social_security_df.to_csv('country_social_security_systems.csv', index=False)

In [17]:
country_name_df = c_txt_df

In [18]:
# Merge the DataFrames and rename the columns
country_urls_df = country_name_df.merge(prof_urls_df, left_index=True, right_index=True)

country_urls_df  = country_urls_df .rename({'0_x': 'country', '0_y': 'profile_url'}, axis=1)

country_urls_df 

Unnamed: 0,country,profile_url
0,Albania,https://ww1.issa.int/node/195543?country=786
1,Algeria,https://ww1.issa.int/node/195543?country=787
2,Andorra,https://ww1.issa.int/node/195543?country=789
3,Angola,https://ww1.issa.int/node/195543?country=790
4,Antigua and Barbuda,https://ww1.issa.int/node/195543?country=792
...,...,...
179,Venezuela,https://ww1.issa.int/node/195543?country=1007
180,Viet Nam,https://ww1.issa.int/node/195543?country=1008
181,Yemen,https://ww1.issa.int/node/195543?country=1009
182,Zambia,https://ww1.issa.int/node/195543?country=1010


In [19]:
country_urls_df.to_csv('country_profile_urls.csv', index=False)