In [1]:
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as BS
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import time

In [2]:
# Begin timing
start_time = datetime.now()

In [3]:
# Set the executable path and initialize Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=True)

In [4]:
# Visit the 'CIA World Factbook' site
url = 'https://www.cia.gov/the-world-factbook/countries/'
browser.visit(url)

In [5]:
# Locate the country links on each page

# Create variables for the links and text
c_links = []
c_txt = []

# Parse the HTML on each page and collect the country urls
for i in range(22):
    time.sleep(2)
    html = browser.html
    html_soup = BS(html, 'html.parser')

    country_row = html_soup.find('div', class_='col-lg-9 col-md-12 col-sm-12')
    for url in country_row.find_all('a'):
        pg_link = url.get('href')
        # print(pg_link)
        txt = url.get_text()

        # Skip the US Pacific Island Wildlife Refuges
        if 'united-states-pacific-island-wildlife-refuges' in pg_link:
            continue
            
        # Replace the segmented country names
        if 'World' in txt:
            continue
        elif 'West Bank' in txt:
            continue
        elif 'European Union' in txt:
            continue
        elif 'Venezuela' in txt:
            txt = 'Venezuela'
        elif 'Taiwan' in txt:
            txt = 'Taiwan (China)'
        elif 'Tanzania' in txt:
            txt = 'Tanzania'
        elif 'Micronesia' in txt:
            txt = 'Micronesia'              
        elif 'Korea, South' in txt:
            txt = 'South Korea'  
        elif 'Korea, North' in txt:
            txt = 'North Korea'            
        elif 'Hong Kong' in txt:
            txt = 'Hong Kong (China)'
        elif 'Gambia' in txt:
            txt = 'Gambia'
        elif 'Congo, Republic' in txt:
            txt = 'Republic of the Congo'
        elif 'Congo, Democratic' in txt:
            txt = 'Democratic Republic of the Congo'
        elif 'Bahamas' in txt:
            txt = 'Bahamas'

        c_links.append(pg_link)
        c_txt.append(txt)
        
    # Locate the next country page button and advance
    browser.find_by_css('.pagination__arrow-right').click() 

In [6]:
len(c_links)

251

In [7]:
#len(c_txt)

In [8]:
# Build profile page urls
base_url = 'https://www.cia.gov'
data_date = []
gov_type = []
prof_urls = []

for lnk in c_links:
    time.sleep(1)
    new_url = base_url + lnk
    prof_urls.append(new_url)
    
    browser.visit(new_url)
    # Parse the HTML
    prof_html = browser.html
    prof_soup = BS(prof_html, 'html.parser')
    
    # Locate the date of last profile update
    prof_update = prof_soup.find('label', class_='header-subsection-date')
    data_date.append(prof_update.get_text())
    #print(data_date)

    # Locate the type of government
    try:
        prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Government type" in tag.text).next_element.next_element.get_text()
        #print(f'First try - : {prof_cont}')
    except:
        try:
            prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Diplomatic representation in the US" in tag.text).next_element.next_element.get_text()
            # print(f'Exception try - : {prof_cont}')
        except:
            try:
                prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Independence" in tag.text).next_element.next_element.get_text()
                # print(f'Exception try 2 - : {prof_cont}')
            except:
                try:
                    prof_cont = prof_soup.find(lambda tag:tag.name=="a" and "Dependency status" in tag.text).next_element.next_element.get_text()
                    # print(f'Exception try 3 - : {prof_cont}')
                except:
                    prof_cont = 'unknown'

    # print(prof_cont)
    gov_type.append(prof_cont)

In [9]:
end_time = datetime.now()
print('Data Collection Duration: {}'.format(end_time - start_time))

Data Collection Duration: 0:06:36.425567


In [10]:
gov_type

['theocratic; the United States does not recognize the Taliban Government',
 'none (overseas territory of the UK)',
 'parliamentary republic',
 'presidential republic',
 'unincorporated, unorganized Territory of the US with local self-government; republican form of territorial government with separate executive, legislative, and judicial branches',
 "parliamentary democracy (since March 1993) that retains its chiefs of state in the form of a co-principality; the two princes are the President of France and Bishop of Seu d'Urgell, Spain",
 'presidential republic',
 'parliamentary democracy (House of Assembly); self-governing overseas territory of the UK',
 '',
 'parliamentary democracy under a constitutional monarchy; a Commonwealth realm',
 'presidential republic',
 'parliamentary democracy; note - constitutional changes adopted in December 2015 transformed the government to a parliamentary system',
 'parliamentary democracy; part of the Kingdom of the Netherlands',
 'none (territory of

In [11]:
# Convert the lists to DataFrames
data_date_ser = pd.Series(data_date)                                                                                                                             
data_date_df = data_date_ser.to_frame()

gov_type_ser = pd.Series(gov_type)                                                                                                                             
gov_type_df = gov_type_ser.to_frame()

prof_urls_ser = pd.Series(prof_urls)                                                                                                                             
prof_urls_df = prof_urls_ser.to_frame()

c_txt_ser = pd.Series(c_txt)                                                                                                                             
c_txt_df = c_txt_ser.to_frame()


In [12]:
data_date_df

Unnamed: 0,0
0,"Page last updated: November 28, 2023"
1,"Page last updated: November 17, 2023"
2,"Page last updated: November 17, 2023"
3,"Page last updated: November 29, 2023"
4,"Page last updated: November 20, 2023"
...,...
246,"Page last updated: November 14, 2023"
247,"Page last updated: November 14, 2023"
248,"Page last updated: November 14, 2023"
249,"Page last updated: November 29, 2023"


In [13]:
# Merge the DataFrames and rename the columns
country_government_df = c_txt_df.merge(gov_type_df, left_index=True, right_index=True)

country_government_df = country_government_df.rename({'0_x': 'country', '0_y': 'government_type'}, axis=1)

country_government_df

Unnamed: 0,country,government_type
0,Afghanistan,theocratic; the United States does not recogni...
1,Akrotiri,none (overseas territory of the UK)
2,Albania,parliamentary republic
3,Algeria,presidential republic
4,American Samoa,"unincorporated, unorganized Territory of the U..."
...,...,...
246,Wake Island,none (territory of the US)
247,Wallis and Futuna,parliamentary democracy (Territorial Assembly)...
248,Yemen,in transition
249,Zambia,presidential republic


In [14]:
# Merge the DataFrames and rename the columns
country_government_df = country_government_df.merge(data_date_df, left_index=True, right_index=True)

country_government_df = country_government_df.rename({'country': 'country', 'government_type': 'government_type', '0': 'profile_data_date'}, axis=1)

country_government_df

Unnamed: 0,country,government_type,0
0,Afghanistan,theocratic; the United States does not recogni...,"Page last updated: November 28, 2023"
1,Akrotiri,none (overseas territory of the UK),"Page last updated: November 17, 2023"
2,Albania,parliamentary republic,"Page last updated: November 17, 2023"
3,Algeria,presidential republic,"Page last updated: November 29, 2023"
4,American Samoa,"unincorporated, unorganized Territory of the U...","Page last updated: November 20, 2023"
...,...,...,...
246,Wake Island,none (territory of the US),"Page last updated: November 14, 2023"
247,Wallis and Futuna,parliamentary democracy (Territorial Assembly)...,"Page last updated: November 14, 2023"
248,Yemen,in transition,"Page last updated: November 14, 2023"
249,Zambia,presidential republic,"Page last updated: November 29, 2023"


In [15]:
country_government_df.to_csv('country_government_systems.csv', index=False)

In [16]:
country_name_df = c_txt_df

In [None]:
# Merge the DataFrames and rename the columns
country_urls_df = country_name_df.merge(prof_urls_df, left_index=True, right_index=True)

country_urls_df  = country_urls_df .rename({'0_x': 'country', '0_y': 'profile_url'}, axis=1)

country_urls_df 

In [None]:
country_urls_df.to_csv('country_government_urls.csv', index=False)