# Crawl Country Introduction

In [187]:
%%HTML
<style>
    body{
 --vscode-font-family: "ComicShannsMono Nerd Font";
    }
</style>

## Import Libraries

In [188]:
import json
import os
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

## Set the parameters

In [189]:
# Input: JSON file with the list of country records that drives the crawl.
world_country_datapath = "../data/countries.json"
# Output: JSON file that receives the crawled introduction text and flag image URL per country.
country_intro_datapath = "../data/country_intro.json"

## Read all the countries from the JSON file

In [190]:
# Load every country record. The `with` block closes the file handle as soon as
# parsing finishes, and the explicit encoding keeps non-ASCII names readable
# regardless of the platform's default locale encoding.
with open(world_country_datapath, encoding="utf-8") as f:
    country_data_list = json.load(f)

# Show the first record so the available fields are visible in the notebook.
print(json.dumps(country_data_list[0], indent=4))

{
    "Country Name": "Afghanistan",
    "ISO2": "AF",
    "ISO3": "AFG",
    "TLD": "af",
    "FIPS": "AF",
    "ISO Numeric": "004",
    "GeoNameID": "1149361",
    "E164": "93",
    "Phone Code": "93",
    "Continent Name": "Asia",
    "Continent Code": "as",
    "Capital": "Kabul",
    "Time Zone in Capital": "Asia/Kabul",
    "Currency Name": "Afghani",
    "Languages": "Afghan Persian or Dari (official) 50%, Pashto (official) 35%, Turkic languages (primarily Uzbek and Turkmen) 11%, 30 minor languages (primarily Balochi and Pashai) 4%, much bilingualism, but Dari functions as the lingua franca",
    "Area KM2": "647500"
}


## Crawl the introduction of each country

In [191]:
# Open a Chrome browser through Selenium with an English (en-GB) language setting.
# The crawler below looks for a heading whose text is exactly "Description",
# so the pages must be rendered in English.
options = webdriver.ChromeOptions()
# Uncomment to run Chrome without a visible window:
# options.add_argument('--headless')
options.add_argument('--lang=en-GB')
# Shared browser instance handed to every CountryIntroduction created later.
browser = webdriver.Chrome(options=options)

In [192]:
class CountryIntroduction():
    """Introduction text and flag image URL for a single country.

    Both values are resolved eagerly in ``__init__``: each one is either taken
    from the manual override passed by the caller (``text`` / ``img``) or
    crawled with the supplied Selenium browser.

    Attributes:
        country_name: Name of the country as given by the caller.
        intro: Introduction text (override or crawled).
        image_path: URL of the flag image (override or crawled).
    """

    # Shortest introduction (in characters) that ``check_valid`` accepts.
    MIN_INTRO_LENGTH = 10
    # Seconds to wait for the image server before treating the URL as unreachable.
    REQUEST_TIMEOUT = 10

    def __init__(self, country_name, browser, text=None, img=None):
        """Resolve the introduction and the flag image URL for ``country_name``.

        Args:
            country_name: Country to look up.
            browser: Selenium WebDriver used for crawling; it is not touched
                when both ``text`` and ``img`` are provided.
            text: Optional introduction that replaces the crawled one.
            img: Optional flag image URL that replaces the crawled one.

        Raises:
            AssertionError: If the introduction has to be crawled and no
                "Description" heading is found on the search result page.
        """
        self.country_name = country_name
        self._browser = browser
        self._text = text
        self._img = img
        self.intro = self._get_introduction()
        self.image_path = self._get_country_flag_image_url()

    def _get_introduction(self) -> str:
        """Return the override text, or crawl it from a Google search page."""
        if self._text is not None:
            return self._text
        # Percent-encode the query so names containing spaces, '&', '+' or
        # non-ASCII characters reach the search engine unchanged.
        query = quote_plus(f"{self.country_name} Country")
        search_url = f"https://www.google.com/search?q={query}"
        self._browser.get(search_url)
        description_headings = [
            h3
            for h3 in self._browser.find_elements(By.XPATH, "//h3")
            if h3.text == "Description"
        ]
        assert description_headings, "No introduction found"
        # When several headings match, the last one is used; the text is read
        # from the <span> that sits under the heading's parent element.
        h3_target = description_headings[-1]
        return h3_target.find_element(By.XPATH, "../span").text

    def _get_country_flag_image_url(self) -> str:
        """Return the override image URL, or crawl it from countryflags.com."""
        if self._img is not None:
            return self._img
        name = self.country_name.replace(" ", "-")
        url = f"https://www.countryflags.com/flag-of-{name}/"
        # Use the browser injected through the constructor rather than a
        # notebook-level global, so the class works with any driver instance.
        self._browser.get(url)
        image_element = self._browser.find_element(
            By.XPATH, "//img[@class='img-fluid wp-post-image']"
        )
        return image_element.get_attribute("src")

    def check_valid(self):
        """Return True when the intro is long enough and the image URL answers 200.

        The image check is skipped when the introduction is already too short.
        Any ``requests`` error (including a timeout) counts as an invalid image.
        """
        def check_image_url(image_url):
            try:
                # The timeout keeps one unresponsive server from stalling the crawl.
                response = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
                return response.status_code == 200
            except requests.exceptions.RequestException:
                return False

        if len(self.intro) < self.MIN_INTRO_LENGTH or check_image_url(self.image_path) is False:
            return False
        return True

In [226]:
# Manual overrides, keyed by country name. An entry may provide "img" (flag
# image URL) and/or "text" (introduction); whatever is provided replaces the
# crawled value for that country.
# Every key appears exactly once: a repeated key in a dict literal silently
# discards the earlier value.
# NOTE(review): judging by the file names, a few URLs point to maps, globes or
# coats of arms rather than flags (Antarctica, Christmas Island, Mayotte,
# Reunion, U.S. Virgin Islands) - confirm this is intended.
check_dict = {
    # NOTE(review): the file name suggests the flag of Samoa, not American Samoa - confirm.
    "American Samoa": {"img": "https://www.countryflags.com/wp-content/uploads/samoa-flag-png-large.png"},
    "Anguilla": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Flag_of_Anguilla.svg/800px-Flag_of_Anguilla.svg.png"},
    "Antarctica": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Antarctica_%28orthographic_projection%29.svg/640px-Antarctica_%28orthographic_projection%29.svg.png"},
    "Aruba": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f6/Flag_of_Aruba.svg/1200px-Flag_of_Aruba.svg.png"},
    "Bermuda": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/bf/Flag_of_Bermuda.svg/800px-Flag_of_Bermuda.svg.png"},
    "British Virgin Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/4/42/Flag_of_the_British_Virgin_Islands.svg"},
    "Cayman Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/0/0f/Flag_of_the_Cayman_Islands.svg"},
    "Christmas Island": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/9/99/Map_of_Christmas_Island_1976.jpg/390px-Map_of_Christmas_Island_1976.jpg"},
    "Cocos Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/7/74/Flag_of_the_Cocos_%28Keeling%29_Islands.svg"},
    "Cook Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/35/Flag_of_the_Cook_Islands.svg/1200px-Flag_of_the_Cook_Islands.svg.png"},
    "Democratic Republic of the Congo": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Flag_of_the_Democratic_Republic_of_the_Congo.svg/1200px-Flag_of_the_Democratic_Republic_of_the_Congo.svg.png"},
    "Falkland Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Flag_of_the_Falkland_Islands.svg/1200px-Flag_of_the_Falkland_Islands.svg.png"},
    "Faroe Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3c/Flag_of_the_Faroe_Islands.svg/1200px-Flag_of_the_Faroe_Islands.svg.png"},
    "French Polynesia": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/db/Flag_of_French_Polynesia.svg/1200px-Flag_of_French_Polynesia.svg.png"},
    "Gibraltar": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/Flag_of_Gibraltar.svg/1200px-Flag_of_Gibraltar.svg.png"},
    "Guernsey": {"img": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Flag_of_Guernsey.svg"},
    "Isle of Man": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/5d/Flag_of_the_Isle_of_Mann.svg/1200px-Flag_of_the_Isle_of_Mann.svg.png"},
    "Ivory Coast": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Flag_of_C%C3%B4te_d%27Ivoire.svg/1200px-Flag_of_C%C3%B4te_d%27Ivoire.svg.png"},
    "Jersey": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1c/Flag_of_Jersey.svg/800px-Flag_of_Jersey.svg.png"},
    "Macau": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/63/Flag_of_Macau.svg/800px-Flag_of_Macau.svg.png"},
    "Mayotte": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/bf/Coat_of_Arms_of_Mayotte.svg/640px-Coat_of_Arms_of_Mayotte.svg.png"},
    "Montserrat": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Flag_of_Montserrat.svg/1200px-Flag_of_Montserrat.svg.png"},
    "Netherlands Antilles": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/eb/Flag_of_the_Netherlands_Antilles_%281959%E2%80%931986%29.svg/1200px-Flag_of_the_Netherlands_Antilles_%281959%E2%80%931986%29.svg.png"},
    "North Korea": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Flag_of_North_Korea.svg/1200px-Flag_of_North_Korea.svg.png"},
    "Northern Mariana Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e0/Flag_of_the_Northern_Mariana_Islands.svg/1200px-Flag_of_the_Northern_Mariana_Islands.svg.png"},
    # Single entry for this country: it carries both the manual text and the
    # 125px thumbnail, which are the values the lookup has always resolved to.
    "Republic of the Congo": {
        "text": "The Republic of the Congo, also known as Congo-Brazzaville, West Congo, Congo Republic, ROC, or simply either Congo or the Congo, is a country located on the western coast of Central Africa to the west of the Congo River. It is bordered to the west by Gabon, to the northwest by Cameroon, to the northeast by the Central African Republic, to the southeast by the Democratic Republic of the Congo, to the south by the Angolan exclave of Cabinda, and to the southwest by the Atlantic Ocean.",
        "img": "https://upload.wikimedia.org/wikipedia/commons/thumb/9/92/Flag_of_the_Republic_of_the_Congo.svg/125px-Flag_of_the_Republic_of_the_Congo.svg.png"
    },
    "Reunion": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Reunion_21.12S_55.51E.jpg/1200px-Reunion_21.12S_55.51E.jpg"},
    "Saint Barthelemy": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Flag_of_Saint_Barth%C3%A9lemy_%28local%29.svg/1200px-Flag_of_Saint_Barth%C3%A9lemy_%28local%29.svg.png"},
    "Saint Helena": {
        "text": "Saint Helena is one of the three constituent parts of Saint Helena, Ascension and Tristan da Cunha, a remote British overseas territory.",
        "img": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Flag_of_Saint_Helena.svg/1024px-Flag_of_Saint_Helena.svg.png"
    },
    "Saint Martin": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Flag_of_Saint-Martin_%28fictional%29.svg/800px-Flag_of_Saint-Martin_%28fictional%29.svg.png"},
    # Alternative image for Saint Pierre and Miquelon, currently disabled in favour of the French flag:
    # "Saint Pierre and Miquelon": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Flag_of_Saint-Pierre_and_Miquelon.svg/2560px-Flag_of_Saint-Pierre_and_Miquelon.svg.png"},
    "Saint Pierre and Miquelon": {"img": "https://upload.wikimedia.org/wikipedia/commons/c/c3/Flag_of_France.svg"},
    "Sint Maarten": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Flag_of_Sint_Maarten.svg/1200px-Flag_of_Sint_Maarten.svg.png"},
    "South Korea": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/09/Flag_of_South_Korea.svg/800px-Flag_of_South_Korea.svg.png"},
    "Svalbard and Jan Mayen": {"img": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSc63-X9NAK4ZXDl6Nqrc6o4fDoqi9qOlZomMuEBAk_6g&s"},
    "Turks and Caicos Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a0/Flag_of_the_Turks_and_Caicos_Islands.svg/1200px-Flag_of_the_Turks_and_Caicos_Islands.svg.png"},
    "U.S. Virgin Islands": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/United_States_Virgin_Islands_on_the_globe_%28Americas_centered%29.svg/250px-United_States_Virgin_Islands_on_the_globe_%28Americas_centered%29.svg.png"},
    "Vanuatu": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_Vanuatu.svg/1200px-Flag_of_Vanuatu.svg.png"},
    "Wallis and Futuna": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Flag_of_Wallis_and_Futuna.svg/1200px-Flag_of_Wallis_and_Futuna.svg.png"},
    "Western Sahara": {"img": "https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Flag_of_the_Sahrawi_Arab_Democratic_Republic.svg/800px-Flag_of_the_Sahrawi_Arab_Democratic_Republic.svg.png"},
}

In [194]:
# Country names the crawl loop skips entirely.
black_list = ["British Indian Ocean Territory", "Pitcairn", "Tokelau"]

In [195]:
# Crawl results keyed by country name. It lives in its own cell because the
# crawl loop skips names already present here, so the loop can be re-run after
# an error without discarding what was collected so far.
country_intro_dict = {}

In [228]:
# Collect the introduction and flag image for every country that is neither
# black-listed nor already collected. The loop stops at the first country whose
# result fails validation, so the failing name can be inspected and the cell re-run.
for data in tqdm(country_data_list):
    country_name = data['Country Name']
    continent_name = data['Continent Name']
    if country_name in country_intro_dict or country_name in black_list:
        continue

    # Manual overrides win over crawling; a missing override stays None.
    text = check_dict.get(country_name, {}).get("text")
    img = check_dict.get(country_name, {}).get("img")

    country_intro = CountryIntroduction(country_name, browser, text=text, img=img)
    if not country_intro.check_valid():
        print(f"Error in {country_name}")
        break

    country_intro_dict[country_name] = dict(
        intro_text=country_intro.intro,
        flag_image=country_intro.image_path,
    )

100%|██████████| 240/240 [00:08<00:00, 27.38it/s]


## Check and Save

In [229]:
# Write the collected country introductions and flag image URLs to disk as
# pretty-printed JSON (4-space indent).
with open(country_intro_datapath, "w") as intro_file:
    json.dump(country_intro_dict, intro_file, indent=4)