# Crawl Attractions

In [191]:
%%HTML
<!-- Sets the --vscode-font-family CSS custom property on <body>.
     NOTE(review): presumably this only changes the font used when the
     notebook is rendered inside VS Code - confirm. -->
<style>
    body{
 --vscode-font-family: "ComicShannsMono Nerd Font";
    }
</style>

## Import libraries

In [192]:
import json
import os
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

## Set the parameters

In [193]:
# Input: JSON file holding a list with one record per country.
world_country_datapath = "data/countries.json"
# Output: the crawled attractions, keyed by country name.
attraction_datapath = "data/attractions.json"
# Output: mapping of continent name -> list of country names.
country_continent_mapping_datapath = "data/country_continent_mapping.json"

## Read all the countries from the JSON file

In [194]:
# Load the country records. The context manager closes the file handle
# deterministically, and the explicit encoding keeps non-ASCII country names
# from being decoded with a platform-dependent default.
with open(world_country_datapath, encoding="utf-8") as country_file:
    country_data_list = json.load(country_file)

# Preview the first record to show which fields are available.
print(json.dumps(country_data_list[0], indent=4))

{
    "Country Name": "Afghanistan",
    "ISO2": "AF",
    "ISO3": "AFG",
    "TLD": "af",
    "FIPS": "AF",
    "ISO Numeric": "004",
    "GeoNameID": "1149361",
    "E164": "93",
    "Phone Code": "93",
    "Continent Name": "Asia",
    "Continent Code": "as",
    "Capital": "Kabul",
    "Time Zone in Capital": "Asia/Kabul",
    "Currency Name": "Afghani",
    "Languages": "Afghan Persian or Dari (official) 50%, Pashto (official) 35%, Turkic languages (primarily Uzbek and Turkmen) 11%, 30 minor languages (primarily Balochi and Pashai) 4%, much bilingualism, but Dari functions as the lingua franca",
    "Area KM2": "647500"
}


## Crawl the attractions of each country

In [195]:
# Start a Selenium-driven Chrome session. The resulting `browser` object is
# the driver handed to CountryAttraction when the countries are crawled.
options = webdriver.ChromeOptions()
# Headless mode is left disabled; uncomment the next line to enable it.
# options.add_argument('--headless')
# Ask Chrome for the en-GB locale. NOTE(review): presumably needed because the
# crawler looks for the English label 'More things to do' - confirm.
options.add_argument('--lang=en-GB')
browser = webdriver.Chrome(options=options)

In [196]:
class CountryAttraction:
    """Attractions scraped from Google search results for one country.

    Constructing an instance performs the crawl immediately with the supplied
    Selenium driver and stores the result in ``attractions``.

    Attributes:
        country_name: Name of the country, exactly as passed in.
        attractions: List of dicts with the keys ``"name"``, ``"image_url"``
            and ``"score"``; cards missing any of the three are left out.
    """

    def __init__(self, country_name, browser):
        """Crawl the attractions of ``country_name`` using ``browser``.

        Args:
            country_name: Country whose attractions are searched for.
            browser: Selenium WebDriver used to load and inspect the page.
        """
        self.country_name = country_name
        self._browser = browser
        self.attractions = self._get_attraction_info()

    def _get_attraction_info(self) -> list[dict]:
        """Search Google for the country's attractions and scrape the grid.

        Returns:
            One dict per attraction card that has a non-empty name, image URL
            and score.

        Raises:
            selenium.common.exceptions.NoSuchElementException: If the
                'More things to do' button or the ``wp-grid-view`` element is
                not present on the page.
        """
        # URL-encode the search terms so that spaces, '&', apostrophes, etc.
        # in a country name stay inside the q= parameter.
        query = quote_plus(f"{self.country_name} attractions")
        self._browser.get(f"https://www.google.com/search?q={query}")

        # Always use the driver this instance was given, not a module-level
        # global, so the class works with whatever browser is injected.
        more_button = self._browser.find_element(
            By.XPATH, "//span[contains(text(), 'More things to do')]"
        )
        more_button.click()

        # Every attraction card is a div[role='presentation'] inside the grid.
        grid_view = self._browser.find_element(By.TAG_NAME, "wp-grid-view")
        cards = grid_view.find_elements(By.CSS_SELECTOR, "div[role='presentation']")

        attraction_list = []
        for card in cards:
            # Image URL: src of the first <img> in the card.
            images = card.find_elements(By.TAG_NAME, "img")
            img_url = images[0].get_attribute("src") if images else None
            # Attraction name: text of the first <span> in the card.
            name_spans = card.find_elements(By.TAG_NAME, "span")
            attraction_name = name_spans[0].text if name_spans else None
            # Star score: text of the first span carrying aria-hidden.
            score_spans = card.find_elements(By.CSS_SELECTOR, "span[aria-hidden]")
            star_score = score_spans[0].text if score_spans else None
            # Keep only complete cards; empty strings count as missing.
            if attraction_name and img_url and star_score:
                attraction_list.append({
                    "name": attraction_name,
                    "image_url": img_url,
                    "score": star_score
                })
        return attraction_list

In [209]:
# Countries excluded from the crawl: the crawl loop skips every name listed here.
# NOTE(review): presumably the search results for these lack the attractions
# panel the crawler relies on - confirm before adding or removing entries.
black_list = [
    "British Indian Ocean Territory",
    "Pitcairn",
    "Tokelau"
]

In [198]:
# continent name -> list of country names, filled in by the crawl loop.
continent_dict = {}
# country name -> list of scraped attractions. The crawl loop skips countries
# already present here, so re-running this cell discards that progress.
country_attraction_dict = {}

In [211]:
def add_country_to_continent(country_name, continent_name, continent_dict):
    """Append ``country_name`` to the country list of ``continent_name``.

    ``continent_dict`` is modified in place; a continent that has no entry
    yet is first given an empty list.
    """
    continent_dict.setdefault(continent_name, []).append(country_name)

# Crawl every country that is neither black-listed nor already crawled, so
# this cell can simply be re-run to resume after an interruption.
for data in tqdm(country_data_list):
    country_name = data['Country Name']
    continent_name = data['Continent Name']
    if country_name in black_list or country_name in country_attraction_dict:
        continue
    country_attraction = CountryAttraction(country_name, browser)
    country_attraction_dict[country_name] = country_attraction.attractions
    # Register the continent only after the crawl succeeded. If the crawl
    # raises, the country is retried on the next run; registering it up front
    # would leave it in continent_dict without attractions and append it a
    # second time on the retry.
    add_country_to_continent(country_name, continent_name, continent_dict)

100%|██████████| 240/240 [00:41<00:00,  5.84it/s] 


## Check and Save

In [220]:
# List the countries whose crawl returned fewer than `threshold` attractions.
# The threshold has to be at least 1: len() is never negative, so a threshold
# of 0 can never match and the check would silently report nothing.
cnt = 0
threshold = 1
for country_name, attraction_list in country_attraction_dict.items():
    if len(attraction_list) < threshold:
        cnt += 1
        print(f"{cnt:>3} : {country_name}")

In [212]:
# save the country attraction data
with open(attraction_datapath, "w") as f:
    f.write(json.dumps(country_attraction_dict, indent=4))

# save the country continent mapping data
with open(country_continent_mapping_datapath, "w") as f:
    f.write(json.dumps(continent_dict, indent=4))