In [1]:
import os
import re
import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

In [2]:
df = pd.read_csv('tourism_name.csv')
df.head()

Unnamed: 0,Place_Id,Place_Name
0,1,Monumen Nasional
1,2,Kota Tua
2,3,Dunia Fantasi
3,4,Taman Mini Indonesia Indah (TMII)
4,5,Atlantis Water Adventure


In [3]:
keywords = df['Place_Name'].copy().tolist()
keywords_len = len(keywords)

print(f'There are {keywords_len} tourist attraction names in the dataset:')

for idx, place_name in enumerate(keywords[:3]):
    print(f'{idx + 1}. {place_name}')
print('...')
for idx, place_name in enumerate(keywords[-3:]):
    print(f'{keywords_len-2 + idx}. {place_name}')

There are 437 tourist attraction names in the dataset:
1. Monumen Nasional
2. Kota Tua
3. Dunia Fantasi
...
435. Taman Air Mancur Menari Kenjeran
436. Taman Flora Bratang Surabaya
437. Gereja Perawan Maria Tak Berdosa Surabaya


In [None]:
def search_images(keyword: str, num:int, driver: webdriver) -> None:
    """
    Scrapes images based on the given keyword.

    Args:
        keyword (str): The search term used to find images.
        num (int): The number of images to scrape and download.
        driver (webdriver): The Selenium WebDriver instance used for browsing.

    Returns:
        None

    Directories:
        - `images_output/<keyword>`: Stores downloaded images.
        - `csv_output`: Stores CSV files with metadata for scraped images.

    Output:
        - Downloads up to `num` images to the `images_output/<keyword>` directory.
        - Saves a CSV file containing image metadata in the `csv_output` directory.
    """


    # Make sure the keyword name does not contain the "\" or "/" signs.
    keyword = keyword.replace('\\', ' ').replace('/', ' ')
    
    img_xpath = '/html/body/div[11]/div[2]/div[3]/div/div/c-wiz/div/div[2]/div[2]/div/div[2]/c-wiz/div/div[3]/div[1]/a/img[1]'
    search_url = f"https://www.google.com/search?tbm=isch&q={keyword}"
    img_output_dir = 'images_output'
    csv_output_dir = 'csv_output'
    data = list()
    counter = 0

    if not os.path.exists(csv_output_dir):
        os.makedirs(csv_output_dir)
    if not os.path.exists(img_output_dir):
        os.makedirs(img_output_dir)
    if not os.path.exists(os.path.join(img_output_dir, keyword)):
        os.makedirs(os.path.join(img_output_dir, keyword))
    
    driver.get(search_url)
    time.sleep(3)

    imgs_pointer = driver.find_elements(By.CSS_SELECTOR,'div.F0uyec')
    
    for img_button in imgs_pointer:
        
        # If the number of image requests is met, then stop.
        if len(data) == num:
            break

        # Click and display the original image.
        driver.execute_script("arguments[0].click();", img_button)

        # Give 2 chances to get the original image link,
        # rather than the cached version of the image.
        for _ in range(2):
            time.sleep(2)
            img_link = driver.find_element(By.XPATH, img_xpath).get_attribute('src')
            cached_img = img_link.startswith('https://encrypted-tbn0.gstatic.com/')
            jpg_png = re.search(r'\.(jpg|png).*$', img_link)
            if not cached_img and jpg_png:
                break
        
        # If still only get the cache image, then get another image.
        if cached_img or not jpg_png:
            continue

        try:
            # Try downloading the image
            img_data = requests.get(img_link).content
            img_path = os.path.join(img_output_dir, keyword, f'{keyword}_{counter}.{jpg_png[1]}')
            with open(img_path, 'wb') as handler:
                handler.write(img_data)
        except Exception as e:
            print(e)
            continue
        
        # Append the link to data
        data.append({
            'keyword': keyword,
            'link': img_link
        })

        print('  ' + img_link)
        counter += 1

    # Save all the links to CSV file
    temp_df = pd.DataFrame(data)
    csv_path = os.path.join(csv_output_dir, f'{keyword}.csv')
    temp_df.to_csv(csv_path, index=False)
    print(f"  Successfully downloaded {len(data)} {keyword} images")

In [7]:
# If an error occurs during the scraping process,
# please replace the `start_idx`` number with the last
# index number of the tourist attraction being scraped
start_idx = 0
keywords_start = keywords[start_idx:]
print(f'A total of {len(keywords_start)} tourist attractions will be scraped')

A total of 437 tourist attractions will be scraped


In [6]:
num_of_image = 3
driver = webdriver.Firefox()
driver.get('https://google.com')
    
# Running the seach function through the all keywords
for index, keyword in enumerate(keywords_start):
    print(f"{index + start_idx} : Scraping {keyword} images")
    search_images(keyword=keyword, num=num_of_image, driver=driver)

driver.close()

0 : Scraping Monumen Nasional images
  https://upload.wikimedia.org/wikipedia/id/thumb/b/b1/Merdeka_Square_Monas_02.jpg/800px-Merdeka_Square_Monas_02.jpg
  https://dynamic-media-cdn.tripadvisor.com/media/photo-o/15/c3/d2/54/jakarta-amazing-tour.jpg?w=800&h=500&s=1
  https://cozzy.id/uploads/0000/630/2024/09/04/cozzyid-hotel-murah-hotel-terdekat-penginapan-murah-penginapan-terdekat-booking-hotel-monumen-nasional-monas-ikon-jakarta-yang-membanggakan-sumber-gambar-kompas.jpg
  Successfully downloaded 3 Monumen Nasional images
1 : Scraping Kota Tua images
  https://upload.wikimedia.org/wikipedia/commons/e/ee/Fatahillah.jpg
  https://dynamic-media-cdn.tripadvisor.com/media/photo-o/14/fc/01/83/wisata-kota-tua-mel-s.jpg?w=900&h=-1&s=1
  https://asset.kompas.com/crops/XK3vTK30F0d4NLCQ5oIMEGs9oMk=/0x0:1800x1200/1200x800/data/photo/2021/11/11/618cb2bd3245b.jpg
  Successfully downloaded 3 Kota Tua images
2 : Scraping Dunia Fantasi images
  https://s-light.tiket.photos/t/01E25EBZS3W0FY9GTG6C42E1SE