# Google Images Scraping
### Friso Vijverberg
#### Adapted from 'https://www.33rdsquare.com/how-to-scrape-google-images/'

In [102]:
import csv
import os
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup

In [103]:
# Scrape Google Images thumbnails for each bird and download them to
# data/<search term>/Img<N>.jpg.

# List of birds required
keywords = ["saddle billed stork", "egyptian goose", "fish eagle", "ostrich", "cape gull"]

# Number of result pages to request per keyword (identical for every keyword,
# so it is set once instead of on every iteration)
num_pages = 20

# Iterate over each keyword
for search_term in keywords:

    # Encode the search term to URL-friendly format (spaces become '+')
    encoded_term = urllib.parse.quote_plus(search_term)

    # Image data collected for this keyword.
    # NOTE: the list is re-created for every keyword, so once the outer loop
    # has finished it only holds the entries of the last keyword.
    results = []

    for page in range(num_pages):

        # Construct search URL from the *encoded* term and the page offset
        url = f"https://www.google.com/search?q={encoded_term}&tbm=isch&start={page*100}"

        # Request page. The timeout keeps one stalled connection from hanging
        # the whole run; a failed request only costs us this page.
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as err:
            print(f"Skipping page {page} for '{search_term}': {err}")
            continue

        # Do not parse error pages (e.g. rate limiting) as if they were results
        if not response.ok:
            print(f"Skipping page {page} for '{search_term}': HTTP {response.status_code}")
            continue

        # Create BeautifulSoup object from response
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all img tags; the first one is skipped to ignore the google logo
        for image in soup.find_all('img')[1:]:

            # Get image source URL; an <img> without one cannot be downloaded
            src = image.get('src')
            if not src:
                continue

            # Store source URL together with the image alt text
            results.append({'src': src, 'alt': image.get('alt')})

    # Folder to save images. The files are written below data/, so that is
    # the directory that has to exist (parents included).
    save_folder = search_term
    target_dir = os.path.join("data", save_folder)
    os.makedirs(target_dir, exist_ok=True)

    # n is a basic counter used for labelling the files
    for n, image in enumerate(results):

        # Concatenate folder and file name
        file_name = os.path.join(target_dir, f"Img{n}.jpg")

        # Download image to folder; one bad URL must not abort the other downloads.
        # URLError/HTTPError are OSError subclasses, ValueError covers unknown URL types.
        try:
            urllib.request.urlretrieve(image['src'], file_name)
        except (OSError, ValueError) as err:
            print(f"Could not download {image['src']}: {err}")

In [100]:
# Export the scraped image data (alt text and source URL) to google_images.csv.
# NOTE: `results` is filled by the scraping cell, where it is re-created for
# every keyword, so only the images of the last keyword end up in this file.

# CSV column headers
headers = ['Title', 'URL']

# NB! Data will be overwritten each time the program runs: mode 'w' truncates
# an existing file, so no separate delete step is required.
# newline='' is what the csv module expects (otherwise blank rows appear on
# Windows); utf-8 keeps non-ASCII alt texts intact on every platform.
with open('google_images.csv', 'w', newline='', encoding='utf-8') as outputfile:

    writer = csv.writer(outputfile)

    # Write column headers
    writer.writerow(headers)

    # Write data rows: one row per image, alt text first, then the source URL
    writer.writerows([image['alt'], image['src']] for image in results)