In [1]:
import argparse
import itertools
import json
import logging
import os
import re
import socket
import sys
import uuid
from urllib.error import HTTPError, URLError
from urllib.parse import quote_plus
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup

In [12]:
class google_image_scrapper:
    """Scrape Google Images search results for people and save the images.

    For every (name, occupation, country) triple a Google Images search page
    is fetched, image links are read from the JSON stored in the page's
    ``div.rg_meta`` elements, and up to ``num_images`` of them are downloaded
    into ``<output_dir>/<country>/<occupation>/<name>/``.

    NOTE(review): link extraction relies on Google serving ``div.rg_meta``
    JSON blocks with ``ou``/``ity`` keys -- confirm the markup still matches.
    """

    def __init__(self, num_images, output_dir, timeout=10):
        """Store the scraping configuration.

        Args:
            num_images: Maximum number of image links taken per search.
            output_dir: Root directory under which images are written.
            timeout: Timeout in seconds applied to every HTTP request.
        """
        self.REQUEST_HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
        self.base_url = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch"
        self.num_images = num_images
        self.output_dir = output_dir
        self.timeout = timeout

    def get_soup(self, url):
        """Fetch ``url`` and return it parsed as a BeautifulSoup tree.

        Network errors raised by ``urlopen`` propagate to the caller.
        """
        request = Request(url, headers=self.REQUEST_HEADER)
        # The context manager closes the connection once the page is parsed;
        # the timeout keeps a stalled search request from hanging the run.
        with urlopen(request, timeout=self.timeout) as response:
            return BeautifulSoup(response, 'html.parser')

    def get_query_url(self, search_query):
        """Return the Google Images search URL for ``search_query``.

        Runs of whitespace are collapsed and the query is percent-encoded so
        reserved characters (``&``, ``#``, ...) or non-ASCII text cannot
        corrupt the URL.
        """
        normalized_query = " ".join(search_query.split())
        return self.base_url.format(quote_plus(normalized_query))

    def extract_images_from_soup(self, soup):
        """Return a lazy iterator of ``(image_url, image_type)`` pairs.

        Args:
            soup: Parsed page exposing ``find_all``; every ``div.rg_meta``
                element is expected to hold a JSON object in its text.
        """
        image_elements = soup.find_all("div", {"class": "rg_meta"})
        return self._iter_link_type_records(image_elements)

    @staticmethod
    def _iter_link_type_records(image_elements):
        """Yield ``(ou, ity)`` for each element whose JSON parses and has a link."""
        for element in image_elements:
            try:
                metadata = json.loads(element.text)
            except ValueError:
                # One malformed metadata blob must not abort the whole scrape.
                continue
            if not isinstance(metadata, dict) or "ou" not in metadata:
                continue
            # A missing type becomes "" so save_image falls back to 'jpg'.
            yield metadata["ou"], metadata.get("ity", "")

    def extract_images(self, p_name, p_occupation, p_country):
        """Search Google Images for a person and return the first image links.

        Returns:
            An iterator over at most ``self.num_images`` ``(url, type)`` pairs.
        """
        url = self.get_query_url("{} ({} of {})".format(p_name, p_occupation, p_country))
        print("Souping")
        soup = self.get_soup(url)  # result page for the search query
        print("Extracting image urls")
        link_type_records = self.extract_images_from_soup(soup)
        return itertools.islice(link_type_records, self.num_images)

    def get_raw_image(self, url):
        """Download ``url`` and return the body as bytes.

        Returns:
            The raw response bytes, or ``None`` when the request times out or
            fails with an HTTP/URL error (a message is printed in both cases).
        """
        req = Request(url, headers=self.REQUEST_HEADER)
        try:
            with urlopen(req, timeout=self.timeout) as resp:
                return resp.read()
        except socket.timeout:
            # Raised directly when the timeout hits while reading the body.
            print("timeout occured")
        except (HTTPError, URLError) as error:
            # A timeout while connecting arrives wrapped in URLError.reason.
            if isinstance(getattr(error, "reason", None), socket.timeout):
                print("timeout occured")
            else:
                print('Data not retrieved because {}\nURL: {}'.format(error, url))
        return None

    def save_image(self, raw_image, image_type, image_number, p_name, p_occupation, p_country):
        """Write ``raw_image`` to ``<output_dir>/<country>/<occupation>/<name>/<number>.<ext>``.

        Args:
            raw_image: Image content as bytes.
            image_type: File extension reported for the image; ``'jpg'`` is
                used when it is empty.
            image_number: Number used as the file's base name.
            p_name, p_occupation, p_country: Components of the target folder.
        """
        # The type comes from scraped page data: keep only alphanumerics so it
        # cannot smuggle path separators into the file name.
        cleaned_type = re.sub(r"[^A-Za-z0-9]", "", str(image_type)) if image_type else ""
        extension = cleaned_type or 'jpg'
        save_path = os.path.join(self.output_dir, p_country, p_occupation, p_name)

        if not os.path.exists(save_path):
            print("creating directory {} ".format(save_path))
        # exist_ok avoids failing if the directory appears between the check
        # above and this call.
        os.makedirs(save_path, exist_ok=True)

        image_path = os.path.join(save_path, "{}.{}".format(image_number, extension))
        with open(image_path, 'wb') as image_file:
            image_file.write(raw_image)

    def download_images_to_dir(self, images, p_name, p_occupation, p_country):
        """Download every ``(url, image_type)`` pair in ``images`` and save it.

        Downloads are best-effort: a failed image is reported and skipped so
        the remaining images are still processed.
        """
        for i, (url, image_type) in enumerate(images):
            try:
                print("Making request ({}/{}): {}".format(i + 1, self.num_images, url))
                raw_image = self.get_raw_image(url)
                if raw_image is None:
                    continue
                self.save_image(raw_image, image_type, i, p_name, p_occupation, p_country)
            except Exception as e:
                # Deliberately broad: one bad image must not stop the batch.
                print(e)

    def run(self, p_names, p_occuaptions, p_countries):
        """Scrape and download images for each person.

        The three sequences are consumed in parallel with ``zip``, so
        processing stops at the end of the shortest one.
        """
        for p_name, p_occupation, p_country in zip(p_names, p_occuaptions, p_countries):
            print("=========>Starting scraping for {} ({} of {}) ".format(p_name, p_occupation, p_country))
            images = self.extract_images(p_name, p_occupation, p_country)
            print("Downloading images")
            self.download_images_to_dir(images, p_name, p_occupation, p_country)
            print("Finished")

In [13]:
# Example usage: scrape up to 15 images per listed person into ./images.
a = google_image_scrapper(num_images=15, output_dir="./images")
p_names = ["fawad khan", "mehwish hayat", "adnan siddique"]
# Size the parallel lists from p_names: run() zips the three lists, and zip
# stops at the shortest, so a hard-coded length would silently drop names.
p_occupations = ["actor"] * len(p_names)
p_countries = ["Pakistan"] * len(p_names)

a.run(p_names, p_occupations, p_countries)

Souping
Extracting image urls
Downloading images
Making request (1/15): https://akm-img-a-in.tosshub.com/indiatoday/images/story/201610/fawadurgent-story_647_100716065947.jpg
creating directory ./images/Pakistan/actor/fawad khan 
Making request (2/15): https://images.indianexpress.com/2016/03/fawad-khan-m.jpg
Making request (3/15): http://i1128.photobucket.com/albums/m495/rinakanif/2013%20new/2013%20new002/2013%20new002001/fkkk.jpg~original
Making request (4/15): http://www.samaa.tv/wp-content/uploads/2016/10/fawad-khan-2-e1475849210146.jpg
Making request (5/15): https://c.tribune.com.pk/2016/10/1195127-FawadKhan-1475848623.JPG
Making request (6/15): https://www.newspakistan.tv/wp-content/uploads/2016/09/fawad.jpg
Making request (7/15): https://c.tribune.com.pk/2018/02/1635624-fawadkhan-1518686390-698-640x480.jpg
Making request (8/15): https://www.shughal.com/wp-content/uploads/2015/03/hottest.jpg
Making request (9/15): https://folder.pk/wp-content/uploads/2017/11/fawad-khan.png
Making