In [1]:
import argparse
import json
import itertools
import logging
import re
import os
import uuid
import sys
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup

In [59]:
class google_image_scrapper:
    def __init__(self,num_images,output_dir):
        self.REQUEST_HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
        self.base_url="https://www.google.co.in/search?q={}&source=lnms&tbm=isch"
        self.num_images=num_images
        self.output_dir=output_dir
        self.timeout=2
        
    def get_soup(self,url):
        response = urlopen(Request(url, headers=self.REQUEST_HEADER))
        return BeautifulSoup(response, 'html.parser')
    
    def get_query_url(self,search_query):
        return self.base_url.format("+".join(search_query.split()))
    
    def extract_images_from_soup(self,soup):
        image_elements = soup.find_all("div", {"class": "rg_meta"})
        metadata_dicts = (json.loads(e.text) for e in image_elements)
        link_type_records = ((d["ou"], d["ity"]) for d in metadata_dicts)
        return link_type_records
    
    def extract_images(self,query):
        url = self.get_query_url(query)
        print ("Souping")
        soup = self.get_soup(url)
        print ("Extracting image urls")
        link_type_records = self.extract_images_from_soup(soup)
        return itertools.islice(link_type_records, self.num_images+5)
    
    def get_raw_image(self,url):
        req = Request(url, headers=REQUEST_HEADER)
        try:
            resp = urlopen(req,timeout=self.timeout)
            resp=resp.read()
        except timeout :
            print ("timeout occured")
            resp=None
        except (HTTPError, URLError) as error:
            print ('Data of %s not retrieved because %s\nURL: %s', name, error, url)
            resp=None
        return resp
    
    def save_image(self,raw_image, image_type,image_number,dir_name):
        extension = image_type if image_type else 'jpg'
        save_path = os.path.join(self.output_dir, dir_name)
        
        if not os.path.exists(save_path):
            print ("creating directory {} ".format(save_path))
            os.makedirs(save_path)
        
        save_path = os.path.join(save_path, str(image_number))
        with open(save_path, 'wb') as image_file:
            image_file.write(raw_image)
            
    def download_images_to_dir(self,images,dir_name):
        for i, (url, image_type) in enumerate(images):
            try:
                print ("Making request ({}/{}): {}".format(i+1, self.num_images, url))
                raw_image = self.get_raw_image(url)
                if (raw_image is None):
                    continue
                self.save_image(raw_image, image_type,i,dir_name)
            except Exception as e:
                print (e)
    
    def run(self,query,dir_name):
        print ("Extracting image links")
        images = self.extract_images(query)
        print("Downloading images")
        self.download_images_to_dir(images,dir_name)
        print("Finished")

In [60]:
# def get_soup(url, header):
#     response = urlopen(Request(url, headers=header))
#     return BeautifulSoup(response, 'html.parser')

In [61]:
# def 

In [62]:
a=google_image_scrapper(num_images=15, output_dir="./actors")

In [63]:
a.run("actor humayun saeed","humayun_saeed/")

Extracting image links
Souping


[2018-10-31 11:13:43,745 DEBUG sjisprober]: SHIFT_JIS Japanese prober hit error at byte 314113
[2018-10-31 11:13:44,646 DEBUG eucjpprober]: EUC-JP Japanese prober hit error at byte 314111
[2018-10-31 11:13:45,204 DEBUG mbcharsetprober]: GB2312 Chinese prober hit error at byte 314113
[2018-10-31 11:13:45,805 DEBUG mbcharsetprober]: EUC-KR Korean prober hit error at byte 314111
[2018-10-31 11:13:46,391 DEBUG mbcharsetprober]: CP949 Korean prober hit error at byte 314111
[2018-10-31 11:13:46,915 DEBUG mbcharsetprober]: Big5 Chinese prober hit error at byte 314112
[2018-10-31 11:13:47,435 DEBUG mbcharsetprober]: EUC-TW Taiwan prober hit error at byte 314111
[2018-10-31 11:13:49,526 DEBUG charsetgroupprober]: windows-1251 Russian confidence = 0.01
[2018-10-31 11:13:49,527 DEBUG charsetgroupprober]: KOI8-R Russian confidence = 0.01
[2018-10-31 11:13:49,527 DEBUG charsetgroupprober]: ISO-8859-5 Russian confidence = 0.0
[2018-10-31 11:13:49,528 DEBUG charsetgroupprober]: MacCyrillic Russian co

Extracting image urls
Downloading images
Making request (0/15): http://www.pak101.com/gallery/ActorTv/Humayun_Saeed/2014/9/4/Humayun_Saeed_Pakistani_Male_Television_Actor_Celebrity_8_ltqlj_Pak101(dot)com.jpg
creating directory ./actors/humayun_saeed/ 
Making request (1/15): https://i.ytimg.com/vi/DZ6tt-swXV8/maxresdefault.jpg
name 'timeout' is not defined
Making request (2/15): https://www.brandsynario.com/wp-content/uploads/humayun.png
Making request (3/15): http://i.dawn.com/large/2015/11/56459323b08da.jpg
Making request (4/15): http://www.fashionuniverse.net/wp-content/uploads/2016/09/14102407_647683002058089_6573988816502484003_n.jpg
Making request (5/15): https://style.pk/wp-content/uploads/2012/06/Top-Actor-Humayun-Saeed-Full-Biography-0013.jpg
Making request (6/15): http://www.pakshowbiz.com/wp-content/uploads/2017/01/Humayun-Saeed-Father.jpg
Making request (7/15): https://cache.pakistantoday.com.pk/Humayun-Saeed-Ushna-Shah-Secret-Affair.jpg
Making request (8/15): https://c.trib

In [39]:
os.makedirs("./actor/jenni")