-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpython_image_extract.py
75 lines (58 loc) · 2.09 KB
/
python_image_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import time
import os
# selenium is library used for automation of web browser
from selenium import webdriver
# urllib is used used to fetch and perform operation on URL
import urllib.request
# These Keys are referring to keyboard keys
# for example Keys.ENTER will refer to "enter" button on keyboard
from selenium.webdriver.common.keys import Keys
# Used to download Chrome Driver
from webdriver_manager.chrome import ChromeDriverManager
# ChromeDriverManager will download and install chrome driver
browser = webdriver.Chrome(ChromeDriverManager().install())
# get() will open URL provided in Parentheis
browser.get("https://www.google.com/")
# q specifically point to the search box and find element whose name='q'
search = browser.find_element_by_name('q')
# Given text is put in search box and enter key of keyboard is pressed
search.send_keys("sunflower",Keys.ENTER)
# link text will find "given text" within an anchor tag and return first matching element
elem = browser.find_element_by_link_text('Images')
# get href value within anchor tag and perform click
elem.get_attribute('href')
elem.click()
# scroll down the page for loading all images
value = 0
for i in range(20):
browser.execute_script("scrollBy("+ str(value) +",+1000);")
value += 1000
time.sleep(3)
# google images contain in a div tag with is 'islmp'.It will get element by id having value 'islmp'.
elem1 = browser.find_element_by_id('islmp')
# It will find all elements whose div tag contains img
sub = elem1.find_elements_by_tag_name("img")
#It will create image directory to store all images
try:
os.mkdir('images')
except FileExistsError:
pass
# It will iterate all images stored in sub
count = 0
for i in sub:
# get source of current image
src = i.get_attribute('src')
# try catch block beacause some image url may be none
try:
if(src != None):
src = str(src)
print(src)
count+=1
# urllib.request.urlretrieve used to download images
urllib.request.urlretrieve(src, os.path.join('images','image'+str(count)+'.jpg'))
if(count==500):
break
else:
raise TypeError
except TypeError:
print('fail')