Update scraper.py
Brandhunt committed Jan 27, 2020
1 parent eb7ea2a commit a3b53e5
Showing 1 changed file with 17 additions and 11 deletions.
scraper.py (28 changes: 17 additions & 11 deletions)
@@ -216,8 +216,14 @@ def doeshtmlelementexist(selectedel):
 # --> Check if any product import values should be pre-fetched from the domain misc.
 incr_link = ''
 incr_link_startnumber = ''
+override_timeout = ''
 if scrapsite['scrapefield']['domainmisc']:
     #print(scrapsite['scrapefield']['domainmisc'])
+    output = re.search(r'({override_timeout}(.*?))\{', scrapsite['scrapefield']['domainmisc'])
+    if output is not None and len(output.group(1)) > 0:
+        override_timeout = output.group(2)
+        scrapsite['scrapefield']['domainmisc'] = re.sub(r'({override_timeout}.*?(?=\{))', '', scrapsite['scrapefield']['domainmisc'])
+        #print(scrapsite['scrapefield']['domainmisc'])
     output = re.search(r'({incr_link}(.*?))\{', scrapsite['scrapefield']['domainmisc'])
     if output is not None and len(output.group(1)) > 0:
         incr_link = output.group(2)
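For context, the new override_timeout branch reuses the token convention already in place for incr_link: settings are packed into domainmisc as {key}value pairs, each value terminated by the next opening brace. A minimal standalone sketch of the parse, using a hypothetical domainmisc value:

import re

# Hypothetical example value; the real string comes from
# scrapsite['scrapefield']['domainmisc'].
domainmisc = '{override_timeout}600{incr_link}'

override_timeout = ''
output = re.search(r'({override_timeout}(.*?))\{', domainmisc)
if output is not None and len(output.group(1)) > 0:
    override_timeout = output.group(2)  # '600'
    # Strip the consumed token so later token searches see a clean string.
    domainmisc = re.sub(r'({override_timeout}.*?(?=\{))', '', domainmisc)

print(override_timeout)  # 600
print(domainmisc)        # {incr_link}

The captured value is consumed in the third hunk below, where the default 300-second budget becomes int(override_timeout).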
@@ -276,8 +282,8 @@ def doeshtmlelementexist(selectedel):
 browser.driver.set_script_timeout(300)
 try:
     browser.visit(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
-    print(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
-    time.sleep(10)
+    #print(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
+    time.sleep(2)
     html_source = browser.html
     if scrapsite['scrapefield']['phantomjsimport'] != 'phantomjsimport_pagenumber_alt':
         temp_root = lxml.html.fromstring(html_source)
@@ -291,9 +297,9 @@ def doeshtmlelementexist(selectedel):
 else:
     onlyScrollDown = True
 exists = True
-timeout = 300 # <-- Amount of seconds to run the whole thing
-clickTime = 10 # <--- Amount of time to wait between each click
-scrollTime = 10 # <--- Amount of time to wait between each scroll
+timeout = 300 if override_timeout == '' else int(override_timeout) # <-- Amount of seconds to run the whole thing
+clickTime = 2 # <--- Amount of time to wait between each click
+scrollTime = 0.5 # <--- Amount of time to wait between each scroll
 start_time = datetime.now()
 #browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
 #time.sleep(scrollTime)
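These hunks only set the time budget; the point where timeout is actually enforced lies outside the lines shown. Presumably the load-more loop compares elapsed time against it, along these lines (a hypothetical reconstruction, not part of this diff):

from datetime import datetime

timeout = 300  # seconds; or int(override_timeout) when an override is set
start_time = datetime.now()

while True:
    # ... click/scroll work would happen here ...
    if (datetime.now() - start_time).total_seconds() > timeout:
        break  # overall time budget spent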
@@ -302,8 +308,8 @@ def doeshtmlelementexist(selectedel):
 while exists is True:
     if incr_link != '':
         browser.visit(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
-        print(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
-        time.sleep(10)
+        #print(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
+        time.sleep(2)
         incr_link_startnumber = str(int(incr_link_startnumber) + 1)
     if onlyScrollDown is False:
         #browser.find_by_css(scrapsite['scrapefield']['productloadmoreselector']).first.click()
@@ -337,15 +343,15 @@ def doeshtmlelementexist(selectedel):
         if prod is not None:
             products.append(str(etree.tostring(prod)))
         childrenCountNew = len(products)
-        print('chCOUNT: ' + str(childrenCount))
-        print('chCOUNTNEW: ' + str(childrenCountNew))
+        #print('chCOUNT: ' + str(childrenCount))
+        #print('chCOUNTNEW: ' + str(childrenCountNew))
         exists = doeshtmlelementexist(temp_root.cssselect(scrapsite['scrapefield']['productloadmoreselector'])) if onlyScrollDown is False else False
         if scrapsite['scrapefield']['phantomjsimport'] == 'phantomjsimport_scroll_loadmore_wait':
             browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
             time.sleep(scrollTime)
             new_scrollheight = browser.execute_script("return document.body.scrollHeight")
-            print('scrollh: ' + str(cur_scrollheight))
-            print('scrollhNEW: ' + str(new_scrollheight))
+            #print('scrollh: ' + str(cur_scrollheight))
+            #print('scrollhNEW: ' + str(new_scrollheight))
             if new_scrollheight == cur_scrollheight:
                 break
             cur_scrollheight = new_scrollheight
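The surrounding (unchanged) logic is the usual "scroll until the page height stops growing" pattern, which this commit merely tunes: with scrollTime at 0.5 the page is polled far more aggressively than before. A self-contained sketch of the same pattern, assuming browser is a splinter Browser and, as in the scraper itself, that execute_script returns the script's value:

import time
from splinter import Browser

browser = Browser('chrome')  # assumption: any splinter driver would do
browser.visit('https://example.com/products')  # hypothetical URL

cur_scrollheight = browser.execute_script("return document.body.scrollHeight")
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)  # brief pause so lazy-loaded items can render
    new_scrollheight = browser.execute_script("return document.body.scrollHeight")
    if new_scrollheight == cur_scrollheight:
        break  # height stable: nothing more is loading
    cur_scrollheight = new_scrollheight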
