Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandhunt committed Jan 27, 2020
1 parent 5d6d065 commit 9a315c8
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion scraper.py
Expand Up @@ -213,6 +213,20 @@ def doeshtmlelementexist(selectedel):
#print(json.dumps(jsonscrapsites))
for scrapsite in jsonscrapsites:
#print(json.dumps(scrapsite))
# --> Check if any product import values should be pre-fetched from the domain misc.
# Parses optional '{incr_link}<value>{' and '{incr_link_startnumber}<value>{' markers
# out of scrapsite['scrapefield']['domainmisc'], stripping each marker from the
# string once consumed. (NOTE: indentation was lost in this scraped diff; the
# nesting of the ifs below is reconstructed from context and should be verified
# against the original file.)
incr_link = ''
incr_link_startnumber = ''
if scrapsite['scrapefield']['domainmisc']:
# NOTE(review): re.search returns None when the '{incr_link}' marker is absent,
# so output.group(1) below would raise AttributeError — guard with `if output:`.
# Also, group(1) always contains the literal marker text, so `len(...) > 0` is
# always true whenever the search matched at all; the check is effectively `if output`.
output = re.search(r'({incr_link}(.*?))\{', scrapsite['scrapefield']['domainmisc'])
if len(output.group(1)) > 0:
# group(2) is the value between the marker and the next '{'.
incr_link = output.group(2)
# Remove the consumed marker+value from domainmisc before the next search.
scrapsite['scrapefield']['domainmisc'] = re.sub(r'({incr_link}.*?)\{', '', scrapsite['scrapefield']['domainmisc'])
# NOTE(review): same None risk as above if '{incr_link_startnumber}' is absent.
output = re.search(r'({incr_link_startnumber}(.*?))\{', scrapsite['scrapefield']['domainmisc'])
if len(output.group(1)) > 0:
incr_link_startnumber = output.group(2)
scrapsite['scrapefield']['domainmisc'] = re.sub(r'({incr_link_startnumber}.*?)\{', '', scrapsite['scrapefield']['domainmisc'])
else:
# Presumably defaults the pagination counter when no start number is
# configured — verify which `if` this `else` pairs with in the original file.
incr_link_startnumber = '0'
# --> Ignore current product import URL if necessary!
if scrapsite['scrapefield']['productignorethisone'] == '1':
continue
Expand Down Expand Up @@ -257,7 +271,7 @@ def doeshtmlelementexist(selectedel):
else:
browser.driver.set_script_timeout(300)
try:
browser.visit(scrapsite['scrapeurl'])
browser.visit(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
time.sleep(2)
html_source = browser.html
if scrapsite['scrapefield']['phantomjsimport'] != 'phantomjsimport_pagenumber_alt':
Expand All @@ -281,6 +295,9 @@ def doeshtmlelementexist(selectedel):
cur_scrollheight = browser.execute_script("return document.body.scrollHeight")
new_scrollheight = cur_scrollheight
while exists is True:
if incr_link != '':
browser.visit(scrapsite['scrapeurl'] + incr_link + incr_link_startnumber)
incr_link_startnumber = str(int(incr_link_startnumber) + 1)
if onlyScrollDown is False:
#browser.find_by_css(scrapsite['scrapefield']['productloadmoreselector']).first.click()
click_el = browser.driver.find_element_by_css_selector(scrapsite['scrapefield']['productloadmoreselector'])
Expand Down

0 comments on commit 9a315c8

Please sign in to comment.