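"""Hybrid scraping helper.

Fetches pages with a plain requests session first and falls back to headless
Chrome via Selenium when needed, parses them with lxml XPath queries, and
stores the extracted records with pandas.
"""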
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests

warnings.filterwarnings("ignore")

class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        # Timestamp baked into output file names so repeated runs don't collide.
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        # Default storage location: the directory containing this script.
        self.storagePath = os.path.dirname(os.path.abspath(__file__))

        # Browser-like headers for plain HTTP requests.
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        """Fetch a URL with requests; return the HTML text, or None on failure."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(
                url, headers=headers, params=params, timeout=self.timeout
            )

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # Follow the redirect target manually and retry once.
                response = self.reqSession.get(
                    response.headers['Location'], headers=headers, timeout=self.timeout
                )
                response.raise_for_status()
                return response.text

            if response.status_code == 503:
                # The site is blocking the request; let the caller fall back to Selenium.
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        """Extract links via XPath and make relative hrefs absolute."""
        try:
            # xpath may arrive as a string or as a list of string fragments.
            links = doc.xpath("".join(xpath))
            return [website + link if link.startswith("/") else link for link in links]

        except Exception as e:
            print("Error in getting links for xpath {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        """Build a headless Chrome driver tuned for fast, quiet scraping."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        # Skip image downloads to speed up page loads.
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        # Selenium 4 removed the chrome_options keyword; pass options instead.
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        """Render a URL in headless Chrome and return it as an lxml document."""
        driver = None
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            # Give client-side JavaScript time to render.
            time.sleep(waiting_time)
            return html.fromstring(driver.page_source)

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None
        finally:
            if driver is not None:
                # quit() shuts down the whole browser process, not just the tab.
                driver.quit()

    def get_xpath_data(self, doc, xpath):
        """Run an XPath query and return the matching nodes."""
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting data for xpath {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        """Scroll down in 1000px steps so lazy-loaded content has time to appear."""
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        """Combine partial DataFrames, drop duplicates, and write CSV or JSON."""
        storagePath = storagePath or self.storagePath
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                os.path.join(storagePath, "{}_{}.csv".format(name, self.stamp)),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                os.path.join(storagePath, "{}_{}.json".format(name, self.stamp)),
                orient="records",
            )

    def cleanData(self, array):
        """Normalize scraped strings: trim, drop empties, strip non-ASCII and newlines."""
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array

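# Minimal usage sketch (not part of the class): the URL and XPath expressions
# below are hypothetical placeholders, shown only to illustrate the intended
# flow: plain requests first, Selenium as the fallback when the site blocks us.
if __name__ == "__main__":
    scraper = SeleniumScraper(timeout=10)

    page = scraper.fetch_request_normal("https://www.example.com/")
    if page is not None:
        doc = html.fromstring(page)
    else:
        # fetch_request_normal returns None on a 503 block; render with Chrome instead.
        doc = scraper.fetch_request_selenium("https://www.example.com/", waiting_time=2)

    if doc is not None:
        links = scraper.get_xpath_link(doc, "//a/@href", "https://www.example.com")
        titles = scraper.cleanData(scraper.get_xpath_data(doc, "//h1//text()"))
        df = pd.DataFrame({"title": titles})
        scraper.data_storage([df], unique_id="title", name="example", storageFormat="csv")
        print("Saved {} titles and found {} links".format(len(titles), len(links)))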