Skip to content

Commit

Permalink
Improved logging, minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ARDivekar committed Feb 22, 2017
1 parent 92a1ed2 commit de5297b
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
14 changes: 8 additions & 6 deletions SearchDistribute/Distribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ def _get_index_of_coolest_worker(self):
if self.workers[i].time_of_last_retrieved_query < time_of_last_retrieved_query_of_coolest_worker:
index_of_coolest_worker = i
time_of_last_retrieved_query_of_coolest_worker = self.workers[i].time_of_last_retrieved_query
time_passed_since_last_fetched_from_coolest_worker = time.time() - self.workers[
index_of_coolest_worker].time_of_last_retrieved_query
time_passed_since_last_fetched_from_coolest_worker = time.time() - self.workers[index_of_coolest_worker].time_of_last_retrieved_query
return (index_of_coolest_worker, time_passed_since_last_fetched_from_coolest_worker)


Expand All @@ -185,7 +184,9 @@ def distribute_query(self, query, num_results, num_workers, num_results_per_page
url = worker._update_url_number_of_results_per_page(worker._update_url_start(basic_url, start_offset_so_far), num_results_per_page))
parsed_serps.append(parsed_serp)
start_offset_so_far += parsed_serps[-1].num_results
print("Results %s-%s\n%s\n\n" % (start_offset_so_far-parsed_serps[-1].num_results, start_offset_so_far, parsed_serps[-1].results))
now = datetime.datetime.now()
time_str = "%s-%s-%s %s:%s:%s" % (now.year, now.month, now.day, now.hour, now.minute, now.second)
print("Results %s-%s (obtained at %s)\n%s\n\n" % (start_offset_so_far - parsed_serps[-1].num_results, start_offset_so_far, time_str, parsed_serps[-1].results))
self.workers.append(worker) ## Can be extended to use multithreading or multiprocessing.

num_completed = start_offset_so_far
Expand All @@ -194,16 +195,17 @@ def distribute_query(self, query, num_results, num_workers, num_results_per_page
index_of_coolest_worker, time_passed_since_last_fetched_from_coolest_worker = self._get_index_of_coolest_worker()
if time_passed_since_last_fetched_from_coolest_worker < cooldown_time:
sleep_for = cooldown_time - time_passed_since_last_fetched_from_coolest_worker ## Sleep for the remaining time.
now = datetime.datetime.now()
wakeup_datetime = now + datetime.timedelta(seconds=sleep_for) ## Source: http://stackoverflow.com/a/3240493/4900327
wakeup_datetime = datetime.datetime.now() + datetime.timedelta(seconds=sleep_for) ## Source: http://stackoverflow.com/a/3240493/4900327
time_str = "%s-%s-%s %s:%s:%s"%(wakeup_datetime.year, wakeup_datetime.month, wakeup_datetime.day, wakeup_datetime.hour, wakeup_datetime.minute, wakeup_datetime.second)
print("<-----All workers need to cooldown, sleeping till: %s----->" % time_str)
time.sleep(sleep_for)
parsed_serps.append(
self.workers[index_of_coolest_worker].get_SERP_results(
basic_url, num_completed, num_results_per_page, save_to_db))
num_completed += parsed_serps[-1].num_results
print("Results %s-%s\n%s\n\n" % (num_completed-parsed_serps[-1].num_results, num_completed, parsed_serps[-1].results))
now = datetime.datetime.now()
time_str = "%s-%s-%s %s:%s:%s" % (now.year, now.month, now.day, now.hour, now.minute, now.second)
print("Results %s-%s (obtained at %s)\n%s\n\n" % (num_completed-parsed_serps[-1].num_results, num_completed, time_str, parsed_serps[-1].results))

return parsed_serps

Expand Down
22 changes: 18 additions & 4 deletions SearchDistribute/SERPParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
import urllib

class GoogleParser:
''' If any of the attributes are `None`, that attribute was not present on the SERP.
'''
current_url = ""
''' If any of the attributes are `None`, that attribute was not present on the SERP.'''
start_offset = -1
num_results = -1
current_url = ""
domain = ""
protocol = ""
results = []
num_results = -1
total_num_results_for_query = -1
query_retrieval_time_in_seconds = -1.0
link_to_previous_page = ""
Expand Down Expand Up @@ -153,6 +152,21 @@ def _parse_location(self, bs):
return None


def __str__(self):
out = ""
current_url = ""
start_offset = -1
domain = ""
protocol = ""
results = []
num_results = -1
total_num_results_for_query = -1
query_retrieval_time_in_seconds = -1.0
link_to_previous_page = ""
links_to_previous_pages = []
links_to_next_pages = []
link_to_next_page = ""
location = ""



Expand Down
4 changes: 2 additions & 2 deletions tests/DistributeTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"country" : "IND",
"query" : "site:washingtonpost.com husky",
"num_workers" : 10,
"num_results" : 100,
"num_results" : 300,
"num_results_per_page" : 10,
"cooldown_time" : 300,
"proxy_browser_config" : {
Expand All @@ -28,4 +28,4 @@
}

d = Distribute(config)
d.start()
parsed_serps = d.start()

0 comments on commit de5297b

Please sign in to comment.