From b865d98a595f1e1aa023050387ca0c75f31bcc80 Mon Sep 17 00:00:00 2001 From: mariam Date: Tue, 16 May 2017 09:23:52 +0100 Subject: [PATCH 1/6] Add health scraper config --- healthtools/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/healthtools/config.py b/healthtools/config.py index 8765137..456cc4b 100644 --- a/healthtools/config.py +++ b/healthtools/config.py @@ -5,6 +5,7 @@ "DOCTORS": "http://medicalboard.co.ke/online-services/retention/?currpage={}", "FOREIGN_DOCTORS": "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage={}", "CLINICAL_OFFICERS": "http://clinicalofficerscouncil.org/online-services/retention/?currpage={}", + "TOKEN_URL" : "http://api.kmhfl.health.go.ke/o/token/" } AWS = { @@ -15,6 +16,9 @@ "cloudsearch_doctors_endpoint": "http://doc-cfa-healthtools-ke-doctors-m34xee6byjmzcgzmovevkjpffy.eu-west-1.cloudsearch.amazonaws.com/", # Clinical document endpoint "cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/", + # Health facilities endpoint + "cloudsearch_health_faciities_endpoint":" https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com", + } TEST_DIR = os.getcwd() + "/healthtools/tests" From 64fa9915ae8c105f8af2428aa55b56c4bcbbbccd Mon Sep 17 00:00:00 2001 From: mariam Date: Tue, 16 May 2017 09:24:19 +0100 Subject: [PATCH 2/6] Add health facilities scraper --- .../scrapers/health_facilities_scraper.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 healthtools/scrapers/health_facilities_scraper.py diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py new file mode 100644 index 0000000..fdf0405 --- /dev/null +++ b/healthtools/scrapers/health_facilities_scraper.py @@ -0,0 +1,100 @@ +from healthtools.scrapers.base_scraper import Scraper +from healthtools.config import SITES, AWS +from datetime import datetime +import requests +import json +import boto3 + +TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/' +SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \ + 'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,' \ + 'owner_type,operation_status_name,county,constituency,constituency_name,ward_name,average_rating,' \ + 'facility_services,is_approved,has_edits,latest_update,regulatory_body_name,owner,date_requested,' \ + 'date_approved,latest_approval_or_rejection,sub_county_name,sub_county_id,county_name,constituency_id,' \ + 'county_id,keph_level_name,facility_contacts,coordinates,lat_long,latest_approval,county_code,constituency_code' \ + ',ward_code,service_catalogue_active,facility_units,officer_in_charge,created,updated,deleted,active,search,' \ + 'name,official_name,code,registration_number,abbreviation,description,number_of_beds,number_of_cots,' \ + 'open_whole_day,open_public_holidays,open_normal_day,open_weekends,open_late_night,is_classified,' \ + 'is_published,regulated,approved,rejected,bank_name,branch_name,bank_account,facility_catchment_population,' \ + 'town_name,nearest_landmark,plot_number,location_desc,closed,closed_date,closing_reason,date_established,' \ + 'license_number,created_by,updated_by,facility_type,operation_status,ward,parent,regulatory_body,' \ + 'keph_level,sub_county,town,regulation_status,contacts&format=json' + + +class HealthFacilitiesScraper(Scraper): + def __init__(self): + self.access_token = None + self.fields = [ + "name", "facility_type_name", "approved", "sub_county_name", + "service_names", "county_name", "open_public_holidays", + "keph_level_name", "open_whole_day", "owner_name", + "constituency_name", "regulatory_body_name", "operation_status_name", "open_late_night", "open_weekends", "ward_name" + ] + self.cloudsearch = boto3.client( + "cloudsearchdomain", **{ + "aws_access_key_id": AWS["aws_access_key_id"], + "aws_secret_access_key": AWS["aws_secret_access_key"], + "region_name": AWS["region_name"], + "endpoint_url": AWS["cloudsearch_doctors_endpoint"] + }) + + def get_token(self): + headers = {'Content-Type': 'application/x-www-form-urlencoded'} + data = { + 'username': 'public@mfltest.slade360.co.ke', + 'password': 'public', + 'grant_type': 'password', + 'client_id': 'xMddOofHI0jOKboVxdoKAXWKpkEQAP0TuloGpfj5', + 'client_secret': 'PHrUzCRFm9558DGa6Fh1hEvSCh3C9Lijfq8sbCMZhZqmANYV5ZP04mUXGJdsrZLXuZG4VCmvjShdKHwU6IRmPQld5LDzvJoguEP8AAXGJhrqfLnmtFXU3x2FO1nWLxUx' + } + r = requests.post(TOKEN_URL, data=data, headers=headers) + self.access_token = json.loads(r.text)['access_token'] + print self.access_token + + def get_data(self): + try: + headers = {'Authorization': 'Bearer ' + self.access_token} + r = requests.get(SEARCH_URL, headers=headers) + data = r.json() + print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason) + payload = '' + for i, record in enumerate(data['results']): + payload += self.index_for_cloudsearch(record) + ',' + #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch + if i % 100 == 0 or i == (len(data['results']) - 1): + payload = '[%s]' % payload[:-1] #remove last comma + print i + self.push_to_cloud_search(payload) + payload = '' + except Exception, err: + print "ERROR - index_for_search() - %s" % (err) + + def index_for_cloudsearch(self, record): + return index_template.health_facilities_template % ( + record['code'], + record['name'].replace("\"","'"), + record['facility_type_name'], + record['service_names'], + record['sub_county_name'], + record['service_names'], + record['county_name'], + record['open_public_holidays'], + record['keph_level_name'], + record['open_whole_day'], + record['owner_name'], + record['constituency_name'], + record['regulatory_body_name'], + record['operation_status_name'], + record['open_late_night'], + record['open_weekends'], + record['ward_name'].decode("string_escape").replace('\\',''), + ) + + def push_to_cloud_search(self, payload): + try: + response = self.cloudsearch.upload_documents( + documents=payload, contentType="application/json" + ) + print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status")) + except Exception, err: + print "ERROR - index_for_search() - %s - %s" % (len(payload), err) From dcd804fc15d63ad65a9dfa202c52e7367cff8c6a Mon Sep 17 00:00:00 2001 From: mariam Date: Tue, 16 May 2017 09:25:02 +0100 Subject: [PATCH 3/6] Add health scraper to scraper.py --- scraper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scraper.py b/scraper.py index 66d129a..86809a8 100644 --- a/scraper.py +++ b/scraper.py @@ -1,12 +1,14 @@ from healthtools.scrapers.doctors import DoctorsScraper from healthtools.scrapers.foreign_doctors import ForeignDoctorsScraper from healthtools.scrapers.clinical_officers import ClinicalOfficersScraper - +from healthtools.scrapers.health_facilities_scraper import HealthFacilitiesScraper if __name__ == "__main__": + healthfacilities_scraper = HealthFacilitiesScraper() doctors_scraper = DoctorsScraper() foreign_doctors_scraper = ForeignDoctorsScraper() - clinical_officers_scraper= ClinicalOfficersScraper() - + clinical_officers_scraper = ClinicalOfficersScraper() + healthfacilities_scraper.get_token() + healthfacilities_scraper.get_data() # scraping you softly with these bots... doctors_result = doctors_scraper.scrape_site() if doctors_result: From b10741f9ed63e1981bdfbec4fb9ddcf3f6557b17 Mon Sep 17 00:00:00 2001 From: mariam Date: Tue, 16 May 2017 09:43:49 +0100 Subject: [PATCH 4/6] Add scraper wiki config --- healthtools/scrapers/health_facilities_scraper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py index fdf0405..0b57064 100644 --- a/healthtools/scrapers/health_facilities_scraper.py +++ b/healthtools/scrapers/health_facilities_scraper.py @@ -4,6 +4,8 @@ import requests import json import boto3 +import scraperwiki +scraperwiki.config = { db: 'data.sqlite', default_table_name: 'data' } TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/' SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \ @@ -59,6 +61,7 @@ def get_data(self): print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason) payload = '' for i, record in enumerate(data['results']): + scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"}) payload += self.index_for_cloudsearch(record) + ',' #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch if i % 100 == 0 or i == (len(data['results']) - 1): @@ -98,3 +101,4 @@ def push_to_cloud_search(self, payload): print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status")) except Exception, err: print "ERROR - index_for_search() - %s - %s" % (len(payload), err) + From 5250b145c0842bd7750989b4328fda62d24cf588 Mon Sep 17 00:00:00 2001 From: mariam Date: Wed, 17 May 2017 11:33:03 +0100 Subject: [PATCH 5/6] refactor fields --- .../scrapers/health_facilities_scraper.py | 58 +++++++++++-------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py index 0b57064..92d1c64 100644 --- a/healthtools/scrapers/health_facilities_scraper.py +++ b/healthtools/scrapers/health_facilities_scraper.py @@ -1,12 +1,30 @@ -from healthtools.scrapers.base_scraper import Scraper -from healthtools.config import SITES, AWS -from datetime import datetime -import requests import json +from healthtools.config import AWS +import requests import boto3 -import scraperwiki -scraperwiki.config = { db: 'data.sqlite', default_table_name: 'data' } +health_facilities_template = """ + {"type": "add", + "id": "%s", + "fields": { + "name": "%s", + "facility_type_name": "%s", + "approved": "%s", + "sub_county_name": "%s", + "service_names": "%s", + "county_name": "%s", + "open_public_holidays": "%s", + "keph_level_name": "%s", + "open_whole_day": "%s", + "owner_name": "%s", + "constituency_name": "%s", + "regulatory_body_name": "%s", + "operation_status_name": "%s", + "open_late_night": "%s", + "open_weekends": "%s", + "ward_name": "%s" + } + }""" TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/' SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \ 'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,' \ @@ -23,15 +41,9 @@ 'keph_level,sub_county,town,regulation_status,contacts&format=json' -class HealthFacilitiesScraper(Scraper): +class HealthFacilitiesScraper(object): def __init__(self): self.access_token = None - self.fields = [ - "name", "facility_type_name", "approved", "sub_county_name", - "service_names", "county_name", "open_public_holidays", - "keph_level_name", "open_whole_day", "owner_name", - "constituency_name", "regulatory_body_name", "operation_status_name", "open_late_night", "open_weekends", "ward_name" - ] self.cloudsearch = boto3.client( "cloudsearchdomain", **{ "aws_access_key_id": AWS["aws_access_key_id"], @@ -51,7 +63,6 @@ def get_token(self): } r = requests.post(TOKEN_URL, data=data, headers=headers) self.access_token = json.loads(r.text)['access_token'] - print self.access_token def get_data(self): try: @@ -61,23 +72,22 @@ def get_data(self): print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason) payload = '' for i, record in enumerate(data['results']): - scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"}) payload += self.index_for_cloudsearch(record) + ',' #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch if i % 100 == 0 or i == (len(data['results']) - 1): payload = '[%s]' % payload[:-1] #remove last comma - print i + # print i self.push_to_cloud_search(payload) payload = '' except Exception, err: - print "ERROR - index_for_search() - %s" % (err) + print "ERROR IN - index_for_search() - %s" % (err) def index_for_cloudsearch(self, record): - return index_template.health_facilities_template % ( + return health_facilities_template % ( record['code'], record['name'].replace("\"","'"), record['facility_type_name'], - record['service_names'], + record['approved'], record['sub_county_name'], record['service_names'], record['county_name'], @@ -95,10 +105,10 @@ def index_for_cloudsearch(self, record): def push_to_cloud_search(self, payload): try: - response = self.cloudsearch.upload_documents( - documents=payload, contentType="application/json" - ) - print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status")) + response = self.cloudsearch.upload_documents( + documents=payload, contentType="application/json" + ) + print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status")) except Exception, err: - print "ERROR - index_for_search() - %s - %s" % (len(payload), err) + print "ERROR - index_for_search() - %s - %s" % (len(payload), err) From 4c15ea8beeea46e7ecc59a54418e77eae928c13d Mon Sep 17 00:00:00 2001 From: mariam Date: Wed, 17 May 2017 12:11:05 +0100 Subject: [PATCH 6/6] Add fix for endpoint-url --- healthtools/config.py | 2 +- healthtools/scrapers/health_facilities_scraper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/healthtools/config.py b/healthtools/config.py index 456cc4b..ca87fe7 100644 --- a/healthtools/config.py +++ b/healthtools/config.py @@ -17,7 +17,7 @@ # Clinical document endpoint "cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/", # Health facilities endpoint - "cloudsearch_health_faciities_endpoint":" https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com", + "cloudsearch_health_faciities_endpoint":"https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com", } diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py index 92d1c64..431df87 100644 --- a/healthtools/scrapers/health_facilities_scraper.py +++ b/healthtools/scrapers/health_facilities_scraper.py @@ -49,7 +49,7 @@ def __init__(self): "aws_access_key_id": AWS["aws_access_key_id"], "aws_secret_access_key": AWS["aws_secret_access_key"], "region_name": AWS["region_name"], - "endpoint_url": AWS["cloudsearch_doctors_endpoint"] + "endpoint_url": AWS["cloudsearch_health_faciities_endpoint"] }) def get_token(self):