Merge pull request #4 from andela-mabdussalam/master

Health Facilities Scraper
CodeForAfrica-SCRAPERS · May 17, 2017 · 57390ca · 57390ca
2 parents 8997cba + 4c15ea8
commit 57390ca
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 3 deletions.
diff --git a/healthtools/config.py b/healthtools/config.py
@@ -5,6 +5,7 @@
     "DOCTORS": "http://medicalboard.co.ke/online-services/retention/?currpage={}",
     "FOREIGN_DOCTORS": "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage={}",
     "CLINICAL_OFFICERS": "http://clinicalofficerscouncil.org/online-services/retention/?currpage={}",
+    "TOKEN_URL" : "http://api.kmhfl.health.go.ke/o/token/"
 }
 
 AWS = {
@@ -15,6 +16,9 @@
     "cloudsearch_doctors_endpoint": "http://doc-cfa-healthtools-ke-doctors-m34xee6byjmzcgzmovevkjpffy.eu-west-1.cloudsearch.amazonaws.com/",
     # Clinical document endpoint
     "cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/",
+    # Health facilities endpoint
+    "cloudsearch_health_faciities_endpoint":"https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com",
+
 }
 
 TEST_DIR = os.getcwd() + "/healthtools/tests"
diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py
@@ -0,0 +1,114 @@
+import json
+from healthtools.config import AWS
+import requests
+import boto3
+
+# %-format template for a single CloudSearch "add" operation.  The placeholders
+# are positional: the document id comes first, followed by the fields in the
+# order they are listed below (17 values in total).
+# Each value is substituted as raw text between double quotes, so a value that
+# itself contains a double quote or a backslash yields invalid JSON unless the
+# caller sanitises it beforehand.
+health_facilities_template = """
+    {"type": "add",
+     "id":   "%s",
+     "fields": {
+              "name": "%s",
+              "facility_type_name": "%s",
+              "approved": "%s",
+              "sub_county_name": "%s",
+              "service_names": "%s",
+              "county_name": "%s",
+              "open_public_holidays": "%s",
+              "keph_level_name": "%s",
+              "open_whole_day": "%s",
+              "owner_name": "%s",
+              "constituency_name": "%s",
+              "regulatory_body_name": "%s",
+              "operation_status_name": "%s",
+              "open_late_night": "%s",
+              "open_weekends": "%s",
+              "ward_name": "%s"
+            }
+     }"""
+# Endpoint that get_token() POSTs the credentials to in order to obtain an
+# access token.
+TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/'
+
+# Facilities listing request: page_size=10000, an explicit comma-separated
+# field list, and format=json.  The adjacent literals below are joined by the
+# parser into one string; none of the fragments contains whitespace.
+SEARCH_URL = (
+    'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&'
+    'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,'
+    'owner_type,operation_status_name,county,constituency,constituency_name,ward_name,average_rating,'
+    'facility_services,is_approved,has_edits,latest_update,regulatory_body_name,owner,date_requested,'
+    'date_approved,latest_approval_or_rejection,sub_county_name,sub_county_id,county_name,constituency_id,'
+    'county_id,keph_level_name,facility_contacts,coordinates,lat_long,latest_approval,county_code,constituency_code,'
+    'ward_code,service_catalogue_active,facility_units,officer_in_charge,created,updated,deleted,active,search,'
+    'name,official_name,code,registration_number,abbreviation,description,number_of_beds,number_of_cots,'
+    'open_whole_day,open_public_holidays,open_normal_day,open_weekends,open_late_night,is_classified,'
+    'is_published,regulated,approved,rejected,bank_name,branch_name,bank_account,facility_catchment_population,'
+    'town_name,nearest_landmark,plot_number,location_desc,closed,closed_date,closing_reason,date_established,'
+    'license_number,created_by,updated_by,facility_type,operation_status,ward,parent,regulatory_body,'
+    'keph_level,sub_county,town,regulation_status,contacts&format=json'
+)
+
+
+class HealthFacilitiesScraper(object):
+    """Fetch facility records from the KMHFL API and index them in CloudSearch.
+
+    Typical use is ``get_token()`` followed by ``get_data()``; ``get_data()``
+    also obtains a token by itself when none has been fetched yet.
+
+    The code uses only syntax that is valid on both Python 2 and Python 3.
+    """
+
+    # Number of documents sent to CloudSearch per upload request.
+    BATCH_SIZE = 100
+
+    # Seconds to wait on the KMHFL API before giving up, so that a stalled
+    # connection cannot hang the scraper indefinitely.
+    REQUEST_TIMEOUT = 120
+
+    # Record keys copied into the document's "fields".  Every CloudSearch
+    # field is named after the record key it is read from.
+    FIELD_NAMES = (
+        'name',
+        'facility_type_name',
+        'approved',
+        'sub_county_name',
+        'service_names',
+        'county_name',
+        'open_public_holidays',
+        'keph_level_name',
+        'open_whole_day',
+        'owner_name',
+        'constituency_name',
+        'regulatory_body_name',
+        'operation_status_name',
+        'open_late_night',
+        'open_weekends',
+        'ward_name',
+    )
+
+    def __init__(self):
+        """Create the CloudSearch client; no access token is held yet."""
+        self.access_token = None
+        self.cloudsearch = boto3.client(
+            "cloudsearchdomain",
+            aws_access_key_id=AWS["aws_access_key_id"],
+            aws_secret_access_key=AWS["aws_secret_access_key"],
+            region_name=AWS["region_name"],
+            # The spelling "faciities" must match the key defined in the
+            # AWS settings imported from healthtools.config.
+            endpoint_url=AWS["cloudsearch_health_faciities_endpoint"],
+        )
+
+    def get_token(self):
+        """Request an access token and store it on ``self.access_token``.
+
+        Raises:
+            requests.RequestException: on connection failure, timeout, or an
+                HTTP error status returned by the token endpoint.
+            ValueError: if the response body is not valid JSON.
+            KeyError: if the response JSON has no ``access_token`` entry.
+        """
+        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+        # NOTE(review): these credentials are hard-coded.  They look like a
+        # public test account, but they should probably live in the project
+        # configuration rather than in source -- confirm.
+        data = {
+            'username': 'public@mfltest.slade360.co.ke',
+            'password': 'public',
+            'grant_type': 'password',
+            'client_id': 'xMddOofHI0jOKboVxdoKAXWKpkEQAP0TuloGpfj5',
+            'client_secret': 'PHrUzCRFm9558DGa6Fh1hEvSCh3C9Lijfq8sbCMZhZqmANYV5ZP04mUXGJdsrZLXuZG4VCmvjShdKHwU6IRmPQld5LDzvJoguEP8AAXGJhrqfLnmtFXU3x2FO1nWLxUx'
+        }
+        r = requests.post(
+            TOKEN_URL, data=data, headers=headers,
+            timeout=self.REQUEST_TIMEOUT)
+        # Surface a rejected request as an HTTP error instead of a confusing
+        # KeyError on the missing 'access_token' entry.
+        r.raise_for_status()
+        self.access_token = r.json()['access_token']
+
+    def get_data(self):
+        """Download the facility list and upload it to CloudSearch in batches.
+
+        Documents are uploaded ``BATCH_SIZE`` at a time, with a final smaller
+        batch for any remainder.  Any failure while fetching or converting
+        the records is logged and ends the run; nothing is raised to the
+        caller.
+        """
+        try:
+            # Allow get_data() to be called on a fresh instance.
+            if self.access_token is None:
+                self.get_token()
+            headers = {'Authorization': 'Bearer ' + self.access_token}
+            # NOTE(review): only this single page is requested; if the API
+            # holds more rows than page_size, the rest are not fetched.
+            r = requests.get(
+                SEARCH_URL, headers=headers, timeout=self.REQUEST_TIMEOUT)
+            results = r.json()['results']
+            print("DEBUG - get_data() - %s - %s" % (len(results), r.reason))
+            batch = []
+            for record in results:
+                batch.append(self.index_for_cloudsearch(record))
+                if len(batch) >= self.BATCH_SIZE:
+                    self.push_to_cloud_search('[%s]' % ','.join(batch))
+                    batch = []
+            # Upload whatever is left over after the last full batch.
+            if batch:
+                self.push_to_cloud_search('[%s]' % ','.join(batch))
+        except Exception as err:
+            print("ERROR - get_data() - %s" % (err,))
+
+    def index_for_cloudsearch(self, record):
+        """Return the CloudSearch "add" operation for ``record`` as JSON text.
+
+        Args:
+            record: mapping with a ``code`` entry (used as the document id)
+                and one entry for every name in ``FIELD_NAMES``.
+
+        Returns:
+            A JSON object string with ``type``, ``id`` and ``fields`` keys.
+
+        Raises:
+            KeyError: if ``record`` lacks ``code`` or one of ``FIELD_NAMES``.
+        """
+        # Every value is rendered with "%s" so that the document carries the
+        # same string representation for booleans, numbers and None as plain
+        # string formatting produces; json.dumps then takes care of escaping
+        # quotes, backslashes and non-ASCII characters.
+        fields = dict(
+            (field, "%s" % (record[field],)) for field in self.FIELD_NAMES)
+        document = {
+            "type": "add",
+            "id": "%s" % (record['code'],),
+            "fields": fields,
+        }
+        return json.dumps(document)
+
+    def push_to_cloud_search(self, payload):
+        """Upload one batch of documents to CloudSearch and log the outcome.
+
+        Args:
+            payload: JSON array of document operations, as a string.
+
+        Upload errors are logged and swallowed, so one failed batch does not
+        prevent later batches from being attempted.
+        """
+        try:
+            response = self.cloudsearch.upload_documents(
+                documents=payload, contentType="application/json"
+            )
+            print("DEBUG - push_to_cloud_search() - %s - %s" % (
+                len(payload), response.get("status")))
+        except Exception as err:
+            print("ERROR - push_to_cloud_search() - %s - %s" % (
+                len(payload), err))
+
diff --git a/scraper.py b/scraper.py
@@ -1,12 +1,14 @@
 from healthtools.scrapers.doctors import DoctorsScraper
 from healthtools.scrapers.foreign_doctors import ForeignDoctorsScraper
 from healthtools.scrapers.clinical_officers import ClinicalOfficersScraper
-
+from healthtools.scrapers.health_facilities_scraper import HealthFacilitiesScraper
 if __name__ == "__main__":
+    healthfacilities_scraper = HealthFacilitiesScraper()
     doctors_scraper = DoctorsScraper()
     foreign_doctors_scraper = ForeignDoctorsScraper()
-    clinical_officers_scraper= ClinicalOfficersScraper()
-
+    clinical_officers_scraper = ClinicalOfficersScraper()
+    healthfacilities_scraper.get_token()
+    healthfacilities_scraper.get_data()
     # scraping you softly with these bots...
     doctors_result = doctors_scraper.scrape_site()
     if doctors_result: