From b865d98a595f1e1aa023050387ca0c75f31bcc80 Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Tue, 16 May 2017 09:23:52 +0100
Subject: [PATCH 1/6] Add health facilities scraper config

---
 healthtools/config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/healthtools/config.py b/healthtools/config.py
index 8765137..456cc4b 100644
--- a/healthtools/config.py
+++ b/healthtools/config.py
@@ -5,6 +5,7 @@
     "DOCTORS": "http://medicalboard.co.ke/online-services/retention/?currpage={}",
     "FOREIGN_DOCTORS": "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage={}",
     "CLINICAL_OFFICERS": "http://clinicalofficerscouncil.org/online-services/retention/?currpage={}",
+    "TOKEN_URL" : "http://api.kmhfl.health.go.ke/o/token/"
 }
 
 AWS = {
@@ -15,6 +16,9 @@
     "cloudsearch_doctors_endpoint": "http://doc-cfa-healthtools-ke-doctors-m34xee6byjmzcgzmovevkjpffy.eu-west-1.cloudsearch.amazonaws.com/",
     # Clinical document endpoint
     "cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/",
+    # Health facilities endpoint
+    "cloudsearch_health_faciities_endpoint":" https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com",
+
 }
 
 TEST_DIR = os.getcwd() + "/healthtools/tests"

From 64fa9915ae8c105f8af2428aa55b56c4bcbbbccd Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Tue, 16 May 2017 09:24:19 +0100
Subject: [PATCH 2/6] Add health facilities scraper

---
 .../scrapers/health_facilities_scraper.py     | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 healthtools/scrapers/health_facilities_scraper.py

diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py
new file mode 100644
index 0000000..fdf0405
--- /dev/null
+++ b/healthtools/scrapers/health_facilities_scraper.py
@@ -0,0 +1,100 @@
+from healthtools.scrapers.base_scraper import Scraper
+from healthtools.config import SITES, AWS
+from datetime import datetime
+import requests
+import json
+import boto3
+
+TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/'
+SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \
+             'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,' \
+             'owner_type,operation_status_name,county,constituency,constituency_name,ward_name,average_rating,' \
+             'facility_services,is_approved,has_edits,latest_update,regulatory_body_name,owner,date_requested,' \
+             'date_approved,latest_approval_or_rejection,sub_county_name,sub_county_id,county_name,constituency_id,' \
+             'county_id,keph_level_name,facility_contacts,coordinates,lat_long,latest_approval,county_code,constituency_code' \
+             ',ward_code,service_catalogue_active,facility_units,officer_in_charge,created,updated,deleted,active,search,' \
+             'name,official_name,code,registration_number,abbreviation,description,number_of_beds,number_of_cots,' \
+             'open_whole_day,open_public_holidays,open_normal_day,open_weekends,open_late_night,is_classified,' \
+             'is_published,regulated,approved,rejected,bank_name,branch_name,bank_account,facility_catchment_population,' \
+             'town_name,nearest_landmark,plot_number,location_desc,closed,closed_date,closing_reason,date_established,' \
+             'license_number,created_by,updated_by,facility_type,operation_status,ward,parent,regulatory_body,' \
+             'keph_level,sub_county,town,regulation_status,contacts&format=json'
+
+
+class HealthFacilitiesScraper(Scraper):
+    def __init__(self):
+        self.access_token = None
+        self.fields = [
+            "name", "facility_type_name", "approved", "sub_county_name",
+            "service_names", "county_name", "open_public_holidays",
+            "keph_level_name", "open_whole_day", "owner_name",
+            "constituency_name", "regulatory_body_name", "operation_status_name", "open_late_night", "open_weekends", "ward_name"
+        ]
+        self.cloudsearch = boto3.client(
+            "cloudsearchdomain", **{
+                "aws_access_key_id": AWS["aws_access_key_id"],
+                "aws_secret_access_key": AWS["aws_secret_access_key"],
+                "region_name": AWS["region_name"],
+                "endpoint_url": AWS["cloudsearch_doctors_endpoint"]
+            })
+
+    def get_token(self):
+        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+        data = {
+            'username': 'public@mfltest.slade360.co.ke',
+            'password': 'public',
+            'grant_type': 'password',
+            'client_id': 'xMddOofHI0jOKboVxdoKAXWKpkEQAP0TuloGpfj5',
+            'client_secret': 'PHrUzCRFm9558DGa6Fh1hEvSCh3C9Lijfq8sbCMZhZqmANYV5ZP04mUXGJdsrZLXuZG4VCmvjShdKHwU6IRmPQld5LDzvJoguEP8AAXGJhrqfLnmtFXU3x2FO1nWLxUx'
+        }
+        r = requests.post(TOKEN_URL, data=data, headers=headers)
+        self.access_token = json.loads(r.text)['access_token']
+        print self.access_token
+
+    def get_data(self):
+        try:
+            headers = {'Authorization': 'Bearer ' + self.access_token}
+            r = requests.get(SEARCH_URL, headers=headers)
+            data = r.json()
+            print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason)
+            payload = ''
+            for i, record in enumerate(data['results']):
+                payload += self.index_for_cloudsearch(record) + ','
+                #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch
+                if i % 100 == 0 or i == (len(data['results']) - 1):
+                    payload = '[%s]' % payload[:-1] #remove last comma
+                    print i
+                    self.push_to_cloud_search(payload)
+                    payload = ''
+        except Exception, err:
+            print "ERROR - index_for_search() - %s" % (err)
+
+    def index_for_cloudsearch(self, record):
+        return index_template.health_facilities_template  % (
+            record['code'],
+            record['name'].replace("\"","'"),
+            record['facility_type_name'],
+            record['service_names'],
+            record['sub_county_name'],
+            record['service_names'],
+            record['county_name'],
+            record['open_public_holidays'],
+            record['keph_level_name'],
+            record['open_whole_day'],
+            record['owner_name'],
+            record['constituency_name'],
+            record['regulatory_body_name'],
+            record['operation_status_name'],
+            record['open_late_night'],
+            record['open_weekends'],
+            record['ward_name'].decode("string_escape").replace('\\',''),
+      )
+
+    def push_to_cloud_search(self, payload):
+        try:
+          response = self.cloudsearch.upload_documents(
+              documents=payload, contentType="application/json"
+          )
+        print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status"))
+        except Exception, err:
+          print "ERROR - index_for_search() - %s - %s" % (len(payload), err)

From dcd804fc15d63ad65a9dfa202c52e7367cff8c6a Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Tue, 16 May 2017 09:25:02 +0100
Subject: [PATCH 3/6] Add health facilities scraper to scraper.py

---
 scraper.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scraper.py b/scraper.py
index 66d129a..86809a8 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,12 +1,14 @@
 from healthtools.scrapers.doctors import DoctorsScraper
 from healthtools.scrapers.foreign_doctors import ForeignDoctorsScraper
 from healthtools.scrapers.clinical_officers import ClinicalOfficersScraper
-
+from healthtools.scrapers.health_facilities_scraper import HealthFacilitiesScraper
 if __name__ == "__main__":
+    healthfacilities_scraper = HealthFacilitiesScraper()
     doctors_scraper = DoctorsScraper()
     foreign_doctors_scraper = ForeignDoctorsScraper()
-    clinical_officers_scraper= ClinicalOfficersScraper()
-
+    clinical_officers_scraper = ClinicalOfficersScraper()
+    healthfacilities_scraper.get_token()
+    healthfacilities_scraper.get_data()
     # scraping you softly with these bots...
     doctors_result = doctors_scraper.scrape_site()
     if doctors_result:

From b10741f9ed63e1981bdfbec4fb9ddcf3f6557b17 Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Tue, 16 May 2017 09:43:49 +0100
Subject: [PATCH 4/6] Add scraperwiki config

---
 healthtools/scrapers/health_facilities_scraper.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py
index fdf0405..0b57064 100644
--- a/healthtools/scrapers/health_facilities_scraper.py
+++ b/healthtools/scrapers/health_facilities_scraper.py
@@ -4,6 +4,8 @@
 import requests
 import json
 import boto3
+import scraperwiki
+scraperwiki.config = { db: 'data.sqlite', default_table_name: 'data' }
 
 TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/'
 SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \
@@ -59,6 +61,7 @@ def get_data(self):
             print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason)
             payload = ''
             for i, record in enumerate(data['results']):
+                scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
                 payload += self.index_for_cloudsearch(record) + ','
                 #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch
                 if i % 100 == 0 or i == (len(data['results']) - 1):
@@ -98,3 +101,4 @@ def push_to_cloud_search(self, payload):
         print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status"))
         except Exception, err:
           print "ERROR - index_for_search() - %s - %s" % (len(payload), err)
+

From 5250b145c0842bd7750989b4328fda62d24cf588 Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Wed, 17 May 2017 11:33:03 +0100
Subject: [PATCH 5/6] Refactor health facilities scraper fields

---
 .../scrapers/health_facilities_scraper.py     | 58 +++++++++++--------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py
index 0b57064..92d1c64 100644
--- a/healthtools/scrapers/health_facilities_scraper.py
+++ b/healthtools/scrapers/health_facilities_scraper.py
@@ -1,12 +1,30 @@
-from healthtools.scrapers.base_scraper import Scraper
-from healthtools.config import SITES, AWS
-from datetime import datetime
-import requests
 import json
+from healthtools.config import AWS
+import requests
 import boto3
-import scraperwiki
-scraperwiki.config = { db: 'data.sqlite', default_table_name: 'data' }
 
+health_facilities_template = """
+    {"type": "add",
+     "id":   "%s",
+     "fields": {
+              "name": "%s",
+              "facility_type_name": "%s",
+              "approved": "%s",
+              "sub_county_name": "%s",
+              "service_names": "%s",
+              "county_name": "%s",
+              "open_public_holidays": "%s",
+              "keph_level_name": "%s",
+              "open_whole_day": "%s",
+              "owner_name": "%s",
+              "constituency_name": "%s",
+              "regulatory_body_name": "%s",
+              "operation_status_name": "%s",
+              "open_late_night": "%s",
+              "open_weekends": "%s",
+              "ward_name": "%s"
+            }
+     }"""
 TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/'
 SEARCH_URL = 'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&' \
              'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,' \
@@ -23,15 +41,9 @@
              'keph_level,sub_county,town,regulation_status,contacts&format=json'
 
 
-class HealthFacilitiesScraper(Scraper):
+class HealthFacilitiesScraper(object):
     def __init__(self):
         self.access_token = None
-        self.fields = [
-            "name", "facility_type_name", "approved", "sub_county_name",
-            "service_names", "county_name", "open_public_holidays",
-            "keph_level_name", "open_whole_day", "owner_name",
-            "constituency_name", "regulatory_body_name", "operation_status_name", "open_late_night", "open_weekends", "ward_name"
-        ]
         self.cloudsearch = boto3.client(
             "cloudsearchdomain", **{
                 "aws_access_key_id": AWS["aws_access_key_id"],
@@ -51,7 +63,6 @@ def get_token(self):
         }
         r = requests.post(TOKEN_URL, data=data, headers=headers)
         self.access_token = json.loads(r.text)['access_token']
-        print self.access_token
 
     def get_data(self):
         try:
@@ -61,23 +72,22 @@ def get_data(self):
             print "DEBUG - get_data() - %s - %s" % (len(data['results']), r.reason)
             payload = ''
             for i, record in enumerate(data['results']):
-                scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
                 payload += self.index_for_cloudsearch(record) + ','
                 #Every 100th entry push to cloudsearch or if we have reached the end push to cloudsearch
                 if i % 100 == 0 or i == (len(data['results']) - 1):
                     payload = '[%s]' % payload[:-1] #remove last comma
-                    print i
+                    # print i
                     self.push_to_cloud_search(payload)
                     payload = ''
         except Exception, err:
-            print "ERROR - index_for_search() - %s" % (err)
+            print "ERROR IN - index_for_search() - %s" % (err)
 
     def index_for_cloudsearch(self, record):
-        return index_template.health_facilities_template  % (
+        return health_facilities_template  % (
             record['code'],
             record['name'].replace("\"","'"),
             record['facility_type_name'],
-            record['service_names'],
+            record['approved'],
             record['sub_county_name'],
             record['service_names'],
             record['county_name'],
@@ -95,10 +105,10 @@ def index_for_cloudsearch(self, record):
 
     def push_to_cloud_search(self, payload):
         try:
-          response = self.cloudsearch.upload_documents(
-              documents=payload, contentType="application/json"
-          )
-        print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status"))
+            response = self.cloudsearch.upload_documents(
+                documents=payload, contentType="application/json"
+            )
+            print "DEBUG - index_for_search() - %s - %s" % (len(payload), response.get("status"))
         except Exception, err:
-          print "ERROR - index_for_search() - %s - %s" % (len(payload), err)
+            print "ERROR - index_for_search() - %s - %s" % (len(payload), err)
 

From 4c15ea8beeea46e7ecc59a54418e77eae928c13d Mon Sep 17 00:00:00 2001
From: mariam <mariam.abdussalam@andela.com>
Date: Wed, 17 May 2017 12:11:05 +0100
Subject: [PATCH 6/6] Fix health facilities endpoint URL

---
 healthtools/config.py                             | 2 +-
 healthtools/scrapers/health_facilities_scraper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/healthtools/config.py b/healthtools/config.py
index 456cc4b..ca87fe7 100644
--- a/healthtools/config.py
+++ b/healthtools/config.py
@@ -17,7 +17,7 @@
     # Clinical document endpoint
     "cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/",
     # Health facilities endpoint
-    "cloudsearch_health_faciities_endpoint":" https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com",
+    "cloudsearch_health_faciities_endpoint":"https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com",
 
 }
 
diff --git a/healthtools/scrapers/health_facilities_scraper.py b/healthtools/scrapers/health_facilities_scraper.py
index 92d1c64..431df87 100644
--- a/healthtools/scrapers/health_facilities_scraper.py
+++ b/healthtools/scrapers/health_facilities_scraper.py
@@ -49,7 +49,7 @@ def __init__(self):
                 "aws_access_key_id": AWS["aws_access_key_id"],
                 "aws_secret_access_key": AWS["aws_secret_access_key"],
                 "region_name": AWS["region_name"],
-                "endpoint_url": AWS["cloudsearch_doctors_endpoint"]
+                "endpoint_url": AWS["cloudsearch_health_faciities_endpoint"]
             })
 
     def get_token(self):