Skip to content

Commit

Permalink
Merge pull request #4 from andela-mabdussalam/master
Browse files Browse the repository at this point in the history
Health Facilities Scraper
  • Loading branch information
DavidLemayian committed May 17, 2017
2 parents 8997cba + 4c15ea8 commit 57390ca
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 3 deletions.
4 changes: 4 additions & 0 deletions healthtools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"DOCTORS": "http://medicalboard.co.ke/online-services/retention/?currpage={}",
"FOREIGN_DOCTORS": "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage={}",
"CLINICAL_OFFICERS": "http://clinicalofficerscouncil.org/online-services/retention/?currpage={}",
"TOKEN_URL" : "http://api.kmhfl.health.go.ke/o/token/"
}

AWS = {
Expand All @@ -15,6 +16,9 @@
"cloudsearch_doctors_endpoint": "http://doc-cfa-healthtools-ke-doctors-m34xee6byjmzcgzmovevkjpffy.eu-west-1.cloudsearch.amazonaws.com/",
# Clinical document endpoint
"cloudsearch_cos_endpoint": "http://doc-cfa-healthtools-ke-cos-nhxtw3w5goufkzram4er7sciz4.eu-west-1.cloudsearch.amazonaws.com/",
# Health facilities endpoint
"cloudsearch_health_faciities_endpoint":"https://doc-health-facilities-ke-65ftd7ksxazyatw5fiv5uyaiqi.eu-west-1.cloudsearch.amazonaws.com",

}

TEST_DIR = os.getcwd() + "/healthtools/tests"
114 changes: 114 additions & 0 deletions healthtools/scrapers/health_facilities_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import json
from healthtools.config import AWS
import requests
import boto3

# %-format template for a single CloudSearch "add" document.
# It has 17 positional %s placeholders: the document id first, then the 16
# entries of "fields" in exactly the order listed below.
# NOTE: values are interpolated verbatim -- nothing here escapes quotes,
# backslashes or newlines, so a value containing any of them produces
# invalid JSON unless the caller sanitises it first.
health_facilities_template = """
{"type": "add",
"id": "%s",
"fields": {
"name": "%s",
"facility_type_name": "%s",
"approved": "%s",
"sub_county_name": "%s",
"service_names": "%s",
"county_name": "%s",
"open_public_holidays": "%s",
"keph_level_name": "%s",
"open_whole_day": "%s",
"owner_name": "%s",
"constituency_name": "%s",
"regulatory_body_name": "%s",
"operation_status_name": "%s",
"open_late_night": "%s",
"open_weekends": "%s",
"ward_name": "%s"
}
}"""
# OAuth2 token endpoint of the KMHFL API.
TOKEN_URL = 'http://api.kmhfl.health.go.ke/o/token/'

# Facilities listing endpoint: one page of up to 10000 records, JSON format,
# restricted to the comma-separated field list below. The pieces are joined
# with no separator, so every comma that belongs in the URL is written
# explicitly inside the chunks.
SEARCH_URL = ''.join((
    'http://api.kmhfl.health.go.ke/api/facilities/material/?page_size=10000&',
    'fields=id,regulatory_status_name,facility_type_name,facility_type_parent,owner_name,owner_type_name,',
    'owner_type,operation_status_name,county,constituency,constituency_name,ward_name,average_rating,',
    'facility_services,is_approved,has_edits,latest_update,regulatory_body_name,owner,date_requested,',
    'date_approved,latest_approval_or_rejection,sub_county_name,sub_county_id,county_name,constituency_id,',
    'county_id,keph_level_name,facility_contacts,coordinates,lat_long,latest_approval,county_code,constituency_code',
    ',ward_code,service_catalogue_active,facility_units,officer_in_charge,created,updated,deleted,active,search,',
    'name,official_name,code,registration_number,abbreviation,description,number_of_beds,number_of_cots,',
    'open_whole_day,open_public_holidays,open_normal_day,open_weekends,open_late_night,is_classified,',
    'is_published,regulated,approved,rejected,bank_name,branch_name,bank_account,facility_catchment_population,',
    'town_name,nearest_landmark,plot_number,location_desc,closed,closed_date,closing_reason,date_established,',
    'license_number,created_by,updated_by,facility_type,operation_status,ward,parent,regulatory_body,',
    'keph_level,sub_county,town,regulation_status,contacts&format=json',
))


class HealthFacilitiesScraper(object):
    """Fetch health facilities from the KMHFL API and index them in CloudSearch.

    Intended call order: ``get_token()`` to obtain an OAuth access token,
    then ``get_data()`` to download the facilities and upload them in batches.
    """

    # Maximum number of documents sent per CloudSearch upload call.
    BATCH_SIZE = 100

    # Record keys copied into the document's "fields" as text, unmodified.
    # "name" and "ward_name" are handled separately because they are cleaned.
    PLAIN_FIELDS = (
        'facility_type_name',
        'approved',
        'sub_county_name',
        'service_names',
        'county_name',
        'open_public_holidays',
        'keph_level_name',
        'open_whole_day',
        'owner_name',
        'constituency_name',
        'regulatory_body_name',
        'operation_status_name',
        'open_late_night',
        'open_weekends',
    )

    def __init__(self):
        """Create the CloudSearch domain client; no token is fetched yet."""
        self.access_token = None
        # The misspelt "faciities" key is intentional: it must match the key
        # used in healthtools.config.AWS.
        self.cloudsearch = boto3.client(
            "cloudsearchdomain",
            aws_access_key_id=AWS["aws_access_key_id"],
            aws_secret_access_key=AWS["aws_secret_access_key"],
            region_name=AWS["region_name"],
            endpoint_url=AWS["cloudsearch_health_faciities_endpoint"],
        )

    def get_token(self):
        """Request an OAuth access token and store it in ``self.access_token``.

        Raises:
            requests.HTTPError: if the token endpoint answers with a 4xx/5xx
                status (previously this surfaced later as an opaque KeyError).
            KeyError: if the response body has no ``access_token`` entry.
        """
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        # NOTE(review): credentials are hard-coded in source. They look like
        # the API's shared public test account -- confirm, and consider moving
        # them to configuration.
        data = {
            'username': 'public@mfltest.slade360.co.ke',
            'password': 'public',
            'grant_type': 'password',
            'client_id': 'xMddOofHI0jOKboVxdoKAXWKpkEQAP0TuloGpfj5',
            'client_secret': 'PHrUzCRFm9558DGa6Fh1hEvSCh3C9Lijfq8sbCMZhZqmANYV5ZP04mUXGJdsrZLXuZG4VCmvjShdKHwU6IRmPQld5LDzvJoguEP8AAXGJhrqfLnmtFXU3x2FO1nWLxUx'
        }
        r = requests.post(TOKEN_URL, data=data, headers=headers)
        r.raise_for_status()
        self.access_token = r.json()['access_token']

    def get_data(self):
        """Download all facilities and upload them to CloudSearch in batches.

        Requires ``get_token()`` to have been called first. This is a
        best-effort operation: any failure is printed, not raised, and ends
        the run.
        """
        try:
            headers = {'Authorization': 'Bearer ' + self.access_token}
            r = requests.get(SEARCH_URL, headers=headers)
            results = r.json()['results']
            print("DEBUG - get_data() - %s - %s" % (len(results), r.reason))
            batch = []
            for record in results:
                batch.append(self.index_for_cloudsearch(record))
                # Flush on a full batch. The previous `i % 100 == 0` test
                # also fired at i == 0 and uploaded a one-document batch.
                if len(batch) >= self.BATCH_SIZE:
                    self.push_to_cloud_search('[%s]' % ','.join(batch))
                    batch = []
            if batch:
                # Remainder that did not fill a whole batch.
                self.push_to_cloud_search('[%s]' % ','.join(batch))
        except Exception as err:
            print("ERROR IN - get_data() - %s" % (err))

    def index_for_cloudsearch(self, record):
        """Return one CloudSearch "add" document for ``record`` as JSON text.

        Every value is sent as text (``"%s"`` formatting), as before. The
        document is serialised with ``json.dumps`` so quotes, backslashes and
        control characters in any field are escaped correctly instead of
        corrupting the batch.

        Args:
            record: mapping with ``code``, ``name``, ``ward_name`` and every
                key listed in ``PLAIN_FIELDS``.

        Returns:
            A JSON object string with ``type``, ``id`` and ``fields``.

        Raises:
            KeyError: if ``record`` lacks one of the expected keys.
        """
        def as_text(value):
            # Wrap in a tuple so tuple values are formatted, not unpacked.
            return "%s" % (value,)

        fields = dict((key, as_text(record[key])) for key in self.PLAIN_FIELDS)
        # These two clean-ups are no longer needed for JSON validity; they are
        # kept so indexed values stay the same as in earlier uploads.
        fields['name'] = as_text(record['name']).replace('"', "'")
        fields['ward_name'] = as_text(record['ward_name']).replace('\\', '')
        return json.dumps({
            'type': 'add',
            'id': as_text(record['code']),
            'fields': fields,
        })

    def push_to_cloud_search(self, payload):
        """Upload a JSON document batch to CloudSearch.

        Args:
            payload: JSON array text of "add" documents.

        Failures are printed, not raised, so one bad batch does not stop the
        remaining uploads.
        """
        try:
            response = self.cloudsearch.upload_documents(
                documents=payload, contentType="application/json"
            )
            print("DEBUG - push_to_cloud_search() - %s - %s" % (len(payload), response.get("status")))
        except Exception as err:
            print("ERROR - push_to_cloud_search() - %s - %s" % (len(payload), err))

8 changes: 5 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from healthtools.scrapers.doctors import DoctorsScraper
from healthtools.scrapers.foreign_doctors import ForeignDoctorsScraper
from healthtools.scrapers.clinical_officers import ClinicalOfficersScraper

from healthtools.scrapers.health_facilities_scraper import HealthFacilitiesScraper
if __name__ == "__main__":
healthfacilities_scraper = HealthFacilitiesScraper()
doctors_scraper = DoctorsScraper()
foreign_doctors_scraper = ForeignDoctorsScraper()
clinical_officers_scraper= ClinicalOfficersScraper()

clinical_officers_scraper = ClinicalOfficersScraper()
healthfacilities_scraper.get_token()
healthfacilities_scraper.get_data()
# scraping you softly with these bots...
doctors_result = doctors_scraper.scrape_site()
if doctors_result:
Expand Down

0 comments on commit 57390ca

Please sign in to comment.