## Trauma Hospital Data

This data was scraped from the American College of Surgeons trauma center search page:
https://www.facs.org/search/trauma-centers?country=United%20States&n=250

In [1]:
# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess


class ESSpider(scrapy.Spider):
    """Spider that collects trauma-center listings from the facs.org search.

    Every yielded item is a dict with the keys ``hospital``, ``address`` and
    ``level``. After the items of a page have been yielded, the spider
    follows the pagination "Next" link (if one is present) and parses the
    next page the same way.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "ESS"

    # URL(s) to start with.
    start_urls = [
        'https://www.facs.org/search/trauma-centers?country=United%20States&n=250',
    ]

    def parse(self, response):
        """Yield one item per result list, then a request for the next page.

        Args:
            response: The Scrapy response for a search-results page.

        Yields:
            dict: ``hospital``, ``address`` and ``level`` of one result; a
            value is ``None`` when the matching node is missing.
            scrapy.Request: A request for the next results page, if any.
        """
        # Iterate over every <ul> inside the search-results container.
        for each in response.xpath('//div[@class="searchResults"]/ul'):
            # Text nodes that are direct children of the <li> elements; the
            # address is taken from the third one.
            li_texts = each.xpath('li/text()').extract()

            # Yield a dictionary with the values we want.
            yield {
                'hospital': each.xpath('li/h3/text()').extract_first(),
                # Indexing [2] directly raised IndexError on short entries,
                # which aborted the whole callback (remaining items and the
                # next-page request were lost). Fall back to None instead,
                # mirroring what extract_first() does for the other fields.
                'address': li_texts[2] if len(li_texts) > 2 else None,
                'level': each.xpath('li[last()]/text()').extract_first(),
            }

        # Get the URL of the next page.
        next_page = response.xpath(
            '//a[@id="content_element_0_main_column_1_FullPagination_Next"]/@href'
        ).extract_first()

        # Recursively call the spider to run on the next page, if it exists.
        if next_page is not None:
            # The href may be relative, so resolve it against the current URL.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
            
            
# Configure the crawler: export the scraped items to a JSON feed, quietly.
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',           # Serialize the items as JSON.
    'FEED_URI': 'trauma_data.json',  # File the feed is written to.
    'LOG_ENABLED': False,            # Keep Scrapy's logging switched off.
})

# Register the spider with the process and run the crawl.
process.crawl(ESSpider)
process.start()

## Hospital Fall Data

This data was scraped from Hospital-Data.com, a site with profiles of thousands of hospitals, medical clinics, nursing homes and home health centers: http://www.hospital-data.com/

In [1]:
# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess
import itertools

class ESSpider(scrapy.Spider):
    """Spider that collects fall/injury figures from hospital-data.com.

    The crawl has three stages:

    1. ``parse`` reads the index page and requests every state page.
    2. ``state_parse_info`` reads a state page and requests every facility
       whose link text contains one of a fixed set of keywords.
    3. ``hospital_parse_info`` reads a facility page and yields an item for
       each 'Falls and injuries' graph found on it.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "ESS"

    # URL(s) to start with.
    start_urls = [
        'http://www.hospital-data.com/index.html',
    ]

    def parse(self, response):
        """Request every state page linked from the index page.

        Args:
            response: The Scrapy response for the index page.

        Yields:
            scrapy.Request: One request per state link, handled by
            ``state_parse_info``.
        """
        # Iterate over every state list on the page.
        state_lists = response.xpath(
            '//div[@id="hospitals"]/ul[@class="tab-list tab-list-long"]'
        )
        for state_list in state_lists:
            for href in state_list.xpath('li/a/@href').extract():
                # Links may be relative, so resolve them against this page.
                yield scrapy.Request(
                    response.urljoin(href), callback=self.state_parse_info
                )

    def state_parse_info(self, response):
        """Request the page of every matching facility on a state page.

        Only anchors whose text contains one of the keywords below are
        followed.

        Args:
            response: The Scrapy response for a state page.

        Yields:
            scrapy.Request: One request per matching facility link, handled
            by ``hospital_parse_info``.
        """
        # A facility is followed only if its link text has one of these words.
        words_list = ('HOSPITAL', 'MEDICAL CENTER', 'HEALTH SYSTEM', 'UNIVERSITY')

        for cell in response.xpath('//tbody/tr/td'):
            for link in cell.xpath('a'):
                label = link.xpath('text()').extract_first()
                href = link.xpath('@href').extract_first()
                # An anchor without a direct text node (or without an href)
                # gives None here; `word in None` used to raise TypeError and
                # abort the callback for the whole state page.
                if label is None or href is None:
                    continue
                if any(word in label for word in words_list):
                    yield scrapy.Request(
                        response.urljoin(href),
                        callback=self.hospital_parse_info,
                    )

    def hospital_parse_info(self, response):
        """Yield the fall/injury figures found on a facility page.

        Args:
            response: The Scrapy response for a facility page.

        Yields:
            dict: ``hospital`` (page heading), ``hospital_falls`` and
            ``state_falls`` (first and second table-cell texts of the graph).
            A value is ``None`` when the matching node is missing.
        """
        # The heading is the same for every graph, so look it up once.
        hospital = response.xpath(
            '//div[@class="container-fluid"]//h1[@align="center"]/text()'
        ).extract_first()

        # Iterate over every 'hgraph' element on the page.
        for graph in response.xpath('//div[@class="hgraph"]'):
            # Evaluate the cell query once per graph instead of once per field.
            cells = graph.xpath('table//td/text()').extract()
            for title in graph.xpath('b/text()').extract():
                # Only select the graphs that hold fall/injury data.
                if title != 'Falls and injuries':
                    continue
                yield {
                    'hospital': hospital,
                    # Guard the indexes: a table with fewer than two cells
                    # used to raise IndexError and drop the item.
                    'hospital_falls': cells[0] if len(cells) > 0 else None,
                    'state_falls': cells[1] if len(cells) > 1 else None,
                }
            
            
# Configure the crawler: throttled, cached, robots.txt-respecting requests
# with an identifying user agent, and a JSON feed as output.
process = CrawlerProcess(settings={
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'ROBOTSTXT_OBEY': True,
    'DOWNLOAD_DELAY': 1,
    'USER_AGENT': 'Carley (clfletch91@gmail.com)',
    'FEED_FORMAT': 'json',             # Serialize the items as JSON.
    'FEED_URI': 'hospital_data.json',  # File the feed is written to.
    'LOG_ENABLED': False,              # Keep Scrapy's logging switched off.
})

# Register the spider with the process and run the crawl.
process.crawl(ESSpider)
process.start()
print("Success")