## WebMD Spider

Author: Amirsadra Mohseni

This is the first phase of our WebMD spider.
In this project, we use Python's scrapy library to crawl https://www.webmd.com/drugs/2/index and extract the name and usage of all the drugs present. The code is run in a Jupyter notebook, Google Colab, or any other similar notebook interpreter. We export our crawled findings to a CSV file.

### Sources referred to:
https://www.webmd.com/  
https://github.com/jasonjjchen/Web_Scraping-WebMD_Scrapy  
https://stackoverflow.com/questions/51354279/xpath-taking-text-with-hyperlinks-python  
https://docs.scrapy.org/en/latest/topics/  
https://www.mikulskibartosz.name/how-to-scrape-a-single-web-page-using-scrapy-in-jupyter-notebook/  

In [1]:
import scrapy
import logging
import string
from scrapy import Spider, Request
from scrapy import Item, Field
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import urllib
import re

In [5]:
class WebmdItem(Item):
    """Container for the data scraped from a single WebMD drug detail page.

    Every field holds a plain string; a field is left as an empty string
    when the corresponding element is not found on the page.
    """
    # Drug name taken from the page's <h1> heading.
    Drug = Field()
    # Comma-separated list of conditions; only filled for the non-monograph
    # page layout.
    Condition = Field()
    # Text of the "Uses" section.
    Use = Field()
    # Text of the "How to use" section.
    HowtoUse = Field()
    # Generic name(s), with the "Generic Name(s): " label removed.
    GenName = Field()
    # Common brand name(s), with the "Common Brand(s): " label removed.
    BrandName = Field()

In [3]:
class WebmdSpider(scrapy.Spider):
    """Crawl WebMD's A-Z drug index and yield one ``WebmdItem`` per drug page.

    Crawl path: index page -> letter page -> sub-letter page -> drug list
    -> drug detail page. Results are exported as CSV to ``webmd_test.csv``
    through ``custom_settings``.
    """

    name = 'webmd_spider'
    # Scrapy's offsite filtering reads ``allowed_domains`` (bare domain names).
    # ``allowed_urls`` is not an attribute Scrapy looks at; it is kept only so
    # that any external code reading it keeps working.
    allowed_domains = ['webmd.com']
    allowed_urls = ['http://www.webmd.com/']
    start_urls = ['https://www.webmd.com/drugs/2/index']

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'webmd_test.csv',
    }

    def parse(self, res):
        """Follow the first letter link of the A-Z index page.

        Only the first letter is crawled, which keeps this test run small.

        Args:
            res: Response for https://www.webmd.com/drugs/2/index.

        Yields:
            A ``Request`` for the first letter page, handled by
            ``parse_subgroups``. Nothing is yielded when no letter links
            are found.
        """
        letter_urls = res.xpath(
            '//*[@class="drugs-browse-box"]/ul/li/a/@href').extract()

        # Guard against a layout change: indexing an empty list would raise
        # IndexError and abort the callback without any useful message.
        if not letter_urls:
            self.logger.warning('No A-Z index links found on %s', res.url)
            return

        yield Request(res.urljoin(letter_urls[0]),
                      callback=self.parse_subgroups)

    def parse_subgroups(self, res):
        """Follow every sub-letter link found on a letter page.

        Args:
            res: Response for a letter page.

        Yields:
            One ``Request`` per sub-letter link, handled by ``parse_drug``.
        """
        subletter_urls = res.xpath(
            '//*[@class="drugs-browse-subbox"]/ul/li/a/@href').extract()

        for url in subletter_urls:
            yield Request(res.urljoin(url), callback=self.parse_drug)

    def parse_drug(self, res):
        """Follow every drug link found on a sub-letter page.

        Args:
            res: Response for a sub-letter page.

        Yields:
            One ``Request`` per drug link, handled by ``parse_use``.
        """
        drug_urls = res.xpath(
            '//*[@class="drug-list-container"]/ul/li/a/@href').extract()

        for url in drug_urls:
            yield Request(res.urljoin(url), callback=self.parse_use)

    @staticmethod
    def _joined_text(res, xpath):
        """Return all strings matched by ``xpath`` on ``res``, concatenated."""
        return ''.join(res.xpath(xpath).extract())

    @staticmethod
    def _parse_names(first_line, second_line):
        """Split the two header lines into ``(BrandName, GenName)``.

        Args:
            first_line: Text of the first header line below the drug title.
            second_line: Text of the second header line (may be empty).

        Returns:
            A ``(brand, generic)`` tuple of title-cased strings with their
            labels removed. ``brand`` is empty when the first line is the
            generic-name line; both are empty when neither label is found.
        """
        first = first_line.strip().lower()

        # Sometimes the common brand appears first.
        # Example: https://www.webmd.com/drugs/2/drug-941/acyclovir-oral/details
        if 'common brand(s)' in first:
            brand = first.replace('common brand(s): ', '').title()
            generic = second_line.strip().lower()
            generic = generic.replace('generic name(s): ', '').title()
            return brand, generic

        # Sometimes there is no brand name, only a generic name.
        # Example: https://www.webmd.com/drugs/2/drug-9130/dilaudid-oral/details
        if 'generic name(s)' in first:
            return '', first.replace('generic name(s): ', '').title()

        return '', ''

    def parse_use(self, res):
        """Extract the drug details from a drug detail page.

        WebMD serves two page layouts, for example
        https://www.webmd.com/drugs/2/drug-63164/adderall-xr-oral/details
        and
        https://www.webmd.com/drugs/2/drug-7277/percocet-oral/details.
        The layout is detected by the presence of a ``monograph-page`` div,
        and each layout needs its own XPaths.

        Args:
            res: Response for a drug detail page.

        Yields:
            A single ``WebmdItem``. Fields that cannot be located are empty
            strings; ``Condition`` is only filled for the non-monograph
            layout.
        """
        text = self._joined_text
        Condition = ''

        if res.xpath('//div[@class="monograph-page"]'):
            Use = text(res, '//*[@id="app"]/main/div[3]/div[2]/div/div[2]/div[1]/p//text()')
            HowtoUse = text(res, '//*[@id="app"]/main/div[3]/div[2]/div/div[2]/div[2]//p//text()')
            Drug = text(res, '//*[@id="app"]/main/div[2]/div/div/div/header/h1/text()').strip()

            # Evaluate each header line once instead of once per check.
            first_line = text(res, '//*[@id="app"]/main/div[2]/div/div/div/header/h3[1]/text()')
            second_line = text(res, '//*[@id="app"]/main/div[2]/div/div/div/header/h3[2]/text()')
        else:
            Use = text(res, '//*[@id="tab-1"]/div[1]/div/p//text()')
            HowtoUse = text(res, '//*[@id="tab-1"]/div[1]/div/div[1]//p//text()')
            Drug = text(res, '//*[@id="ContentPane29"]/div[1]/div[1]/div/h1/text()').strip()

            # Take the first text node of every listed condition. Entries
            # without any text are skipped rather than raising IndexError
            # and losing the whole item.
            conditions = []
            for li in res.xpath('//*[@id="ContentPane30"]/div[5]/div[1]/ul/li'):
                first_text = li.xpath('.//text()').get()
                if first_text is not None:
                    conditions.append(first_text)
            Condition = ', '.join(conditions).strip(', ')

            first_line = text(res, '//*[@id="ContentPane29"]/div[1]/div[1]/div[1]/p[1]/text()')
            second_line = text(res, '//*[@id="ContentPane29"]/div[1]/div[1]/div[1]/p[2]/text()')

        BrandName, GenName = self._parse_names(first_line, second_line)

        # Assignment order is kept stable on purpose, in case the exporter
        # derives its column order from it.
        item = WebmdItem()
        item['Drug'] = Drug
        item['Use'] = Use
        item['Condition'] = Condition
        item['HowtoUse'] = HowtoUse
        item['GenName'] = GenName
        item['BrandName'] = BrandName

        yield item

In [4]:
# Run the spider in-process: create a crawler process, schedule WebmdSpider
# on it, then start the crawl.
# NOTE(review): per the Scrapy docs, start() blocks until crawling finishes;
# re-running this cell in the same kernel may need a restart - confirm.
process = CrawlerProcess()
process.crawl(WebmdSpider)
process.start()

2020-12-29 16:16:50 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-12-29 16:16:50 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.20.0, Twisted 19.10.0, Python 3.8.1 | packaged by conda-forge | (default, Jan 29 2020, 14:24:10) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Windows-10-10.0.18362-SP0
2020-12-29 16:16:50 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'csv', 'FEED_URI': 'webmd_test.csv', 'LOG_LEVEL': 30}
