# Web Crawling Models

In [41]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

In [61]:
class Content():
    """Scraped page data: the source URL plus the extracted title and body."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body to stdout, one labelled line each."""
        report = (
            f'URL: {self.url}',
            f'TITLE: {self.title}',
            f'BODY: {self.body}',
        )
        for line in report:
            print(line)
        
class Website():
    """Describes one site to scrape.

    Holds a display name, the site URL, and the two selector strings that
    are passed to the crawler to pick out a page's title and body.
    """

    def __init__(self, name, url, title_tag, body_tag):
        self.name, self.url = name, url
        # Selector strings handed to Crawler.safe_get.
        self.title_tag, self.body_tag = title_tag, body_tag
        
class Crawler():
    """Fetches pages and prints the title/body selected by a Website's selectors."""

    def get_page(self, url, timeout=10):
        """Download ``url`` and return it as a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` can block forever on a
                stalled connection.

        Returns:
            The parsed page, or ``None`` if the request raised a
            ``RequestException`` (which includes timeouts).
        """
        try:
            req = requests.get(url, timeout=timeout)
        except RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safe_get(self, page_obj, selector):
        """Return the text of every element matching ``selector``.

        The text of each match is joined with newlines. An empty string is
        returned when nothing matches, so callers never have to handle a
        missing element.
        """
        matches = page_obj.select(selector)
        if not matches:
            return ''
        return '\n'.join(elem.get_text() for elem in matches)

    def parse(self, site, url):
        """Fetch ``url`` and print its content using ``site``'s selectors.

        Nothing is printed when the page cannot be fetched or when either
        the title or the body comes back empty.
        """
        bs = self.get_page(url)
        if bs is None:
            return
        title = self.safe_get(bs, site.title_tag)
        body = self.safe_get(bs, site.body_tag)
        if title and body:
            Content(url, title, body).print()
      
    
        
def get_page(url, timeout=10):
    """Download ``url`` and return it parsed with the built-in HTML parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the call indefinitely.

    Request errors are not caught here and propagate to the caller.
    """
    req = requests.get(url, timeout=timeout)
    return BeautifulSoup(req.text, 'html.parser')

def scrape_nytimes(url):
    """Scrape an article page and return its headline and body as Content.

    The title is the text of the first ``<h1>``; the body is the text of
    every paragraph matched by ``div.StoryBodyCompanionColumn div p``,
    joined with newlines.
    """
    bs = get_page(url)
    headline = bs.find('h1')
    # A page without an <h1> yields an empty title instead of raising
    # AttributeError on None.
    title = headline.text if headline is not None else ''
    paragraphs = bs.select('div.StoryBodyCompanionColumn div p')
    body = '\n'.join(p.text for p in paragraphs)
    return Content(url, title, body)

In [62]:
# Demo: scrape a single article with the site-specific helper and show it.
url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'

# scrape_nytimes returns a Content object; print() writes it to stdout.
content = scrape_nytimes(url)
content.print()

URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html
TITLE: The Men Who Want to Live Forever
BODY: Would you like to live forever? Some billionaires, already invincible in every other way, have decided that they also deserve not to die. Today several biotech companies, fueled by Silicon Valley fortunes, are devoted to “life extension” — or as some put it, to solving “the problem of death.”
It’s a cause championed by the tech billionaire Peter Thiel, the TED Talk darling Aubrey de Gray, Google’s billion-dollar Calico longevity lab and investment by Amazon’s Jeff Bezos. The National Academy of Medicine, an independent group, recently dedicated funding to “end aging forever.”
As the longevity entrepreneur Arram Sabeti told The New Yorker: “The proposition that we can live forever is obvious. It doesn’t violate the laws of physics, so we can achieve it.” Of all the slightly creepy aspects to this trend, the strangest is the least noticed: The people public

In [84]:
# Demo: drive the generic Crawler from a table of site descriptions.
crawler = Crawler()
# Each row is [name, site URL, title selector, body selector].
site_data = [
    ["O'Reilly Media", "http://oreilly.com",
     'h1', 'div span div']]

websites = []

for row in site_data:
    websites.append(Website(*row))

# Parse one product page using the first site's selectors.
crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')

URL: http://shop.oreilly.com/product/0636920028154.do
TITLE: Learning Python, 5th Edition
BODY: Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages.Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.Explore Python’s major built-in object types such as numbers, lists, and dictionariesCreate and process objects with Python statements, and learn Python’s general syntax modelUse functions to avoid 

In [65]:
url = 'http://shop.oreilly.com/product/0636920028154.do'

# Name the parser explicitly: omitting it makes BeautifulSoup pick whichever
# parser is installed and emit a warning; 'html.parser' matches the rest of
# this file.
bs = BeautifulSoup(requests.get(url).text, 'html.parser')