# Web Scraping

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
% matplotlib inline

## Example 1: Hacker News

In [None]:
# Download the Hacker News front page.
# The timeout stops the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) right away instead
# of letting us silently parse an error page further down.
response = requests.get('https://news.ycombinator.com/', timeout=10)
response.raise_for_status()
html_doc = response.text
html_doc  # Bare trailing expression so the notebook echoes the raw HTML

### Basic Document Information

In [None]:
# Parse the downloaded HTML into a navigable tree.
# Naming the parser explicitly avoids Beautiful Soup's "no parser was
# explicitly specified" warning and keeps the parse result the same on every
# machine, regardless of which optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title)         # Find the 'title' tag
print(soup.title.string)  # Find the 'title' tag's text value

In [None]:
# get_text() gathers the text of every element in the parsed HTML,
# which gives us a quick dataset of the webpage's text content
page_text = soup.get_text()
print(page_text)

In [None]:
# soup.<tag> is shorthand for the first matching tag in the document,
# so this gives us the first link ('a') tag
print(soup.a)

# find_all() returns every matching tag, so we can also iterate over
# all the link tags in the document and pull out each one's target
for link in soup.find_all('a'):
    print(link.get('href'))

In [None]:
# When we already know the class of the elements we are after, we can hand
# it to find_all() as an attribute filter to narrow the search
for anchor in soup.find_all('a', attrs={'class': 'storylink'}):
    title, url = anchor.text, anchor.get('href')
    print(title, url)

In [None]:
# Build one record per story row: its name, link, score and comment count.
stories = []

# If you know regular expressions, you can use those to search for patterns
# in the document. The pattern is compiled once, outside the loop, and the
# optional 's' makes it match the singular "1 comment" label as well.
comments_pattern = re.compile(r'comments?')

for news_row in soup.find_all('tr', {'class': 'athing'}):
    # We can use the same commands on children nodes of the document
    # (.find() returns the first found result, or None when nothing matches)
    story = news_row.find('a', {'class': 'storylink'})
    if story is None:
        # No story link in this row: skip it instead of crashing on None
        continue
    story_name, story_link = story.text, story.get('href')

    # We're not restricted just to searching up and down the document,
    # we can search across 'siblings'
    meta_data_row = news_row.find_next_sibling('tr')

    # Stay None when the value is missing from the page
    score, comments = None, None

    if meta_data_row is not None:
        # Need to check if the story has a score; the number is taken from
        # the first whitespace-separated token of the tag's text
        score_tag = meta_data_row.find('span', {'class': 'score'})
        if score_tag is not None:
            score = int(score_tag.text.split()[0])

        # `string=` is the current name of the filter formerly called `text=`
        comments_tag = meta_data_row.find('a', string=comments_pattern)
        if comments_tag is not None:
            comments = int(comments_tag.text.split()[0])

    stories.append({
        'name': story_name,
        'link': story_link,
        'score': score,
        'comments': comments,
    })

In [None]:
# Turn the list of story records into a table, then plot the score of each
# story against its number of comments
stories_df = pd.DataFrame(data=stories)
stories_df.plot(kind='scatter', x='comments', y='score')

## Example 2: Crawling the Web

# Challenges
#### These may take you a bit longer! 😉

## Challenge 1:
Write a function that, given the name of a subreddit, returns a DataFrame with the same data that we scraped from the Hacker News front page (story name, link, score, and number of comments).

(e.g. `subreddit_scraper('dataisbeautiful')` would be the function call)

In [None]:
def subreddit_scraper(sub_str):
    """Placeholder for Challenge 1: scrape a subreddit into a DataFrame.

    The goal of the exercise is to return a DataFrame holding each story's
    name, link, score and number of comments for the subreddit named by
    ``sub_str``. The body is intentionally left unimplemented, so for now
    the function simply returns None.
    """
    return None


subreddit_scraper('dataisbeautiful')


<h2><center>That's it!</center></h2>
![nice_job](http://i.giphy.com/eoxomXXVL2S0E.gif)
