# Web Scraping

In [None]:
# What you've seen before
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

# What's new
from bs4 import BeautifulSoup # We'll be using this to scrape through HTML documents
import re # We'll be using regular expressions (a bit) to search through text
import networkx as nx # We'll be using this to graph a website (not super important)

## Example 1: Hacker News

In [None]:
# Download the Hacker News front page as raw HTML text.
# The timeout (in seconds) stops this cell from hanging forever
# if the server never answers.
html_doc = requests.get('https://news.ycombinator.com/', timeout=10).text
html_doc

### Basic Document Information

In [None]:
# Parse the raw HTML into a searchable tree.
# Naming the parser explicitly avoids bs4's "no parser was explicitly
# specified" warning and keeps results the same on every machine.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title) # Find the 'title' tag
print(soup.title.string) # Find the 'title' tag's text value

In [None]:
# Every element's text can be pulled out of the HTML in one go,
# which gives us a quick dataset of the page's text content
print(soup.text)

In [None]:
print(soup.a) # This gives us the first link ('a') tag in the document

# We can also iterate over all the link tags in a document
for link in soup.find_all('a'):
    print(link.get('href'))

In [None]:
# When we already know the class of the elements we're after,
# we can hand it to find_all() to narrow the search to just those tags
for story_link in soup.find_all('a', class_='storylink'):
    print(story_link.get_text(), story_link.get('href'))

In [None]:
# If you know regular expressions, you can use those to search for patterns
# in the document. We compile the pattern once, outside the loop, and let it
# match the singular form too so a story with "1 comment" isn't missed
# (assumes the link text looks like "<number> comment(s)" -- TODO confirm).
comments_pattern = re.compile(r'comments?')

stories = []
for news_row in soup.find_all('tr', {'class': 'athing'}):
    # We can use the same commands on children nodes of the document
    # (.find() returns the first found result, or None if there is none)
    story = news_row.find('a', {'class': 'storylink'})
    if story is None:
        # No story link in this row -- nothing to record, so skip it
        continue
    story_name, story_link = story.text, story.get('href')

    # We're not restricted just to searching up and down the document,
    # We can search across 'siblings'
    meta_data_row = news_row.find_next_sibling('tr')

    score, comments = None, None

    if meta_data_row is not None:
        # Need to check if the story has a score
        score_tag = meta_data_row.find('span', {'class': 'score'})
        if score_tag:
            # The first whitespace-separated token is the number
            score = int(score_tag.text.split()[0])

        # Search for a link whose text matches our pattern
        comments_tag = meta_data_row.find('a', string=comments_pattern)
        if comments_tag:
            comments = int(comments_tag.text.split()[0])

    stories.append({
            'name': story_name,
            'link': story_link,
            'score': score,
            'comments': comments
        })

In [None]:
# Turn the list of story dicts into a table (one row per story),
# then plot the number of comments against the score
stories_df = pd.DataFrame(stories)
stories_df.plot.scatter(x='comments', y='score')

## Example 2: Crawling the Web

In [None]:
nyt_url = 'http://www.nytimes.com/'
max_links = 1000 # Stop crawling once we've 'seen' this many unique links
nyt_links = {} # Dict mapping each link we've seen -> list of pages that link to it
links_to_scrape = [nyt_url] # Stack of links to visit

# Keep going while we still have links to visit, and we have
# 'seen' fewer than max_links (unique) links
while links_to_scrape and len(nyt_links) < max_links:
    # Grab the 'latest' link from our list
    scrape_page = links_to_scrape.pop()

    # Visit our link. One dead or slow page shouldn't abort the whole
    # crawl, so we report the failure and move on to the next link.
    try:
        response = requests.get(scrape_page, timeout=10)
    except requests.RequestException as err:
        print("Skipping {} ({})".format(scrape_page, err))
        continue

    # Make the page into soup (naming the parser avoids a bs4 warning)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Iterate through all the links on the current page
    for link in soup.find_all('a'):
        href = link.get('href') # Get the link's address

        # Reject the link if invalid or doesn't start with 'http'
        if not href or not href.startswith('http'):
            continue

        # Have we already seen this link?
        if href not in nyt_links:
            nyt_links[href] = [scrape_page] # Initialize list of adjacent links

            # Save link to scrape later. A link is only queued here, the
            # first time we see it, so it can't already be on the stack.
            links_to_scrape.append(href)
        else:
            nyt_links[href].append(scrape_page) # Add link to adjacent links

    # Report how many links we've 'seen'
    print("Up to {} links.".format(len(nyt_links)))

print(nyt_links)

In [None]:
# Plotting the graph (not super important to know in detail)

# Create a graph and feed it in our pages as nodes
nyt_graph = nx.Graph()
nyt_graph.add_nodes_from(nyt_links)

# Add edges based on the 'adjacent' lists we created earlier.
# (Adding an edge also adds any endpoint that isn't a node yet, so the
# graph can end up with pages that never appear as a key in nyt_links.)
for node, adjacent in nyt_links.items():
    for a in adjacent:
        nyt_graph.add_edge(node, a)

# Size each node by how many times it was linked to (10 per link).
# We build the sizes from the graph's own node list so the two always
# line up one-to-one; pages that nothing linked to get size 0.
graph_nodes = list(nyt_graph.nodes())
node_sizes = [len(nyt_links.get(n, ())) * 10 for n in graph_nodes]

with sns.axes_style('white'):
    # plt.subplots returns a (figure, axes) pair
    fig, ax = plt.subplots(1, figsize=(12,10))
    nx.draw_networkx(nyt_graph, ax=ax, edge_color='#a4a4a4', with_labels=False,
                     nodelist=graph_nodes, node_size=node_sizes)
    ax.axis('off')

# Challenges
#### These may take you a bit longer! 😉

## Challenge 1: Subreddit Scraper
Write a function that, given the name of a subreddit, returns a DataFrame with the same fields that we scraped from the Hacker News front page (story name, link, score, and number of comments).

(i.e. `subreddit_scraper('dataisbeautiful')` would be the function call)

**NOTE**: You may want to use this in your request: `headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}`

In [None]:
def subreddit_scraper(sub_str):
    """Challenge 1 placeholder: scrape the subreddit named `sub_str`.

    Once written, this should return a DataFrame holding each story's
    name / link, score, and number of comments. It is not implemented
    yet, so for now it simply returns None.
    """
    return None

subreddit_scraper('dataisbeautiful')

## Challenge 2: PyPI Parser
*(PyPI, the Python Package Index, is the main index of published Python packages)*

Write a function that, given the URL of a Python package, will return you a dictionary with the following package information:
- Author
- Release (upload) Date
- License
- Home Page

(Note the possibility that not all of these fields will be present on all packages, and write your function accordingly)

In [None]:
def pypi_parser(pypi_url):
    """Challenge 2 placeholder: parse the PyPI package page at `pypi_url`.

    Once written, this should return a dictionary with the package's
    author, release (upload) date, license, and home page, coping with
    pages where some of those fields are missing. It is not implemented
    yet, so for now it simply returns None.
    """
    return None

pypi_parser('https://pypi.python.org/pypi/beautifulsoup4/4.5.1')
pypi_parser('https://pypi.python.org/pypi/SQLAlchemy/1.1.0')

## Challenge 3: IMDB Cast Data
Write a function that, given the URL of a movie on IMDB, returns a list of the movie's cast members, each with the following information:
- Actor name
- Character name
- Actor birthday

Example:
```
>>> imdb_cast('http://www.imdb.com/title/tt0796366/')
[
  {'actor': 'Chris Pine',
  'character': 'Kirk',
  'birthday': 'August 26, 1980'
  }
  ...
]
```


In [None]:
def imdb_cast(movie_url):
    """Challenge 3 placeholder: list the cast of the IMDB movie at `movie_url`.

    Once written, this should return a list with one dictionary per cast
    member, holding the actor's name, character name, and birthday. It is
    not implemented yet, so for now it simply returns None.
    """
    return None

imdb_cast('http://www.imdb.com/title/tt0796366/')


<h2><center>That's it!</center></h2>
![nice_job](http://i.giphy.com/eoxomXXVL2S0E.gif)


### Bonus Points:
1. For Challenge 1, write another function that takes the output of your scraper and plots the number of comments against the score of each post. Also include a linear trend line. (Hint: `numpy.polyfit`)
2. For Challenge 2, include an array of all the published versions of the package in your return dictionary.

*Important:* When you send us your solutions, let us know which (if any)
of the bonus problems you've attempted.