# Importing Web Data with Python

Table of Contents
* [Flat files](#flat)
* [Excel Files](#excel)
* [HTML](#html)
* [APIs & JSON](#api)

## <a name="flat"></a>Flat Files from the Web

In [None]:
# pandas can read a delimited text file directly from a URL,
# so no separate download step is needed.
import pandas as pd

# Remote location of the red-wine quality CSV.
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'

# The fields in this file are separated by semicolons, not commas.
df = pd.read_csv(url, delimiter=';')

# Show the first rows as a quick check that the import worked.
first_rows = df.head()
print(first_rows)

## <a name="excel"></a>Excel Files from the Web

In [None]:
# pandas reads Excel workbooks straight from a URL as well.
import pandas as pd

# Remote location of the workbook.
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'

# sheet_name=None loads every sheet; the result is a dictionary that maps
# each sheet name to its own DataFrame.
xls = pd.read_excel(url, sheet_name=None)

# List the sheet names (the dictionary keys).
sheet_names = xls.keys()
print(sheet_names)

# Look a sheet up by its name (a string key), not by a positional index,
# and show its first rows.
sheet_1700 = xls['1700']
print(sheet_1700.head())

## <a name="html"></a>HTML

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Grab HTML.
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on a 4xx/5xx response instead of
# silently parsing an error page.
url = 'https://www.python.org/~guido/'
r = requests.get(url, timeout=10)
r.raise_for_status()
html_doc = r.text

# Parse HTML.
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed, so results can differ
# between machines. 'html.parser' ships with Python itself.
soup = BeautifulSoup(html_doc, 'html.parser')

# Extract info
guido_title = soup.title  # the <title> element of the page
guido_text = soup.text  # all of the page's text with the markup stripped

# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell.
# .get('href') yields None for an anchor that has no href attribute.
for link in a_tags:
    print(link.get('href'))

# Prettify the BeautifulSoup object (indented, one tag per line): pretty_soup
pretty_soup = soup.prettify()

## <a name="api"></a>APIs & JSON

In [None]:
# Import requests package
import requests

# --- Example 1: OMDb API -------------------------------------------------

# Assign URL to variable: url
# NOTE(review): the API key is hard-coded in the query string; consider
# loading it from an environment variable before sharing this notebook.
url = 'http://www.omdbapi.com/?apikey=72bc447a&t=the+social+network'

# Package the request, send the request and catch the response: r
# The timeout avoids hanging on an unresponsive server; raise_for_status()
# surfaces HTTP errors (e.g. an invalid key) instead of decoding an error body.
r = requests.get(url, timeout=10)
r.raise_for_status()

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print each key-value pair in json_data
for k, v in json_data.items():
    print(k + ': ', v)

# --- Example 2: Wikipedia API --------------------------------------------

# The OMDb response above has no 'query' key, so the page extract has to
# come from its own request to the Wikipedia API. Separate variable names
# keep the OMDb results above intact.
wiki_url = 'https://en.wikipedia.org/w/api.php'
wiki_params = {
    'action': 'query',
    'prop': 'extracts',
    'format': 'json',
    'exintro': '',
    'titles': 'pizza',
}
wiki_r = requests.get(wiki_url, params=wiki_params, timeout=10)
wiki_r.raise_for_status()
wiki_data = wiki_r.json()

# Print the Wikipedia page extract.
# Results are keyed by page id; a single title was requested, so take the
# one returned page rather than hard-coding its id.
wiki_page = next(iter(wiki_data['query']['pages'].values()))
pizza_extract = wiki_page['extract']
print(pizza_extract)