# Webscraping with Python

In [None]:
# This code was taken from https://github.com/julia-git/webscraping_ny_mta/blob/master/Webscraping.ipynb
# This code downloads turnstile data from http://web.mta.info/developers/turnstile.html

# Turnstile data has been compiled by New York MTA every week since May 2010 to the present
# So hundreds of .txt files exist on this site. 
# After visiting the site, you can see it would be a giant pain in the butt to download each of these links manually

# This is where code from Python can shine - all of these datasets can be downloaded automatically
# Be careful, however, of legal issues and issues with the API.
# Downloading data too rapidly may cause you to be blocked from the site. 

In [None]:
# We start by importing the necessary packages
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

### Set the URL you want to webscrape from

In [None]:
# Next we set the url of the page that lists the turnstile data files
url = 'http://web.mta.info/developers/turnstile.html'

### Connect to the URL

In [None]:
# Then we attempt to access the site with our requests library.
# requests applies no timeout by default, so we set one explicitly; otherwise
# this cell could hang forever if the server stops answering.
response = requests.get(url, timeout=30)

In [None]:
response  # a 200 status code means the request went through

### Parse HTML and save to BeautifulSoup object

In [None]:
# Next we parse the page's HTML with the BeautifulSoup library to create a nicer data structure.
# BeautifulSoup has good documentation if you are interested

soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Display the parsed document so we can inspect the page's HTML
soup

### To locate all 'a' tags

In [None]:
# Since an <a> tag is used to link to the datasets, we can use that to find the locations
# of all of the links on the MTA web page.
soup.findAll('a')

### Let's take a quick look at the link to the very first data file, which sits at index 36 of the 'a' tags

In [None]:
# Python lists are 0-indexed, so [36] picks the 37th <a> tag found on the page.
one_a_tag = soup.findAll('a')[36]

### We want to extract the actual link

In [None]:
link = one_a_tag['href']
link

# This code saves the link to the turnstile data in our variable `link`.

### The full download URL is 'http://web.mta.info/developers/' + link

## To download the whole data set, let's do a for loop through all a tags

In [None]:
# Download a small sample of the weekly turnstile files linked from the page.
#
# Data links are recognised by their file name (it starts with 'turnstile_')
# instead of by their position among the <a> tags, so the cell keeps working
# if links are added to or removed from the top of the page.
#
# You'll also notice that we put time.sleep() in this code. Pausing between
# downloads helps us stay on the site's good side.
base_url = 'http://web.mta.info/developers/'
max_files = 3  # only a sample is wanted here; raise this to download more of the archive

downloaded = 0
# href=True skips anchors that have no href attribute, which would otherwise
# raise a KeyError on one_a_tag['href'].
for one_a_tag in soup.findAll('a', href=True):  # 'a' tags are for links
    if downloaded >= max_files:
        break  # sample complete; no need to walk the remaining tags
    link = one_a_tag['href']
    # Keep only the last path segment so the file is always saved directly
    # in the current directory, never into a sub-directory that may not exist.
    file_name = link.rsplit('/', 1)[-1]
    if not file_name.startswith('turnstile_'):
        continue  # navigation or other non-data link
    download_url = base_url + link
    urllib.request.urlretrieve(download_url, './' + file_name)
    downloaded += 1
    time.sleep(1)