# Data Collection & Preparation


### Software Package & Built-in Function Documentation
 - Beautiful Soup - https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 - requests - http://docs.python-requests.org/en/master/
 - Regular Expressions - https://docs.python.org/2/library/re.html
 - Pandas - https://pandas.pydata.org/pandas-docs/stable/10min.html
 - Numpy - http://www.numpy.org/
 - Pickle - https://docs.python.org/2/library/pickle.html 

In [1]:
# Import Scientific Packages into Python Kernel
from bs4 import BeautifulSoup 
import requests 
import re 

import pandas as pd
import numpy as np

# Send a browser-like User-Agent header so the HTTP request is less likely to be flagged as automated.
# Adjacent string literals are joined at compile time; a backslash continuation inside the quotes
# would embed the next line's leading indentation in the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}


## Create a List of Supreme Court Documents and their URLs

In [2]:
# Base address of FindLaw's per-year listing of US Supreme Court opinions;
# a year is appended to it to address one year's listing page
root_url = (
    "http://caselaw.findlaw.com"
    "/court/us-supreme-court/years/"
)

![Caselaw Search](../images/caselaw-page.png)

In [3]:
# Build one listing-page URL per year from 1760 through 2017 (range() excludes its end value)
years = ["{}{}".format(root_url, yr) for yr in range(1760, 2018)]

# Define a method that executes your url request and returns the data (HTML or XML) as an Object 
def Beautiful_soup_grabber(link, timeout=30):
    """Fetch a page and parse it into a BeautifulSoup document.

    Parameters
    ----------
    link : str
        URL to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Pass ``None`` to wait indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Without a timeout, requests waits forever on a stalled connection,
    # which would hang a crawl that visits many pages in sequence.
    response = requests.get(link, headers=headers, timeout=timeout)

    # The HTTP status is not checked, so error pages are parsed like any other.
    return BeautifulSoup(response.text, "lxml")


# Define a function that calls the function above for each year URL you pass in and converts the collected links into a table
def year_getter(years):
    """Collect the case links found on each yearly listing page.

    Parameters
    ----------
    years : iterable of str
        URLs of the per-year listing pages to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per distinct case link. Column ``index`` holds the link's
        ``href`` and column ``0`` holds the digits of its final path segment.
    """
    cases = {}
    for year in years:
        soup = Beautiful_soup_grabber(year)

        for anchor in soup.find_all("a"):
            markup = str(anchor)  # serialise once; it is tested three times below

            # Keep anchors that mention the court but are not navigation
            # links to a "years" listing or a "/court/" page.
            if "us-supreme-court" not in markup:
                continue
            if "years" in markup or "/court/" in markup:
                continue

            href = anchor.get("href")
            if href is None:
                # The match came from the tag's text or another attribute,
                # so there is no link to record.
                continue

            # Keying on the href collapses links that appear more than once.
            cases[href] = [re.sub("[^0-9]", "", href.split("/")[-1])]

    # One column per link, transposed so that each link becomes a row.
    return pd.DataFrame(cases).transpose().reset_index()

In [None]:
# Crawl every yearly listing page and collect the case links into a table.
# This issues one HTTP request per entry in `years`; the original author notes it takes several minutes.
df = year_getter(years)

# Replace the default column labels with descriptive names
df.columns = ["case_url", "docket"]

In [None]:

df.head()  # preview the first five rows (head's default) to confirm the values line up with the column names

In [31]:

# Look up the case URL in the row labelled 0 to spot-check a single value.
# .loc is used because the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
df.loc[0, "case_url"]

'http://caselaw.findlaw.com/us-supreme-court/05-1101.html'

In [29]:

# Serialize the DataFrame to a pickle file (path is relative to the working directory),
# presumably so the slow crawl does not have to be repeated in later sessions
df.to_pickle("supcourt_yearlist.pickle")

In [32]:
df.shape # DataFrame.shape is a (row count, column count) tuple

# the first value is the number of case links collected in the table

(23393, 2)