In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sys import argv
from collections import defaultdict
from time import sleep
import os
import boto3
from io import StringIO, BytesIO
import re
from time import sleep
from urlparse import urljoin
import unicodedata
import matplotlib.pyplot as plt
%matplotlib inline



In [10]:
def get_page_urls(soup):
    """
    Collect the case URLs found on a single search-results page.
    :param soup: Beautiful soup object for the results page
    :return: list of href strings that start with "http://" and contain 'circuit'
    """
    # Only anchors whose href begins with http:// are requested from the soup.
    anchors = soup.findAll('a', attrs={'href': re.compile("^http://")})
    hrefs = (anchor.get('href') for anchor in anchors)
    # Keep the links that mention a circuit court, in document order.
    return [href for href in hrefs if 'circuit' in href]

        
def get_next_url(current_url, soup):
    """
    Build the absolute URL of the next listings page.
    :param current_url: string, URL of the page that `soup` was parsed from
    :param soup: Beautiful soup object
    :return: string, href of the last "pgnum" anchor resolved against current_url
    """
    pager_links = soup.find_all(name="a", attrs={"class": "pgnum"})
    # The last pagination anchor is treated as the "next" link
    # (IndexError if the page has no pagination anchors at all).
    last_href = pager_links[-1].get('href')
    return urljoin(current_url, last_href)

        
        
def create_soup(url, timeout=30):
    """
    Get the HTML contents of the URL.
    :param url: string, the url to scrape
    :param timeout: seconds to wait for the server before requests gives up
                    and raises; defaults to 30 so a stalled connection cannot
                    hang the scraper indefinitely
    :return: soup: a BeautifulSoup object
    """
    # requests waits forever unless a timeout is given explicitly.
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.text, "html.parser")


def second_check(url):
    """
    Fetch `url` and return the href of the first anchor nested in a <p> tag.

    NOTE: a function with this same name is defined again further down in
    this notebook; whichever definition was executed last is the one in use.
    :param url: string, the page to inspect
    :return: the anchor's href (None if that anchor has no href attribute),
             or None when no <p> on the page contains an anchor
    """
    content = requests.get(url).content
    soup = BeautifulSoup(content, "html.parser")

    for paragraph in soup.find_all("p"):
        # `paragraph.a` is None when the paragraph holds no anchor; test for
        # that directly instead of swallowing every exception.
        anchor = paragraph.a
        if anchor is not None:
            return anchor.get('href')
    return None
            
def first_check(url_list):
    """
    Resolve each search-result URL to the link(s) that hold the case text.

    For every "btn_read" div on a result page the linked page is fetched.
    If that page contains an anchor with target="new", the link is resolved
    one level further through second_check; otherwise the link itself is kept.
    :param url_list: iterable of result-page URLs
    :return: list of confirmed links with case texts
    """
    confirmed = []

    for site in url_list:
        page_soup = create_soup(site)

        for read_div in page_soup.find_all(name="div", attrs={"class": "btn_read"}):
            candidate = read_div.a.get('href')
            candidate_soup = create_soup(candidate)

            needs_second_check = candidate_soup.find_all("a", {"target": "new"})
            confirmed.append(second_check(candidate) if needs_second_check else candidate)

        # Pause between result pages.
        sleep(2)

    return confirmed


In [20]:

def run_scraper(current_url, dft, max_pages=1, row_slice=slice(1, 3)):
    """
    Run the web scraper that will scrape Findlaw
    :param current_url: string, the initial URL to scrape
    :param dft: Pandas dataframe, the listings data from previous scrapings
    :param max_pages: int or None, upper bound on the number of result pages
                      to scrape (at least one page is always scraped). The
                      default of 1 matches the earlier behaviour, where the
                      loop always exited after the first page. Pass None to
                      follow pagination until no "next" link is left.
    :param row_slice: slice applied to the "srpcaselawtr" rows of each page.
                      The default slice(1, 3) matches the earlier hard-coded
                      [1:3]; pass slice(None) to take every row.
    :return: Pandas dataframe, `dft` followed by the newly scraped cases
    """
    next_marker = u'\xbb'
    soup = create_soup(current_url)
    cases = defaultdict(list)
    pages_scraped = 0

    while True:
        # 1.) Get each court case listing on the current page
        rows = soup.find_all(name="tr", attrs={"class": "srpcaselawtr"})
        for row in rows[row_slice]:
            cases = add_case_info(row, cases)
            sleep(2)
        pages_scraped += 1

        # Stop before touching the network again once the page budget is used.
        if max_pages is not None and pages_scraped >= max_pages:
            break

        # 2.) Check the CURRENT page's pagination, so the final page is
        #     scraped too and a page without pagination ends the run cleanly.
        #     Presumably the last "pgnum" anchor shows the marker only when a
        #     next page exists; this is the same assumption the earlier flag
        #     check relied on. TODO confirm against the live markup.
        pager_links = soup.find_all(name="a", attrs={"class": "pgnum"})
        if not pager_links or next_marker not in pager_links[-1].text:
            break

        current_url = get_next_url(current_url, soup)
        soup = create_soup(current_url)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([dft, pd.DataFrame(cases)], ignore_index=True)


# df_new = pd.DataFrame(columns=["job_title", "location", "company",
#                                    "url", "jobsite", "job_description"])

 

In [147]:
# Scratch cell: fetch the first search-results page and walk its case rows.
# Binds the notebook-level names `url`, `content`, `soup` and `cases`.
url = 'http://caselaw.findlaw.com/summary/search/?query=filters&court=us-1st-circuit&dateFormat=yyyyMMdd&topic=cs_42&pgnum=1'
content = requests.get(url).content
soup = BeautifulSoup(content, "html.parser")
cases = defaultdict(list)


# The loop body is currently a no-op (`pass`); the commented lines are earlier
# experiments. NOTE(review): they mention `add_listing_info`, `div`, `listings`
# and `get_title`, none of which are defined in the visible notebook.
for ind, row in enumerate(soup.find_all(name="tr", attrs={"class": "srpcaselawtr"})):
#     listings = add_listing_info(div, listings)
#     print get_case_url(row) #row.find('a').get('href')
#     sleep(1)
#     print ind, get_title(row)
#     print ind, row.find_all('td', {'valign':'top'})[-1].text
#     if ind == 1: 
#         print add_case_info(row, cases)
    pass

    
    


In [12]:
def add_case_info(row, case_dict):
    """
    Append the scraped fields of one case row to the running results.
    :param row: tag object, the row tag from a case
    :param case_dict: mapping of column name -> list (e.g. defaultdict(list));
                      each list is extended in place with this case's value
    :return: case_dict, the same mapping, with one new entry under each of:
    (1) date
    (2) court
    (3) case_title
    (4) docket
    (5) tags
    (6) type_of_law (always "case")
    (7) web_source (always "findlaw.com")
    (8) url
    (9) case_text
    """
    # Fields read straight off the search-result row.
    case_dict["date"].append(get_date(row))
    case_dict["court"].append(get_court(row))
    case_dict["case_title"].append(get_case_title(row))
    case_dict["docket"].append(get_docket(row))
    case_dict["tags"].append(get_tags(row))

    # Constant fields for this source.
    case_dict["type_of_law"].append("case")
    case_dict["web_source"].append("findlaw.com")

    # The case link is resolved once and reused to download the text.
    resolved_link = get_case_url(row)
    case_dict["url"].append(resolved_link)
    case_dict["case_text"].append(get_case_text(resolved_link))

    return case_dict

In [13]:
### Get case information
""" 
(1) Type of Law, (Case, Statutory, Executive) 
(2) Title of Court Case
(3) Court 
(4) Tags / types of law
(5) Date
(6) Docket 
(7) Case URL 
(8) Case Text 
(9) WebSource (Findlaw, etc.) 
"""

def get_case_title(row):
    """
    Read the case title off the first anchor in the row.
    :row: tag object, the row tag from a case
    :return: string, the value of that anchor's "title" attribute
    """
    first_anchor = row.find('a')
    return first_anchor.get("title")

def get_court(row):
    """
    Read the court name from the first <span> in the row.
    :row: tag object, the row tag from a case
    :return: string, the text of that span
    """
    return row.find('span').text

def get_tags(row):
    """
    Read the law tags (types of law involved) from the first <i> in the row.
    :row: tag object, the row tag from a case
    :return: string, the text of that element
    """
    return row.find('i').text

def get_date(row):
    """
    Read the decision date from the row.
    :row: tag object, the row tag from a case
    :return: string, text of the second-to-last top-aligned <td> cell
    """
    top_cells = row.find_all('td', {'valign': 'top'})
    return top_cells[-2].text

def get_docket(row):
    """
    Read the docket number from the row.
    :row: tag object, the row tag from a case
    :return: string, text of the last top-aligned <td> cell
    """
    top_cells = row.find_all('td', {'valign': 'top'})
    return top_cells[-1].text

def second_check(url):
    """
    Fetch `url` and return the href of the first anchor nested in a <p> tag.

    NOTE: this redefines a function of the same name that appears earlier in
    this notebook; once this cell runs, this version is the one in use.
    :param url: string, the page to inspect
    :return: the anchor's href (None if that anchor has no href attribute),
             or None when no <p> on the page contains an anchor
    """
    soup = create_soup(url)
    for paragraph in soup.find_all("p"):
        # `paragraph.a` is None when the paragraph holds no anchor; test for
        # that directly instead of swallowing every exception.
        anchor = paragraph.a
        if anchor is not None:
            return anchor.get('href')
    return None

def get_case_url(row):
    """
    Resolve the URL that holds the case text for a search-result row.
    :row: tag object, the row tag from a case
    :return: string, the URL of case text
    """
    # Follow the row's first anchor to the case summary page.
    initial_url = row.find('a').get('href')
    summary_soup = create_soup(initial_url)

    # The "btn_read" div on that page links to the candidate text page.
    read_button = summary_soup.find(name="div", attrs={"class": "btn_read"})
    candidate_url = read_button.find('a').get('href')
    candidate_soup = create_soup(candidate_url)

    # No target="new" anchors: the candidate page is used as-is.
    if not candidate_soup.find_all("a", {"target": "new"}):
        return candidate_url

    # Otherwise resolve one level further.
    return second_check(candidate_url)

def get_case_text(link):
    """
    Download a case page and return the opinion text found on it.

    The page text is split into stripped chunks (lines, further split on
    double spaces). Collection starts at a non-empty chunk containing
    'United States Court of Appeals' and pauses at a chunk containing
    'FindLaw Career Center'; the start chunk is kept, the stop chunk is not.
    :param link: str, the url of case text webpage
    :return: str, the collected chunks joined by newlines; an empty string
             when the start marker never appears
    """
    page = create_soup(link)

    # Strip javascript and stylesheet elements before reading the text.
    for unwanted in page(["script", "style"]):
        unwanted.extract()

    raw_text = page.get_text()

    # One stripped piece per double-space-separated phrase of each stripped line.
    pieces = [
        phrase.strip()
        for raw_line in raw_text.splitlines()
        for phrase in raw_line.strip().split("  ")
    ]

    collected = []
    inside_case = False
    for piece in pieces:
        if inside_case:
            if 'FindLaw Career Center' in piece:
                # Stop marker: leave the case body without keeping this piece.
                inside_case = False
            else:
                # Everything between the markers is kept, blank pieces included.
                collected.append(piece)
        elif piece and 'United States Court of Appeals' in piece:
            # Start marker: keep it and begin collecting.
            collected.append(piece)
            inside_case = True

    return '\n'.join(collected)



def create_df_new():
    """
    Build the empty case-data frame used when no saved file is available.
    :return: New Dataframe with the case columns and no rows
    """
    column_names = [
        "date", "court", "case_title",
        "docket", "tags", "type_of_law",
        "web_source", "url", "case_text",
    ]
    return pd.DataFrame(columns=column_names)


def write_file_to_s3(df_write):
    """
    Save the updated dataframe to a file on the project's AWS S3 bucket.

    String (object-dtype) columns of `df_write` are UTF-8 encoded in place
    before the frame is serialised to CSV and uploaded as
    "court_cases_data_test.csv" in the "court-case-data" bucket.
    Credentials are read from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables (KeyError if unset).
    :param df_write: DataFrame to write to file
    :return: None
    """
    # Iterate the frame that was passed in; the loop previously read the
    # notebook-global `df`, which only worked when both were the same object.
    for column in df_write:
        # The .str accessor is only valid on string data, so leave numeric
        # and other non-object columns untouched.
        if df_write[column].dtype == object:
            df_write[column] = df_write[column].str.encode('utf-8')

    try:
        csv_buffer = StringIO()
        df_write.to_csv(csv_buffer, index=False)
    except Exception:
        # The text buffer rejected the data; retry with a byte buffer.
        csv_buffer = BytesIO()
        df_write.to_csv(csv_buffer, index=False)

    s3 = boto3.resource("s3", aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
                        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])
    s3.Object("court-case-data", "court_cases_data_test.csv").put(Body=csv_buffer.getvalue())


def access_s3_to_df():
    """
    Access the project's S3 bucket and load the file into a dataframe for appending.

    Credentials are read from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables (KeyError if unset).
    :return: df: a pandas dataframe containing the data, or a new empty
             case dataframe when the file cannot be fetched or parsed.
    """

    s3 = boto3.client("s3", aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
                      aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])
    try:
        obj = s3.get_object(Bucket="court-case-data", Key="court_cases_data_test.csv")
        return pd.read_csv(BytesIO(obj["Body"].read()))
    except Exception:
        # Best-effort fallback is kept, but KeyboardInterrupt/SystemExit now
        # propagate instead of silently producing an empty frame.
        # NOTE(review): any S3 or parsing error still lands here; a later
        # write would then replace the stored file with only the new rows.
        return create_df_new()





In [78]:
# Scratch cell: the same text-extraction steps as get_case_text, run inline
# against one known case page with intermediate printing (Python 2 `print`).
link = 'http://caselaw.findlaw.com/us-1st-circuit/1873444.html'

text_soup = create_soup(link)

# remove all javascript and stylesheet code
for script in text_soup(["script", "style"]): 
    script.extract()

text = text_soup.get_text()

# print text

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# print lines


# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

# NOTE: blank chunks are not filtered below; every chunk between the two
# marker strings is appended as-is.

case=[]

# `chunks` is a generator shared by both loops: the inner loop continues from
# where the outer loop stopped, and after the inner `break` the outer loop
# resumes with whatever is left.
for chunk in chunks: 
    if 'United States Court of Appeals' in chunk: 
        # The marker chunk is printed here but not appended to `case`.
        print chunk
        for chunk in chunks:
            if 'FindLaw Career Center' in chunk:
                break 
            else: 
                case.append(chunk)
print '\n'.join(case)
    
     
            
# if 'Abstract' in line:                
#                 for line in f
# print '\n'.join(chunk for chunk in chunks if chunk and 'United States Court of Appeals' in chunk)


# while 'United States Court' not in 
    
    
# get_case_text(url)




United States Court of Appeals, First Circuit.
JUAN RAMIREZ MATIAS, Petitioner, v. JEFFERSON B. SESSIONS III, Attorney General, Respondent.
No. 16-2474
Decided: September 08, 2017

Before Thompson, Kayatta, and Barron, Circuit Judges.Jason Panzarino and The Law Office of Johanna Herrero on brief for petitioner. Allison Frayer, Trial Attorney, Office of Immigration Litigation, Civil Division, Chad A. Readler, Acting Assistant Attorney General, Civil Division, and Jessica A. Dawgert, Senior Litigation Counsel, Office of Immigration Litigation, Civil Division, on brief for respondent.
Petitioner Juan Ramirez Matias (“Ramirez”) challenges the Board of Immigration Appeals's (“BIA”) denial of his motion to exercise its sua sponte authority to reopen his case and grant his request for cancellation of removal. We find that even if we have jurisdiction to consider his appeal, we must still deny Ramirez's petition.BACKGROUNDRamirez is no stranger to this court: in 2014, he petitioned for review 

## Loading into AWS S3 

In [26]:


# Load the saved cases from S3 (or an empty frame), scrape from the first
# search-results page, and rebind `df` to the combined result.
df = access_s3_to_df()
first_url = 'http://caselaw.findlaw.com/summary/search/?query=filters&court=us-1st-circuit&dateFormat=yyyyMMdd&topic=cs_42&pgnum=1'
df = run_scraper(first_url, df) 
    
# Python 2 print statement; dumps the whole frame as plain text.
print df

# Upload is left disabled in this cell.
# write_file_to_s3(df)

                                           case_text  \
0                                                      
1  United States Court of Appeals, First Circuit....   
2  United States Court of Appeals, First Circuit....   
3  United States Court of Appeals, First Circuit....   
4  United States Court of Appeals, First Circuit....   
5  United States Court of Appeals, First Circuit....   
6  United States Court of Appeals, First Circuit....   
7  United States Court of Appeals, First Circuit....   
8  United States Court of Appeals, First Circuit....   
9  United States Court of Appeals, First Circuit....   

                      case_title                        court        date  \
0         US v. Marte De La Cruz  United States First Circuit  12/04/2017   
1     Aguilar-Escoto v. Sessions  United States First Circuit  10/27/2017   
2             Matias v. Sessions  United States First Circuit  09/08/2017   
3              US v. Cueto-Nunez  United States First Circuit  08/25/2017  

### Testing writing to a file in the S3 bucket

In [35]:
# Spot-check the Python type of one stored value (row label 21 of `web_source`).
type(df['web_source'][21])

str

In [39]:
# df = access_s3_to_df()
# first_url = 'http://caselaw.findlaw.com/summary/search/?query=filters&court=us-1st-circuit&dateFormat=yyyyMMdd&topic=cs_42&pgnum=1'
# df= run_scraper(first_url, df)
import sys  

# Python 2-only workaround: reload(sys) makes setdefaultencoding callable
# again so the process-wide default string encoding can be set to UTF-8.
reload(sys)  
sys.setdefaultencoding('utf8')

# for column in df:
# #     print column
#     df[column] = df[column].str.encode('utf-8')



# write_file_to_s3 has no return statement, so this prints None after the upload.
print write_file_to_s3(df)

## Reading it back from AWS S3

In [41]:

# Create S3 handles with boto3's default configuration (no explicit keys are
# passed here, unlike access_s3_to_df). The `s3` resource is not used again
# in this cell; only `client` is.
s3 = boto3.resource('s3')
client = boto3.client('s3') #low-level functional API

# Download the saved CSV and rebind `df` to its parsed contents.
obj = client.get_object(Bucket='court-case-data', Key='court_cases_data_test.csv')
df = pd.read_csv(BytesIO(obj['Body'].read()))

df.tail()

Unnamed: 0,case_text,case_title,court,date,docket,tags,type_of_law,url,web_source
27,"United States Court of Appeals, First Circuit....",Sanchez Romero v. Sessions,United States First Circuit,07/26/2017,16-2416P.01A,Immigration Law,case,http://caselaw.findlaw.com/us-1st-circuit/1868...,findlaw.com
28,"United States Court of Appeals, First Circuit....",Coelho v. Sessions,United States First Circuit,07/24/2017,16-2220P.01A,"Criminal Law & Procedure, Immigration Law",case,http://caselaw.findlaw.com/us-1st-circuit/1868...,findlaw.com
29,"United States Court of Appeals, First Circuit....",Marroquin-Rivera v. Sessions,United States First Circuit,06/23/2017,16-1732,Immigration Law,case,http://caselaw.findlaw.com/us-1st-circuit/1865...,findlaw.com
30,"United States Court of Appeals, First Circuit....",Williams v. US,United States First Circuit,06/08/2017,16-2147,Immigration Law,case,http://caselaw.findlaw.com/us-1st-circuit/1863...,findlaw.com
31,"United States Court of Appeals, First Circuit....",Garcia-Cruz v. Sessions,United States First Circuit,05/26/2017,15-2272,"Immigration Law, Administrative Law",case,http://caselaw.findlaw.com/us-1st-circuit/1862...,findlaw.com


In [63]:
# Print the case text stored under row label 2 (Python 2 print statement).
print df['case_text'][2]

United States Court of Appeals, First Circuit.
JUAN RAMIREZ MATIAS, Petitioner, v. JEFFERSON B. SESSIONS III, Attorney General, Respondent.
No. 16-2474
Decided: September 08, 2017

Before Thompson, Kayatta, and Barron, Circuit Judges.Jason Panzarino and The Law Office of Johanna Herrero on brief for petitioner. Allison Frayer, Trial Attorney, Office of Immigration Litigation, Civil Division, Chad A. Readler, Acting Assistant Attorney General, Civil Division, and Jessica A. Dawgert, Senior Litigation Counsel, Office of Immigration Litigation, Civil Division, on brief for respondent.
Petitioner Juan Ramirez Matias (“Ramirez”) challenges the Board of Immigration Appeals's (“BIA”) denial of his motion to exercise its sua sponte authority to reopen his case and grant his request for cancellation of removal. We find that even if we have jurisdiction to consider his appeal, we must still deny Ramirez's petition.BACKGROUNDRamirez is no stranger to this court: in 2014, he petitioned for review 

In [2]:
# Load a locally downloaded copy of the CSV.
# NOTE(review): hard-coded absolute path on one user's machine; this cell
# will not run elsewhere without editing it.
newdf= pd.read_csv('/Users/kevinmagana/Desktop/court_cases_data_test (3).csv')

In [7]:
# Preview the first rows of the locally loaded frame.
newdf.head()

Unnamed: 0,case_text,case_title,court,date,docket,tags,type_of_law,url,web_source
0,,US v. Marte De La Cruz,United States First Circuit,12/04/2017,16-2152P.01A,"Immigration Law, Criminal Law & Procedure",case,http://media.ca1.uscourts.gov/pdf.opinions/16-...,findlaw.com
1,"United States Court of Appeals, First Circuit....",Aguilar-Escoto v. Sessions,United States First Circuit,10/27/2017,16-1090,Immigration Law,case,http://caselaw.findlaw.com/us-1st-circuit/1878...,findlaw.com
2,"United States Court of Appeals, First Circuit....",Matias v. Sessions,United States First Circuit,09/08/2017,16-2474P.01A,"Civil Rights, Civil Procedure, Immigration Law...",case,http://caselaw.findlaw.com/us-1st-circuit/1873...,findlaw.com
3,"United States Court of Appeals, First Circuit....",US v. Cueto-Nunez,United States First Circuit,08/25/2017,16-1700P.01A,"Criminal Law & Procedure, Sentencing, Immigrat...",case,http://caselaw.findlaw.com/us-1st-circuit/1872...,findlaw.com
4,"United States Court of Appeals, First Circuit....",De Lima v. Sessions,United States First Circuit,08/16/2017,15-2453,"Immigration Law, Criminal Law & Procedure",case,http://caselaw.findlaw.com/us-1st-circuit/1871...,findlaw.com
