# Supreme Court Project



# Step 1: Scraping and creating a list of dictionaries

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
# Download the oral-argument transcript index page and parse it.
my_url = ('https://www.supremecourt.gov/oral_arguments/argument_transcript.aspx')
# The context manager closes the HTTP response once the body has been read,
# instead of leaving the connection open for the life of the kernel.
with urlopen(my_url) as response:
    raw_html = response.read()
soup_doc = BeautifulSoup(raw_html, "html.parser")

In [17]:
# Scraping for cases in 2016: build one dict per case row of the transcript table.
my_table = soup_doc.find("table", attrs={"border": "1"})
cases = my_table.find_all('tr')

all_info = []
for case in cases:
    cells = case.find_all('td')
    # Rows with fewer than two <td> cells (e.g. a header row, if present)
    # cannot be indexed below; skip them instead of raising IndexError.
    if len(cells) < 2:
        continue

    # Only rows that carry a <span> (the case name) describe a case.
    case_name = case.find('span')
    if not case_name:
        continue

    link = cells[0].find('a')                       # anchor whose href is the transcript path
    docket_number = cells[0].find(target="_blank")  # element holding the docket number text
    case_date = cells[1].string

    current = {
        'Script': link['href'],
        'Case Name': case_name.string,
        'Date Argued': case_date,
        'Docket Number': docket_number.string.strip(),
    }
    all_info.append(current)

# Step 2: Cleaning up the data

In [5]:
import pandas as pd
# One row per scraped case.
df = pd.DataFrame(all_info)
# Stem of the transcript PDF taken from the link path (e.g. "16-399_3f14").
# expand=False makes extract return a Series, so a single column is assigned.
df['PDF Filename'] = df['Script'].str.extract(r"(\d{2}-\d+_.{4})", expand=False)
# regex=False: "." is a literal period to strip, not the regex wildcard.
# Without it the result depends on the pandas version's default for `regex`.
df['Docket Number'] = df['Docket Number'].str.replace(".", "", regex=False)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Case Name,Date Argued,Docket Number,Script,PDF Filename
0,Perry v. Merit Systems Protection Bd.,04/17/17,16-399,argument_transcripts/2016/16-399_3f14.pdf,16-399_3f14
1,"Town of Chester v. Laroe Estates, Inc.",04/17/17,16-605,argument_transcripts/2016/16-605_2dp3.pdf,16-605_2dp3
2,California Public Employees' Retirement System...,04/17/17,16-373,argument_transcripts/2016/16-373_4e46.pdf,16-373_4e46
3,Kokesh v. SEC,04/18/17,16-529,argument_transcripts/2016/16-529_21p3.pdf,16-529_21p3
4,Henson v. Santander Consumer USA Inc.,04/18/17,16-349,argument_transcripts/2016/16-349_e29g.pdf,16-349_e29g


In [6]:
# Sorted, de-duplicated transcript file stems.
# dropna(): a 'Script' value that did not match the filename pattern yields NaN,
# which cannot be sorted alongside strings nor concatenated into a file path.
array = df['PDF Filename'].dropna().unique()
array.sort()

In [9]:
import re
# Load one transcript as a sample; Step 3 repeats this for every file stem
# in place of 14-1055_h3dj. The `with` block closes the file after reading.
with open('/Volumes/Macintosh HD/Users/Elisa/Desktop/Supreme Court/pdfs/14-1055_h3dj.txt', 'r') as f:
    sample_transcript = f.read()

In [18]:
#sample_transcript

In [11]:
# Module-level alias of the sample transcript text.
# NOTE(review): my_function assigns its own local `clean_up`, so this global
# looks unused - confirm before removing.
clean_up = sample_transcript

In [12]:
def my_function(sample_transcript):
    """Split raw transcript text into a list of (speaker, speech) tuples.

    The text is cleaned in a fixed order (page banners, digits, newlines and
    form feeds, time stamp, argument headings). The part after the first
    "PROCEEDINGS" marker and before the first "above-entitled" is then split
    on upper-case speaker labels that end with ":".

    Parameters
    ----------
    sample_transcript : str
        Full text of one transcript.

    Returns
    -------
    list of tuple(str, str)
        (speaker label, text that follows it) pairs, in document order.

    Raises
    ------
    IndexError
        If the cleaned text contains no "PROCEEDINGS" marker.

    Notes
    -----
    NOTE(review): the digit pattern removes *every* digit in the text (its
    first alternative already matches any single digit), so numbers inside
    speeches are lost as well. It is kept as-is because the time-stamp
    pattern below only matches once the digits around ":" are gone -
    confirm against the transcript layout before narrowing it.
    """
    text = sample_transcript

    # Page boilerplate, matched case-insensitively.
    for banner in ("Alderson Reporting Company",
                   "Official - Subject to Final Review"):
        text = re.sub(banner, "", text, flags=re.IGNORECASE)

    # Digit removal (see NOTE above).
    text = re.sub(r"([0-9]|1[0-9]|2[0-5])", "", text)

    # Layout characters, from the most specific sequence to a single newline.
    for layout in ("(\n\n\x0c\n)", "(\n\n.n)", "(\n\n\n\n)", "(\n\n)", "(\n)"):
        text = re.sub(layout, "", text)

    # As a regex, \' matches a plain apostrophe, so this replaces ' with '.
    text = re.sub(r"\'", "'", text)

    # Time stamp left over after digit removal, e.g. "(: a.m.)".
    text = re.sub(".: a.m..", "", text)

    # Upper-case argument headings; raw strings keep "\w" a regex escape
    # rather than an invalid Python string escape.
    for heading in (r"\w+ ARGUMENT[^a-z]+THE PETITIONERS",
                    r"\w+ ARGUMENT[^a-z]+THE PETITIONER",
                    r"\w+ ARGUMENT[^a-z]+THE RESPONDENTS",
                    r"\w+ ARGUMENT [^a-z]+ (PETITIONER|RESPONDENT)S?"):
        text = re.sub(heading, "", text)

    # The capturing group keeps the marker itself at index 1, so the text
    # that follows the first marker sits at index 2.
    parts = re.split("(PROCEEDINGS)", text)
    if len(parts) < 3:
        raise IndexError("no 'PROCEEDINGS' marker found in transcript")
    body = re.split("above-entitled", parts[2])[0]

    # Splitting on a captured speaker label yields
    # [preamble, speaker, speech, speaker, speech, ...].
    final = re.split(r"([A-Z][A-Z.\s]+):", body)
    final.pop(0)  # drop the preamble before the first speaker label
    return list(zip(final[0::2], final[1::2]))

# Step 3: Including all cases

In [13]:
# Parse every transcript into [speaker, speech, file stem] rows.
list_of_cases = []
path = '/Volumes/Macintosh HD/Users/Elisa/Desktop/Supreme Court/pdfs/'
# Transcript stems left out of the corpus.
# NOTE(review): presumably these are the files my_function cannot parse - confirm.
skipped = {
    '15-1358_7648', '15-577_l64n', '15-866_j426',
    '16-32_mlho', '16-466_4g15', '16-529_21p3',
}
for file_name in array:
    if file_name in skipped:
        continue
    # The context manager closes each transcript file after reading, so the
    # loop does not accumulate open file handles.
    with open(path + file_name + '.txt', 'r') as f:
        sample_transcript = f.read()
    this_list = my_function(sample_transcript)
    # Tag every (speaker, speech) pair with the file it came from.
    list_of_cases.extend(
        [speaker, speech, file_name] for speaker, speech in this_list
    )

In [15]:
import numpy as np
# Tabulate the parsed utterances: who spoke, what was said, and the
# transcript file stem each row came from.
col_names = ['Speaker', 'Speech', 'PDF']
df_final = pd.DataFrame(data=list_of_cases, columns=col_names)
df_final.head(5)

Unnamed: 0,Speaker,Speech,PDF
0,CHIEF JUSTICE ROBERTS,"We will hear argument next in Case No. -, Li...",14-1055_h3dj
1,MR. ROSENKRANZ,"Thank you, Mr. Chief Justice, and may it ple...",14-1055_h3dj
2,JUSTICE GINSBURG,Does that include - you -- you said subject-m...,14-1055_h3dj
3,MR. ROSENKRANZ,I -- I am not limiting it to subject-matter ...,14-1055_h3dj
4,JUSTICE GINSBURG,What did you do -- what does Justice Souter's...,14-1055_h3dj


# Step 4: Merging the two dataframes

In [16]:
# Attach the scraped case metadata to every utterance by matching the
# transcript file stem on both sides (inner join, the merge default).
combined_df = pd.merge(
    df_final,
    df,
    how='inner',
    left_on='PDF',
    right_on='PDF Filename',
)
combined_df.head()

Unnamed: 0,Speaker,Speech,PDF,Case Name,Date Argued,Docket Number,Script,PDF Filename
0,CHIEF JUSTICE ROBERTS,"We will hear argument next in Case No. -, Li...",14-1055_h3dj,Lightfoot v. Cendant Mortgage,11/08/16,14-1055,argument_transcripts/2016/14-1055_h3dj.pdf,14-1055_h3dj
1,MR. ROSENKRANZ,"Thank you, Mr. Chief Justice, and may it ple...",14-1055_h3dj,Lightfoot v. Cendant Mortgage,11/08/16,14-1055,argument_transcripts/2016/14-1055_h3dj.pdf,14-1055_h3dj
2,JUSTICE GINSBURG,Does that include - you -- you said subject-m...,14-1055_h3dj,Lightfoot v. Cendant Mortgage,11/08/16,14-1055,argument_transcripts/2016/14-1055_h3dj.pdf,14-1055_h3dj
3,MR. ROSENKRANZ,I -- I am not limiting it to subject-matter ...,14-1055_h3dj,Lightfoot v. Cendant Mortgage,11/08/16,14-1055,argument_transcripts/2016/14-1055_h3dj.pdf,14-1055_h3dj
4,JUSTICE GINSBURG,What did you do -- what does Justice Souter's...,14-1055_h3dj,Lightfoot v. Cendant Mortgage,11/08/16,14-1055,argument_transcripts/2016/14-1055_h3dj.pdf,14-1055_h3dj
