# Supreme Court Project 


## Step 1: Scraping

https://www.supremecourt.gov/oral_arguments/argument_transcript.aspx 

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [4]:
my_url = ('https://www.supremecourt.gov/oral_arguments/argument_transcript.aspx')
raw_html = urlopen(my_url).read()
soup_doc = BeautifulSoup(raw_html, "html.parser")

In [5]:
#for cases in 2016
my_table = soup_doc.find("table", attrs={"border": "1"})
cases = my_table.find_all('tr')

all_info = []
for case in cases:
    current = {}
    link = case.find_all('td')[0].find('a')
    case_name = case.find('span')
    case_date = case.find_all('td')[1].string
    docket_number = case.find_all('td')[0].find(target="_blank")
    if case_name:
        current['Script'] = link['href']
        current['Case Name'] = case_name.string
        current['Date Argued'] = case_date
        current['Docket Number'] = docket_number.string.strip()
        
        all_info.append(current)
        #print(current)
        #print("-----")
all_info

[{'Case Name': 'Perry v. Merit Systems Protection Bd.',
  'Date Argued': '04/17/17',
  'Docket Number': '16-399.',
  'Script': 'argument_transcripts/2016/16-399_3f14.pdf'},
 {'Case Name': 'Town of Chester v. Laroe Estates, Inc.',
  'Date Argued': '04/17/17',
  'Docket Number': '16-605.',
  'Script': 'argument_transcripts/2016/16-605_2dp3.pdf'},
 {'Case Name': "California Public Employees' Retirement System v. ANZ Securities, Inc.",
  'Date Argued': '04/17/17',
  'Docket Number': '16-373.',
  'Script': 'argument_transcripts/2016/16-373_4e46.pdf'},
 {'Case Name': 'Kokesh v. SEC',
  'Date Argued': '04/18/17',
  'Docket Number': '16-529.',
  'Script': 'argument_transcripts/2016/16-529_21p3.pdf'},
 {'Case Name': 'Henson v. Santander Consumer USA Inc.',
  'Date Argued': '04/18/17',
  'Docket Number': '16-349.',
  'Script': 'argument_transcripts/2016/16-349_e29g.pdf'},
 {'Case Name': 'Trinity Lutheran Church of Columbia, Inc. v. Comer',
  'Date Argued': '04/19/17',
  'Docket Number': '15-577.

In [6]:
len(all_info)

64

In [7]:
import pandas as pd
df = pd.DataFrame(all_info)

## Step 2: Starting Cleaning

In [8]:
df['PDF Filename'] = df['Script'].str.extract(r"(\d{2}-\d+_.{4})")
df['Docket Number'] = df['Docket Number'].str.replace(".","")
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Case Name,Date Argued,Docket Number,Script,PDF Filename
0,Perry v. Merit Systems Protection Bd.,04/17/17,16-399,argument_transcripts/2016/16-399_3f14.pdf,16-399_3f14
1,"Town of Chester v. Laroe Estates, Inc.",04/17/17,16-605,argument_transcripts/2016/16-605_2dp3.pdf,16-605_2dp3
2,California Public Employees' Retirement System...,04/17/17,16-373,argument_transcripts/2016/16-373_4e46.pdf,16-373_4e46
3,Kokesh v. SEC,04/18/17,16-529,argument_transcripts/2016/16-529_21p3.pdf,16-529_21p3
4,Henson v. Santander Consumer USA Inc.,04/18/17,16-349,argument_transcripts/2016/16-349_e29g.pdf,16-349_e29g


In [9]:
array = df['PDF Filename'].unique()
array.sort()

In [10]:
import re

In [11]:
f = open('/Volumes/Macintosh HD/Users/Elisa/Desktop/Supreme Court/pdfs/14-1055_h3dj.txt', 'r') #looping through replacing the numbers in 14-1055
sample_transcript = f.read()

In [12]:
clean_up = sample_transcript

In [13]:
#1. Alderson company stuff
clean_up_1 = re.sub("Alderson Reporting Company", "", clean_up, flags=re.IGNORECASE)
clean_up_1
clean_up_2 = re.sub("Official - Subject to Final Review", "", clean_up_1, flags=re.IGNORECASE)
clean_up_2
#2. Line numbers 1 - 25
clean_up_3 = re.sub("([0-9]|1[0-9]|2[0-5])", "", clean_up_2)
clean_up_3 
#3. get rid of \n\n\n  and time
clean_up_n = re.sub("(\n\n\x0c\n)", "", clean_up_3)
clean_up_n = re.sub("(\n\n.n)", "", clean_up_n)
clean_up_n = re.sub("(\n\n\n\n)", "", clean_up_n)
clean_up_n = re.sub("(\n\n)", "", clean_up_n)
clean_up_n = re.sub("(\n)", "", clean_up_n)
clean_up_n = re.sub("\\'", "'", clean_up_n)
clean_up_n = re.sub("\\'", "´", clean_up_n)
clean_up_time = re.sub(".: a.m..", "", clean_up_n)
clean_up_uppercase = re.sub("\w+ ARGUMENT[^a-z]+THE PETITIONERS", "", clean_up_time)
clean_up_uppercase_1 = re.sub("\w+ ARGUMENT[^a-z]+THE RESPONDENTS", "", clean_up_uppercase)
clean_up_uppercase_1
#4. Chop off the beginning before the dialogue begins
# --> working with split, find a place to split "Are you ready" "Thanks" 
clean_up = re.split("(PROCEEDINGS)", clean_up_time)
clean_up = clean_up[2]  #because it is a list, we can take the 3rd element [2]
#5. Chop off the end after the dialogue ends
clean_up = re.split("The case is submitted", clean_up) #it will always make a list !!!!
clean_up = clean_up[0] 

In [14]:
#Check your new variable to make sure it is clean
clean_up

'    CHIEF JUSTICE ROBERTS: We will hear  argument next in Case No. -, Lightfoot v. Cendant  Mortgage Corporation.  Mr. Rosenkranz.  ORAL ARGUMENT OF E. JOSHUA ROSENKRANZ  ON BEHALF OF THE PETITIONERS  MR. ROSENKRANZ: Thank you, Mr. Chief  Justice, and may it please the Court:  There is only one natural way to read the  language at issue here. A "court of competent  jurisdiction" is a court that has an independent source  of subject-matter jurisdiction. That is what this Court  has held five times those words mean. So let´s start  with the plain language.  The statute grants Freddie, quote, "The  power in its corporate name to sue and be sued in any  ´court of competent jurisdiction,´ State or Federal."  The only reference to jurisdiction in that passage is to  say that you don´t get to go to any Federal court or any  State court, but rather, you have to choose a court,  State or Federal, that must be a "court of competent  jurisdiction." And the only way to find out whether a  court i

### Get your dialogue list
Now this transcription should be clean enough to get a list with every speaker, and what the speaker said. The pattern for the speakers is fairly obvious--my recommendation is to do a split using groups (like the example I show above with "tomorrow and tomorrow").

If you write your regular expression correctly: you should get a single list in which each element is either a speaker, or what was said.

In [15]:
#get a list of speaker and speech
clean_up= re.split(r"([A-Z.\s]+:)", clean_up)
type(clean_up)

list

### Make it a list of pairs
If you got your list the way I recommended to, it is just single list with elements after element--you need to figure out how to change it so you pair the speaker with what is said. Give it some thought, there are a few ways to try to do this. If you made it this far, you're doing great!

In [16]:
#make it a list of pairs of speaker and speech
count = 0
text = []
for line in clean_up:
    count += 1
    if count %2 == 0:
        #print(line.strip('.  '))
        current = {'speaker': line.strip('.  '), 'speech': ''}
        print(current['speaker'])
    else: 
        current['speech'] = line
        print(current['speech'])
        text.append(current)    


CHIEF JUSTICE ROBERTS:
 We will hear  argument next in Case No. -, Lightfoot v. Cendant  Mortgage Corporation.  Mr. Rosenkranz
ORAL ARGUMENT OF E. JOSHUA ROSENKRANZ  ON BEHALF OF THE PETITIONERS  MR. ROSENKRANZ:
 Thank you, Mr. Chief  Justice, and may it please the Court:  There is only one natural way to read the  language at issue here. A "court of competent  jurisdiction" is a court that has an independent source  of subject-matter jurisdiction. That is what this Court  has held five times those words mean. So let´s start  with the plain language.  The statute grants Freddie, quote, "The  power in its corporate name to sue and be sued in any  ´court of competent jurisdiction,´ State or Federal."  The only reference to jurisdiction in that passage is to  say that you don´t get to go to any Federal court or any  State court, but rather, you have to choose a court,  State or Federal, that must be a "court of competent  jurisdiction." And the only way to find out whether a  court is a 

## Step 3: Loop through all texts

## Writing a function to loop through all texts

In [17]:
def my_function(sample_transcript):
    clean_up = sample_transcript
    clean_up_1 = re.sub("Alderson Reporting Company", "", clean_up, flags=re.IGNORECASE)
    clean_up_2 = re.sub("Official - Subject to Final Review", "", clean_up_1, flags=re.IGNORECASE)
    clean_up_3 = re.sub("([0-9]|1[0-9]|2[0-5])", "", clean_up_2)
    clean_up_n = re.sub("(\n\n\x0c\n)", "", clean_up_3)
    clean_up_n = re.sub("(\n\n.n)", "", clean_up_n)
    clean_up_n = re.sub("(\n\n\n\n)", "", clean_up_n)
    clean_up_n = re.sub("(\n\n)", "", clean_up_n)
    clean_up_n = re.sub("(\n)", "", clean_up_n)
    clean_up_n = re.sub("\\'", "'", clean_up_n)
    clean_up_time = re.sub(".: a.m..", "", clean_up_n)
    clean_up_uppercase = re.sub("\w+ ARGUMENT[^a-z]+THE PETITIONERS", "", clean_up_time)
    clean_up_uppercase_1 = re.sub("\w+ ARGUMENT[^a-z]+THE PETITIONER", "", clean_up_uppercase)
    clean_up_uppercase_2 = re.sub("\w+ ARGUMENT[^a-z]+THE RESPONDENTS", "", clean_up_uppercase_1)
    clean_up_uppercase_3 = re.sub("\w+ ARGUMENT [^a-z]+ (PETITIONER|RESPONDENT)S?", "", clean_up_uppercase_2)
    clean_up = re.split("(PROCEEDINGS)", clean_up_uppercase_3)
    clean_up = clean_up[2]  
    clean_up = re.split("above-entitled", clean_up) 
    clean_up = clean_up[0] 
    final = re.split(r"([A-Z][A-Z.\s]+):", clean_up)
    final.pop(0) #removing first element of the array
    married = list(zip(final[0::2], final[1::2])) #marrying every element and joining 
    return married

In [18]:
list_of_cases = []
path = '/Volumes/Macintosh HD/Users/Elisa/Desktop/Supreme Court/pdfs/'
for file_name in array:
    print(file_name)
    f = open(path + file_name + '.txt', 'rb')
    sample_transcript = f.read().decode('utf8', 'ignore')
    this_list = my_function(sample_transcript)   
    better_list = []
    for each in this_list:
        entry = list(each)
        entry.append(file_name)
        better_list.append(entry)
    this_list.append(file_name)
    list_of_cases.extend(better_list)

14-1055_h3dj
14-1538_j4ek
14-9496_feah
15-1031_6647
15-1039_bqm1
15-1111_ca7d
15-1189_6468
15-118_3e04
15-1191_igdj
15-1194_0861
15-1204_k536
15-1248_2dq3
15-1251_q86b
15-1256_d1o2
15-1262_l537
15-1293_o7jp
15-1358_7648
15-1391_5315
15-1406_d1of
15-1498_m647
15-1500_5g68
15-1503_3f14
15-214_l6hn
15-423_pnk0
15-457_gfbh
15-497_4g15
15-513_k5fm
15-537_ljgm
15-577_l64n
15-5991_21p3
15-606_5iel
15-628_p86a
15-649_l5gm
15-680_n648
15-7250_3eah
15-777_1b82
15-797_f2q3
15-8049_4f15
15-827_gfbh
15-8544_c1o2
15-866_j426
15-9260_bq7c
15-927_6j37
16-142_4gc5
16-149_bodg
16-240_nkp1
16-254_7lio
16-309_b97c
16-327_d18e
16-32_mlho
16-341_8njq
16-348_2cp3
16-349_e29g
16-369_8nka
16-373_4e46
16-399_3f14
16-405_9olb
16-466_4g15
16-5294_g314
16-529_21p3
16-54_7l48
16-605_2dp3
16-6219_7mio
16-74_p8k0


In [19]:
len(list_of_cases)

13566

In [20]:
import numpy as np
import pandas as pd
col_names = ['Speaker', 'Speech', 'PDF']
df_final = pd.DataFrame(list_of_cases, columns=col_names)
#!pwd = tells you where you are

In [21]:
#df_final.loc[5000:]

## Step 4: Additional Sources

In [22]:
#webpages with additional information
#http://scdb.wustl.edu/data.php
#scotusblog.com/case-files/terms
#https://en.wikipedia.org/wiki/Demographics_of_the_Supreme_Court_of_the_United_States
#https://www.supremecourt.gov/about/members_text.aspx

In [23]:
df_final.head()

Unnamed: 0,Speaker,Speech,PDF
0,CHIEF JUSTICE ROBERTS,"We will hear argument next in Case No. -, Li...",14-1055_h3dj
1,MR. ROSENKRANZ,"Thank you, Mr. Chief Justice, and may it ple...",14-1055_h3dj
2,JUSTICE GINSBURG,Does that include - you -- you said subject-m...,14-1055_h3dj
3,MR. ROSENKRANZ,I -- I am not limiting it to subject-matter ...,14-1055_h3dj
4,JUSTICE GINSBURG,What did you do -- what does Justice Souter's...,14-1055_h3dj


In [24]:
cases = df_final[['PDF']]
cases.drop_duplicates(subset='PDF', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
caseslist = []
def cases_in_list(row):
    casesdict = {}
    casesdict['cases_id'] = row['PDF']
    casesdict['cases_word_count'] = ''
    return caseslist.append(casesdict)
cases.apply(cases_in_list, axis=1)
caseslist

[{'cases_id': '14-1055_h3dj', 'cases_word_count': ''},
 {'cases_id': '14-1538_j4ek', 'cases_word_count': ''},
 {'cases_id': '14-9496_feah', 'cases_word_count': ''},
 {'cases_id': '15-1031_6647', 'cases_word_count': ''},
 {'cases_id': '15-1039_bqm1', 'cases_word_count': ''},
 {'cases_id': '15-1111_ca7d', 'cases_word_count': ''},
 {'cases_id': '15-1189_6468', 'cases_word_count': ''},
 {'cases_id': '15-118_3e04', 'cases_word_count': ''},
 {'cases_id': '15-1191_igdj', 'cases_word_count': ''},
 {'cases_id': '15-1194_0861', 'cases_word_count': ''},
 {'cases_id': '15-1204_k536', 'cases_word_count': ''},
 {'cases_id': '15-1248_2dq3', 'cases_word_count': ''},
 {'cases_id': '15-1251_q86b', 'cases_word_count': ''},
 {'cases_id': '15-1256_d1o2', 'cases_word_count': ''},
 {'cases_id': '15-1262_l537', 'cases_word_count': ''},
 {'cases_id': '15-1293_o7jp', 'cases_word_count': ''},
 {'cases_id': '15-1358_7648', 'cases_word_count': ''},
 {'cases_id': '15-1391_5315', 'cases_word_count': ''},
 {'cases_id

In [30]:
df_final['word_count'] = df_final['Speech'].apply(lambda x: len(x.split())) 
df_final.head()

Unnamed: 0,Speaker,Speech,PDF,word_count
0,CHIEF JUSTICE ROBERTS,"We will hear argument next in Case No. -, Li...",14-1055_h3dj,16
1,MR. ROSENKRANZ,"Thank you, Mr. Chief Justice, and may it ple...",14-1055_h3dj,156
2,JUSTICE GINSBURG,Does that include - you -- you said subject-m...,14-1055_h3dj,33
3,MR. ROSENKRANZ,I -- I am not limiting it to subject-matter ...,14-1055_h3dj,257
4,JUSTICE GINSBURG,What did you do -- what does Justice Souter's...,14-1055_h3dj,75


In [32]:
df_final_justice = df_final[df_final['Speaker'].str.contains('JUSTICE')]
df_final_justice.groupby(['Speaker', 'PDF'])['word_count'].sum().to_frame().reset_index(level=0).head()

Unnamed: 0_level_0,Speaker,word_count
PDF,Unnamed: 1_level_1,Unnamed: 2_level_1
15-1039_bqm1,A. JUSTICE BREYER,4
16-74_p8k0,A. JUSTICE KAGAN,32
15-1039_bqm1,A. JUSTICE KENNEDY,3
15-797_f2q3,AAMR . JUSTICE SOTOMAYOR,28
14-1055_h3dj,CHIEF JUSTICE ROBERTS,271


# Step 5: Introducing the Gender Perspective

In [33]:
listempty = []
def build_diction(row):
    dictempty = {}
    search_jus = re.findall('JUSTICE.*', str(row['Speaker']))
    if search_jus:
        dictempty['PDF'] = row['PDF']
        dictempty['Speaker'] = search_jus[0]
        dictempty['word_count'] = row['word_count']
        judges = ['JUSTICE ROBERTS', 'JUSTICE GINSBURG', 'JUSTICE BREYER', 'JUSTICE SOTOMAYOR','JUSTICE KENNEDY','JUSTICE KAGAN', 'JUSTICE ALITO', 'JUSTICE GORSUCH',' JUSTICE THOMAS']
        sexes = ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M']
        for judge in judges:
            if dictempty['Speaker'] == judge:
                dictempty['Gender'] = sexes[judges.index(judge)]
        listempty.append(dictempty)
    
df_final.apply(build_diction, axis=1)
listempty

[{'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE ROBERTS',
  'word_count': 16},
 {'Gender': 'F',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE GINSBURG',
  'word_count': 33},
 {'Gender': 'F',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE GINSBURG',
  'word_count': 75},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 60},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 42},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 13},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 1},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 86},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 4},
 {'Gender': 'M',
  'PDF': '14-1055_h3dj',
  'Speaker': 'JUSTICE BREYER',
  'word_count': 32},
 {'Gender': 'F',
  'PDF': '14-1055_h3dj',
  'Speaker': 'J

In [34]:
Genderlist = pd.DataFrame(listempty)

In [35]:
newlist = Genderlist.groupby(['PDF', 'Speaker', 'Gender']).agg({'word_count':sum}).reset_index()
newlist['Text'] = newlist['Speaker'] + ' is saying ' + newlist['word_count'].map(str) + ' words during the hearing. '
newlist.head()

Unnamed: 0,PDF,Speaker,Gender,word_count,Text
0,14-1055_h3dj,JUSTICE BREYER,M,1061,JUSTICE BREYER is saying 1061 words during the...
1,14-1055_h3dj,JUSTICE GINSBURG,F,557,JUSTICE GINSBURG is saying 557 words during th...
2,14-1055_h3dj,JUSTICE KAGAN,F,10,JUSTICE KAGAN is saying 10 words during the he...
3,14-1055_h3dj,JUSTICE KENNEDY,M,13,JUSTICE KENNEDY is saying 13 words during the ...
4,14-1055_h3dj,JUSTICE ROBERTS,M,271,JUSTICE ROBERTS is saying 271 words during the...


In [36]:
df_justicecount = newlist.groupby('PDF')['Text'].apply(lambda x: '<p>%s</p>'%'</p><p>'.join(x)).reset_index(name='article')

In [37]:
df_gendercounts = newlist.groupby(['PDF', 'Gender'])['word_count'].sum().reset_index()

# Step 6: Calculating Proportions

In [38]:
df_ready = df_gendercounts.groupby('PDF')['word_count'].sum().reset_index(name = 'sum')
df_merge = df_ready.merge(df_gendercounts,left_on='PDF', right_on='PDF')
df_merge['proportion'] = df_merge['word_count']/df_merge['sum'] *100

In [39]:
df_merge.head(8)

Unnamed: 0,PDF,sum,Gender,word_count,proportion
0,14-1055_h3dj,2269,F,924,40.722785
1,14-1055_h3dj,2269,M,1345,59.277215
2,14-1538_j4ek,2780,F,871,31.330935
3,14-1538_j4ek,2780,M,1909,68.669065
4,14-9496_feah,4172,F,2064,49.472675
5,14-9496_feah,4172,M,2108,50.527325
6,15-1031_6647,3063,F,1129,36.859288
7,15-1031_6647,3063,M,1934,63.140712


In [41]:
listofdictionaries = []

def buildanewlist(row):
    emptydictionary = {}
    emptydictionary['case_id'] = row['PDF']
    emptydictionary['sum'] = row['sum']
    emptydictionary['Gender'] = row['Gender']
    emptydictionary['word_count'] = row['word_count']
    emptydictionary['proportion'] = row['proportion']
    listofdictionaries.append(emptydictionary)
    
df_merge.apply(buildanewlist, axis=1)
listofdictionaries

[{'Gender': 'F',
  'case_id': '14-1055_h3dj',
  'proportion': 40.72278536800353,
  'sum': 2269,
  'word_count': 924},
 {'Gender': 'M',
  'case_id': '14-1055_h3dj',
  'proportion': 59.27721463199648,
  'sum': 2269,
  'word_count': 1345},
 {'Gender': 'F',
  'case_id': '14-1538_j4ek',
  'proportion': 31.330935251798557,
  'sum': 2780,
  'word_count': 871},
 {'Gender': 'M',
  'case_id': '14-1538_j4ek',
  'proportion': 68.66906474820144,
  'sum': 2780,
  'word_count': 1909},
 {'Gender': 'F',
  'case_id': '14-9496_feah',
  'proportion': 49.47267497603068,
  'sum': 4172,
  'word_count': 2064},
 {'Gender': 'M',
  'case_id': '14-9496_feah',
  'proportion': 50.52732502396932,
  'sum': 4172,
  'word_count': 2108},
 {'Gender': 'F',
  'case_id': '15-1031_6647',
  'proportion': 36.85928827946457,
  'sum': 3063,
  'word_count': 1129},
 {'Gender': 'M',
  'case_id': '15-1031_6647',
  'proportion': 63.14071172053543,
  'sum': 3063,
  'word_count': 1934},
 {'Gender': 'F',
  'case_id': '15-1039_bqm1',
  '

In [42]:
len(listofdictionaries)

128

# Step 7: Building a list with dictionaries to get cases as rows

In [43]:
uniquelist = []

i = -1
for something in listofdictionaries:
    i += 1
    anewdictionary = {}
    try:
        if listofdictionaries[i]['case_id'] == listofdictionaries[i+1]['case_id']:
            anewdictionary['case_id'] = listofdictionaries[i]['case_id'].split('_')[0]
            anewdictionary['word_count_sum'] = listofdictionaries[i]['sum']
            if listofdictionaries[i]['Gender'] == 'M':
                anewdictionary['word_count_M'] = listofdictionaries[i]['word_count']
            else:
                anewdictionary['word_count_M'] = listofdictionaries[i+1]['word_count']
            if listofdictionaries[i]['Gender'] == 'F':
                anewdictionary['word_count_F'] = listofdictionaries[i]['word_count']
            else:
                anewdictionary['word_count_F'] = listofdictionaries[i+1]['word_count']
            anewdictionary['proportions_F'] = round((anewdictionary['word_count_F'] / anewdictionary['word_count_sum']) * 100, 2)
            anewdictionary['proportions_M'] = round((anewdictionary['word_count_M'] / anewdictionary['word_count_sum']) * 100, 2)
            uniquelist.append(anewdictionary)
    except:
        pass
    
uniquelist

[{'case_id': '14-1055',
  'proportions_F': 40.72,
  'proportions_M': 59.28,
  'word_count_F': 924,
  'word_count_M': 1345,
  'word_count_sum': 2269},
 {'case_id': '14-1538',
  'proportions_F': 31.33,
  'proportions_M': 68.67,
  'word_count_F': 871,
  'word_count_M': 1909,
  'word_count_sum': 2780},
 {'case_id': '14-9496',
  'proportions_F': 49.47,
  'proportions_M': 50.53,
  'word_count_F': 2064,
  'word_count_M': 2108,
  'word_count_sum': 4172},
 {'case_id': '15-1031',
  'proportions_F': 36.86,
  'proportions_M': 63.14,
  'word_count_F': 1129,
  'word_count_M': 1934,
  'word_count_sum': 3063},
 {'case_id': '15-1039',
  'proportions_F': 40.12,
  'proportions_M': 59.88,
  'word_count_F': 1491,
  'word_count_M': 2225,
  'word_count_sum': 3716},
 {'case_id': '15-1111',
  'proportions_F': 60.39,
  'proportions_M': 39.61,
  'word_count_F': 1836,
  'word_count_M': 1204,
  'word_count_sum': 3040},
 {'case_id': '15-1189',
  'proportions_F': 11.52,
  'proportions_M': 88.48,
  'word_count_F': 21

In [37]:
len(uniquelist)

64

# Step 8: Final Dataframe

In [44]:
ready_to_geo = pd.DataFrame(uniquelist)
ready_to_geo.head()

Unnamed: 0,case_id,proportions_F,proportions_M,word_count_F,word_count_M,word_count_sum
0,14-1055,40.72,59.28,924,1345,2269
1,14-1538,31.33,68.67,871,1909,2780
2,14-9496,49.47,50.53,2064,2108,4172
3,15-1031,36.86,63.14,1129,1934,3063
4,15-1039,40.12,59.88,1491,2225,3716


In [45]:
geometry = pd.read_csv('LonLan - Supreme Court State Information.csv')
geometry.head()

Unnamed: 0,Docket Number,State,Date Argued,Date Decided,Area*,Decision,Status,Court Leaning,Previous Court,City,Latitude,Longitude
0,14-1055,California,06/28/16,01/18/17,Economic Activity,8-0,affirmed,Major Majority,9th Circuit,"Pasadena, California",34.147785,-118.144516
1,14-1538,Wisconsin,06/27/16,02/22/17,Economic Activity,7-0,reversed,Major Majority,District Court for the Western District of Wis...,"Madison, Wisconsin",43.073052,-89.40123
2,14-9496,Illinois,01/15/16,03/21/17,Civil Rights,6-2,affirmed,Major Majority,7th Circuit,"Joilet, Illinois",41.525031,-88.081725
3,15-118,Texas,10/11/16,06/26/17,Immigration,,vacated,Vacated,5th Circuit,"San Antonio, Texas",29.424122,-98.493628
4,15-214,Wisconsin,01/15/16,06/23/17,Economic Activity,5-3,affirmed,Liberal,Wisconsin Court Of Appeals District III,"St. Croix County, Wisconsin",45.049784,-92.387569


In [46]:
geometry = geometry[['Docket Number', 'Latitude', 'Longitude', 'State', 'Area*', 'Status', 'Court Leaning', 'City', 'Decision']]
geometry.head()

Unnamed: 0,Docket Number,Latitude,Longitude,State,Area*,Status,Court Leaning,City,Decision
0,14-1055,34.147785,-118.144516,California,Economic Activity,affirmed,Major Majority,"Pasadena, California",8-0
1,14-1538,43.073052,-89.40123,Wisconsin,Economic Activity,reversed,Major Majority,"Madison, Wisconsin",7-0
2,14-9496,41.525031,-88.081725,Illinois,Civil Rights,affirmed,Major Majority,"Joilet, Illinois",6-2
3,15-118,29.424122,-98.493628,Texas,Immigration,vacated,Vacated,"San Antonio, Texas",
4,15-214,45.049784,-92.387569,Wisconsin,Economic Activity,affirmed,Liberal,"St. Croix County, Wisconsin",5-3


In [47]:
df_ready = ready_to_geo.merge(geometry, left_on='case_id', right_on='Docket Number')
df_ready.head()

Unnamed: 0,case_id,proportions_F,proportions_M,word_count_F,word_count_M,word_count_sum,Docket Number,Latitude,Longitude,State,Area*,Status,Court Leaning,City,Decision
0,14-1055,40.72,59.28,924,1345,2269,14-1055,34.147785,-118.144516,California,Economic Activity,affirmed,Major Majority,"Pasadena, California",8-0
1,14-1538,31.33,68.67,871,1909,2780,14-1538,43.073052,-89.40123,Wisconsin,Economic Activity,reversed,Major Majority,"Madison, Wisconsin",7-0
2,14-9496,49.47,50.53,2064,2108,4172,14-9496,41.525031,-88.081725,Illinois,Civil Rights,affirmed,Major Majority,"Joilet, Illinois",6-2
3,15-1031,36.86,63.14,1129,1934,3063,15-1031,33.453821,-112.069434,Arizona,Economic Activity,Reversed and Remanded,Major Majority,"Pheonix, Arizona",8-0
4,15-1039,40.12,59.88,1491,2225,3716,15-1039,34.170561,-118.837594,"Washington, D.C.",Economic Activity,"Vacated in part, Reversed in part and Remanded",Major Majority,"Thousand Oaks, California",8-0


# Step 9: Geojson

In [48]:
import geopandas as gpd
from shapely.geometry import Point

In [49]:
def make_point(row):
    return Point(row.Longitude, row.Latitude)

df_ready['geometry'] = df_ready.apply(make_point, axis=1)

In [50]:
df_ready.head()

Unnamed: 0,case_id,proportions_F,proportions_M,word_count_F,word_count_M,word_count_sum,Docket Number,Latitude,Longitude,State,Area*,Status,Court Leaning,City,Decision,geometry
0,14-1055,40.72,59.28,924,1345,2269,14-1055,34.147785,-118.144516,California,Economic Activity,affirmed,Major Majority,"Pasadena, California",8-0,POINT (-118.144516 34.147785)
1,14-1538,31.33,68.67,871,1909,2780,14-1538,43.073052,-89.40123,Wisconsin,Economic Activity,reversed,Major Majority,"Madison, Wisconsin",7-0,POINT (-89.40123 43.073052)
2,14-9496,49.47,50.53,2064,2108,4172,14-9496,41.525031,-88.081725,Illinois,Civil Rights,affirmed,Major Majority,"Joilet, Illinois",6-2,POINT (-88.08172500000001 41.525031)
3,15-1031,36.86,63.14,1129,1934,3063,15-1031,33.453821,-112.069434,Arizona,Economic Activity,Reversed and Remanded,Major Majority,"Pheonix, Arizona",8-0,POINT (-112.069434 33.453821)
4,15-1039,40.12,59.88,1491,2225,3716,15-1039,34.170561,-118.837594,"Washington, D.C.",Economic Activity,"Vacated in part, Reversed in part and Remanded",Major Majority,"Thousand Oaks, California",8-0,POINT (-118.837594 34.170561)


In [51]:
df_ready = gpd.GeoDataFrame(df_ready)

In [61]:
df_ready.to_file('Supreme_Court_ready.json', driver='GeoJSON')