# Get RI case data

In [314]:
import os
# Input PDFs and output data both live in directories that are siblings of
# the current working directory, so resolve the shared parent only once.
_parent_dir = os.path.dirname(os.getcwd())
pdf_ri_path = _parent_dir + "/pdf_ri_criminal_cases"
save_path = _parent_dir + "/data"

## Extracting text from PDFs

In [315]:
import re
import pdftotext

# One entry per readable PDF: [file name, case title, list of paragraphs].
pdf_title_case = []


def _strip_page_number(page):
    """Return `page` without its trailing page-number line, if it has one.

    A page-number line is the last newline-terminated line of the page when,
    after stripping whitespace, it both starts and ends with "-".
    Pages without such a line (including pages whose last line is blank or
    that contain no newline at all) are returned unchanged instead of raising
    IndexError.
    """
    parts = page.rsplit("\n", 2)
    if len(parts) >= 2:
        footer = parts[1].strip()
        # startswith/endswith are safe on an empty string, unlike footer[0].
        if footer.startswith("-") and footer.endswith("-"):
            return parts[0] + "\n"
    return page


def _find_title(filename, title_rows):
    """Return the title listed for `filename` in `title_rows`, or None.

    Each row is a list of README columns; a row matches when one of its
    columns equals `filename`, and the title is taken from the row's second
    column. When several rows match, the last one wins. Rows too short to
    hold a title are ignored.
    """
    found = None
    for row in title_rows:
        if filename in row and len(row) > 1:
            found = row[1]
    return found


for year in range(2008, 2018):
    # Each court term is stored in a directory named "<year> - <year+1>".
    path = pdf_ri_path + "/" + str(year) + " - " + str(year+1)

    # load case titles: every README line is split on tabs and its first
    # column is dropped.
    # NOTE(review): lines are not stripped, so the last column of each row
    # keeps its trailing newline - confirm whether that is intended.
    pdf_title = []
    with open(path + "/README.md", "r") as pdf_list:
        for line in pdf_list:
            pdf_title.append(line.split('\t')[1:])

    # read file names in this year; sorted so that the order of the extracted
    # cases is reproducible (os.listdir returns names in arbitrary order)
    filenames = sorted(os.listdir(path))

    # load pdf files
    for filename in filenames:
        # excluding non-pdf files
        if not filename.endswith('.pdf'):
            continue

        # get pdf object and extract text from pdf object
        with open(path + "/" + filename, "rb") as f:
            try:
                pdf_obj = pdftotext.PDF(f)
            except Exception:
                # Best effort: report the unreadable file and move on.
                # `Exception` (rather than a bare except) lets
                # KeyboardInterrupt / SystemExit still stop the cell.
                print(filename + " is damaged")
                pdf_obj = []

        # remove page number from every non-empty page
        pages = [_strip_page_number(page) for page in pdf_obj if page]
        text = ''.join(pages)
        if not text:
            continue

        # split a case into paragraphs (runs of 2+ whitespace characters)
        paragraphs = re.split(r'\s{2,}', text)

        # find case title; looked up fresh for every file so that a PDF
        # missing from the README never inherits the previous file's title
        title = _find_title(filename, pdf_title)
        if title is None:
            print(filename + " has no title in README.md")
            title = "N/A"

        pdf_title_case.append([filename, title, paragraphs])


State v. Gilbert (Opinion).pdf is damaged
State v. Tower (Opinion).pdf is damaged


## Formatting data from PDFs

In [316]:
# Build one record per extracted case and label its decision from the text
# that follows the last paragraph mentioning "Conclusion".
cases_data = []
for source_file, heading, raw_paragraphs in pdf_title_case:
    flat_paragraphs = [chunk.replace('\n', ' ') for chunk in raw_paragraphs]

    # Distance from the end of the last paragraph containing "Conclusion".
    # 0 doubles as "not found": it is also what a match in the very last
    # paragraph yields, and in that situation no paragraph follows the match.
    offset_from_end = next(
        (pos for pos, chunk in enumerate(reversed(raw_paragraphs))
         if "Conclusion" in chunk),
        0,
    )

    if offset_from_end == 0:
        # No usable conclusion located: the case is labelled "affirmed".
        decision = "affirmed"
    else:
        # len - offset is the index of the paragraph right AFTER the match
        # (the match itself sits at len - 1 - offset).
        following = flat_paragraphs[len(flat_paragraphs) - offset_from_end].lower()
        if "affirm in part" in following or "granted in part" in following:
            decision = "affirm in part"
        elif "affirm" in following:
            decision = "affirmed"
        else:
            decision = "not affirmed"

    cases_data.append({
        'file name': source_file,
        'title': heading,
        'type': "criminal",
        'decision': decision,
        'text': flat_paragraphs,
    })

## Save the formatted data

In [317]:
import csv
import json

# write to a json file
with open(save_path + '/cases_ri.json', 'w') as fout:
    json.dump(cases_data, fout)

# write to a csv file; the columns are the keys of the first record
if cases_data:
    keys = list(cases_data[0].keys())
    # newline='' is required by the csv module: the writer emits its own line
    # endings, and without it some platforms produce a blank row after every
    # record.
    with open(save_path + '/cases_ri.csv', 'w', newline='') as fout:
        dict_writer = csv.DictWriter(fout, keys)
        dict_writer.writeheader()
        dict_writer.writerows(cases_data)
else:
    # Without at least one record there is no header to derive, so say so
    # explicitly instead of failing with IndexError on cases_data[0].
    print("No cases to write; skipping " + save_path + '/cases_ri.csv')
