In [1]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

In [4]:
#Set up the pdf path
#NOTE(review): this is an absolute, machine-specific path, so the notebook only runs where
#this exact file exists.
pdf_path = '/Users/johnathonsmith/Downloads/Juvenile_Case_Reports/MJU as of 8 24 23.pdf'

In [5]:
#Read the text of every page of the PDF at pdf_path into the string `text`.

#Resource manager shared by the converter and the interpreter (handles pdf content: text, images, etc.)
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Converter that turns each laid-out page into plain text
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Page interpreter that feeds pages to the converter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #Every page is written to the same buffer, so the text accumulates across pages
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Text of the whole document. Must be read before the buffer is closed.
    text = fake_file_handle.getvalue()
finally:
    #Release the handles even if the PDF could not be opened or parsed
    converter.close()
    fake_file_handle.close()

In [6]:
#Inspect the raw extracted text
#NOTE(review): this echoes the whole report, respondent names included -- clear the output before sharing.
text

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                                                \n\n453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS           

# Isolate Header

In [7]:
#The first 420 characters hold the report title lines and the column captions.
#420 is a hand-picked offset: it falls inside the run of spaces that follows 'OFFENSE(S)',
#so the cut does not split any visible text (see the output below).
header = text[:420]
header

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                    '

# Isolate Body

In [9]:
#Everything after the 420-character header: the case lines plus the trailing total line
body = text[420:]
body

'                            \n\n453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT     

In [10]:
#Trim the leading and trailing whitespace from both pieces in one step
header, body = header.strip(), body.strip()

In [11]:
#Inspect the stripped header
header

'COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)'

In [12]:
#Inspect the stripped body
body

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            PEDRO MARTINEZ           

# Remove Subsequent Headers

In [19]:
#Remove the header that is repeated at the top of every page after the first.
#Each repeat follows a page break (\x0c) and ends with the 'OFFENSE(S)' caption, so the pattern
#spans from the page break to that caption and still matches if the court or clerk text changes.
#NOTE(review): the current reports are a single page, so this has only been checked against
#a header with a page break added in front of it.
#Substitute a newline rather than nothing so the line before the page break can never be
#joined onto the first line after the header; blank lines are skipped later on.
new_body = re.sub(r"""\n\x0c\s*[A-Z0-9 \n/#\:-]*\(S\)\s*\n\n""", '\n', body)
new_body

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            PEDRO MARTINEZ           

# Split Into Individual Cases

In [20]:
#One list entry per line of the report body. Blank lines become empty strings.
cases = new_body.split('\n')
cases

['453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   ',
 '454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       ',
 '455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 ',
 '456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                ',
 '457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                ',
 '458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  ',
 '459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         ',
 '2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  ',
 '3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            

# Drop the Last Case

The last line is not a case; it is just the report's total case count.

In [21]:
#Look at the final line of the report body
cases[-1]

'TOTAL JUVENILE CASES LISTED: 24'

In [22]:
#Drop the trailing total-count line.
#The guard makes the cell safe to re-run: a bare pop() would remove one more line on every run.
if cases and cases[-1].startswith('TOTAL'):
    cases.pop()
cases[-1]

''

# Find the Data

In [23]:
#Only work with the first case for now.
#It is used below to work out the fixed-width column positions of each field.
case = cases[0]
case

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   '

__Key__

In [27]:
#The key sits in the first 9 characters of the line
key = case[:9].strip()
key

'453'

__Cause Number__

In [32]:
#The cause number sits in characters 9 through 25
cause_number = case[9:26].strip()
cause_number

'19-01-00554-MJU'

__File Date__

In [36]:
#The file date sits in characters 26 through 37
file_date = case[26:38].strip()
file_date

'01/10/2019'

__Court__

In [44]:
#The court name sits in characters 38 through 69
court = case[38:70].strip()
court

'293RD DISTRICT COURT'

__Respondent__

In [46]:
#The respondent is everything from character 70 to the end of the line
respondent = case[70:].strip()
respondent

'GUERRA'

# Build a Dataframe

In [47]:
#Establish the containers that the parsing loop fills in
case_list = []      #one dictionary per case
offense_list = []   #offenses collected for the case currently being read
temp_dict = {}      #fields of the case currently being read

In [52]:
#Walk the report lines and build one dictionary per case.
#A line that starts in the first column begins a new case; an indented line is an offense
#that belongs to the case read most recently.

#Start from fresh containers so that re-running this cell never duplicates cases
case_list = []
offense_list = []
temp_dict = {}

for line in cases:
    #Skip lines that are empty or whitespace only
    if not line.strip():
        continue

    #Check if line is the start of a new case
    if not line[0].isspace():
        #Store the previous case, if there is one, before starting the next
        if temp_dict:
            temp_dict['Offense'] = offense_list
            case_list.append(temp_dict)

        #Reset the per-case containers
        temp_dict = {}
        offense_list = []

        #Gather the cause number
        temp_dict['Cause Number'] = line[9:26].strip()

        #Gather the file date
        temp_dict['File Date'] = line[26:38].strip()

        #Court (line[38:70]) and respondent (line[70:]) are not collected yet

    else:
        #Indented line: an offense for the current case
        offense = line.strip()
        offense_list.append(offense)

#Store the final case. Skip this when no case line was read, so that an empty report
#does not produce an entry holding nothing but an empty offense list.
if temp_dict:
    temp_dict['Offense'] = offense_list
    case_list.append(temp_dict)

In [54]:
#Number of cases collected. Compare with the total printed on the last line of the report.
len(case_list)

24

In [55]:
#Inspect the collected case dictionaries
case_list

[{'Cause Number': '19-01-00554-MJU', 'File Date': '01/10/2019', 'Offense': []},
 {'Cause Number': '19-02-00556-MJU', 'File Date': '02/07/2019', 'Offense': []},
 {'Cause Number': '19-03-00558-MJU', 'File Date': '03/20/2019', 'Offense': []},
 {'Cause Number': '19-04-00560-MJU', 'File Date': '04/08/2019', 'Offense': []},
 {'Cause Number': '19-07-00562-MJU', 'File Date': '07/09/2019', 'Offense': []},
 {'Cause Number': '19-08-00564-MJU', 'File Date': '08/29/2019', 'Offense': []},
 {'Cause Number': '20-01-00566-MJU', 'File Date': '01/09/2020', 'Offense': []},
 {'Cause Number': '20-02-00568-MJU', 'File Date': '02/26/2020', 'Offense': []},
 {'Cause Number': '20-05-00570-MJU', 'File Date': '05/14/2020', 'Offense': []},
 {'Cause Number': '21-05-00572-MJU', 'File Date': '05/12/2021', 'Offense': []},
 {'Cause Number': '21-09-00574-MJU', 'File Date': '09/24/2021', 'Offense': []},
 {'Cause Number': '21-12-00576-MJU', 'File Date': '12/07/2021', 'Offense': []},
 {'Cause Number': '22-03-00578-MJU',
  '

In [56]:
#One row per case; the dictionary keys become the columns
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Offense
0,19-01-00554-MJU,01/10/2019,[]
1,19-02-00556-MJU,02/07/2019,[]
2,19-03-00558-MJU,03/20/2019,[]
3,19-04-00560-MJU,04/08/2019,[]
4,19-07-00562-MJU,07/09/2019,[]
5,19-08-00564-MJU,08/29/2019,[]
6,20-01-00566-MJU,01/09/2020,[]
7,20-02-00568-MJU,02/26/2020,[]
8,20-05-00570-MJU,05/14/2020,[]
9,21-05-00572-MJU,05/12/2021,[]


# Build the Function

In [62]:
def extract_juvenile_cases(text, header_length=420):
    """
    Parse the text of a juvenile case filings report into a dataframe with one row per case.

    The first `header_length` characters are treated as the report header (title lines and
    column captions) and everything after them as the body. Within the body, a line that
    starts in the first column begins a new case, and an indented line is an offense that
    belongs to the case read most recently.

    Parameters:
        -text: A string consisting of the text of the entire juvenile cases PDF document.
        -header_length: Number of leading characters that make up the header of the first page.
                        Defaults to 420.

    Returns:
        -df: A dataframe with the columns County, Cause Number, File Date, Offense (a list of
             strings), Original As Of Date and Last As Of Date. A report that lists no cases
             gives an empty dataframe that still has these columns.

    Raises:
        -IndexError: If the header does not contain a date written as NN/NN/NNNN.
    """

    #Column names taken from the case lines, in output order
    case_columns = ['County', 'Cause Number', 'File Date', 'Offense']

    #Get the header and the body, and remove the surrounding whitespace from each
    header = text[:header_length].strip()
    body = text[header_length:].strip()

    #Get the 'AS OF' date: the first date that appears in the header
    report_as_of_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", header)[0]

    #Work out the county from the name that appears in the header
    if 'MAVERICK' in header:
        county = 'Maverick'
    elif 'DIMMIT' in header:
        county = 'Dimmit'
    elif 'ZAVALA' in header:
        county = 'Zavala'
    else:
        county = 'Something went wrong!'

    #Remove the header repeated after each page break (\x0c).
    #The pattern should identify the headers even if some of the info changes later on.
    #A newline is substituted rather than nothing, so the line before a page break can never
    #be joined onto the first line after the header. The blank line is filtered out below.
    body = re.sub(r"""\n\x0c\s*[A-Z0-9 \n/#\:-]*\(S\)\s*\n\n""", '\n', body)

    #Split the text on the \n to isolate each line
    cases = body.split('\n')

    #Drop the total case count that ends the report. Only drop it when it is really there,
    #so a report without that line does not lose its last case or offense.
    if cases and cases[-1].lstrip().startswith('TOTAL'):
        cases.pop()

    #Remove lines that are empty or consist of whitespace only
    cases = [line for line in cases if line.strip()]

    #Fixed-width layout of a case line:
    #   [0:9] key, [9:26] cause number, [26:38] file date, [38:70] court, [70:] respondent
    #Court and respondent are not collected because we don't need that info yet.
    case_list = []
    current_case = None
    for line in cases:
        if not line[0].isspace():
            #A new case starts here. Its offense list is filled in by the lines that follow.
            current_case = {
                'County': county,
                'Cause Number': line[9:26].strip(),
                'File Date': line[26:38].strip(),
                'Offense': [],
            }
            case_list.append(current_case)
        elif current_case is not None:
            #Indented line: an offense for the case read most recently
            current_case['Offense'].append(line.strip())

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Create dataframe. Naming the columns keeps them in place when there are no cases.
    df = pd.DataFrame(case_list, columns=case_columns)

    #Add report as of date
    df["Original As Of Date"] = report_as_of_date
    df["Last As Of Date"] = report_as_of_date

    return df

# Test Function

In [75]:
#Set up the pdf path
#NOTE(review): this is an absolute, machine-specific path, so the notebook only runs where
#this exact file exists.
pdf_path = '/Users/johnathonsmith/Downloads/Juvenile_Case_Reports/MJU as of 8 24 23.pdf'

In [76]:
#Read the text of every page of the PDF at pdf_path into the string `text`.

#Resource manager shared by the converter and the interpreter (handles pdf content: text, images, etc.)
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Converter that turns each laid-out page into plain text
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Page interpreter that feeds pages to the converter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #Every page is written to the same buffer, so the text accumulates across pages
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Text of the whole document. Must be read before the buffer is closed.
    text = fake_file_handle.getvalue()
finally:
    #Release the handles even if the PDF could not be opened or parsed
    converter.close()
    fake_file_handle.close()

In [77]:
#Run the function on the full report text. It prints how many cases were collected.
df = extract_juvenile_cases(text)

Collected Data From 24 Cases.


In [78]:
#Inspect the dataframe returned by the function
df

Unnamed: 0,County,Cause Number,File Date,Offense,Original As Of Date,Last As Of Date
0,Maverick,19-01-00554-MJU,01/10/2019,[],08/24/2023,08/24/2023
1,Maverick,19-02-00556-MJU,02/07/2019,[],08/24/2023,08/24/2023
2,Maverick,19-03-00558-MJU,03/20/2019,[],08/24/2023,08/24/2023
3,Maverick,19-04-00560-MJU,04/08/2019,[],08/24/2023,08/24/2023
4,Maverick,19-07-00562-MJU,07/09/2019,[],08/24/2023,08/24/2023
5,Maverick,19-08-00564-MJU,08/29/2019,[],08/24/2023,08/24/2023
6,Maverick,20-01-00566-MJU,01/09/2020,[],08/24/2023,08/24/2023
7,Maverick,20-02-00568-MJU,02/26/2020,[],08/24/2023,08/24/2023
8,Maverick,20-05-00570-MJU,05/14/2020,[],08/24/2023,08/24/2023
9,Maverick,21-05-00572-MJU,05/12/2021,[],08/24/2023,08/24/2023
