In [1]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

In [2]:
#Set up the pdf path
#NOTE(review): this is an absolute path on one user's machine; the notebook will not run elsewhere
# until it is pointed at a local copy of the report.
pdf_path = '/Users/johnathonsmith/Downloads/24_01_22_DJU MJU ZJU _ All Counties.pdf'

In [3]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory buffer the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

#try/finally guarantees the converter and buffer are closed even if the PDF
#cannot be opened or a page fails to parse
try:
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of every processed page (not just the last one) to a variable.
    #This must happen before the buffer is closed below.
    text = fake_file_handle.getvalue()
finally:
    # close open handles
    converter.close()
    fake_file_handle.close()

#print(text)

In [4]:
text

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 01/22/2024 - COURT: 293RD DISTRICT COURT\n            FILING COUNTY: ALL COUNTIES - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                        DOCKET DATE  DISPO. DATE\n\n487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               \n     AGG ASSAULT W/DEADLY WEAPON                                                                               \n512      08-10-00208-ZJU  10/21/2008  293RD DISTRICT COURT            MATA JR, MAURICIO JIMENEZ                \n     BURGLARY OF HABITATION                                                                                    \n513      09-03-00212-ZJU  03/09/2009  293RD DISTRICT COURT            BALBOA, III, PABLO

# Isolate Header

In [5]:
#The first 450 characters of this report hold the title lines and the column-header block,
#ending at 'DISPO. DATE'. The offset is specific to this report's layout.
header = text[:450]
header

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 01/22/2024 - COURT: 293RD DISTRICT COURT\n            FILING COUNTY: ALL COUNTIES - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                        DOCKET DATE  DISPO. DATE'

# Isolate Body

In [6]:
#Everything after the 450-character header is the case listing
body = text[450:]
body

'\n\n487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               \n     AGG ASSAULT W/DEADLY WEAPON                                                                               \n512      08-10-00208-ZJU  10/21/2008  293RD DISTRICT COURT            MATA JR, MAURICIO JIMENEZ                \n     BURGLARY OF HABITATION                                                                                    \n513      09-03-00212-ZJU  03/09/2009  293RD DISTRICT COURT            BALBOA, III, PABLO                       \n     CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE                                                 06/10/2009\n488      09-07-00146-DJU  07/30/2009  293RD DISTRICT COURT            RUBIO, FERMIN                            \n     BURGLARY OF BUILDING                                                                                      \n514      09-08-00214-ZJU  08/20/2009  293RD DISTRICT COURT            GONZALEZ, JUAN CARLOS

In [7]:
#Remove whitespace surrounding the body and header
header = header.strip()
body = body.strip()

In [8]:
header

'COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 01/22/2024 - COURT: 293RD DISTRICT COURT\n            FILING COUNTY: ALL COUNTIES - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                        DOCKET DATE  DISPO. DATE'

In [9]:
body

'487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               \n     AGG ASSAULT W/DEADLY WEAPON                                                                               \n512      08-10-00208-ZJU  10/21/2008  293RD DISTRICT COURT            MATA JR, MAURICIO JIMENEZ                \n     BURGLARY OF HABITATION                                                                                    \n513      09-03-00212-ZJU  03/09/2009  293RD DISTRICT COURT            BALBOA, III, PABLO                       \n     CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE                                                 06/10/2009\n488      09-07-00146-DJU  07/30/2009  293RD DISTRICT COURT            RUBIO, FERMIN                            \n     BURGLARY OF BUILDING                                                                                      \n514      09-08-00214-ZJU  08/20/2009  293RD DISTRICT COURT            GONZALEZ, JUAN CARLOS    

# Remove Subsequent Headers

In [10]:
#Set up regex to remove all subsequent headers (the header repeated at the top of later pages).
#The pattern starts at a newline followed by a form feed (\x0c, the page break), then consumes
#upper-case letters, digits, spaces, newlines and the punctuation ( ) / # : - up to the literal '. DATE'
#that ends the column-header block ('DISPO. DATE').
#Because it matches by character class rather than exact wording, it should still identify the
#headers even if some of the header text changes later on.
#Since all current juvenile case reports are only one page, nothing is removed here yet.
new_body = re.sub(r"""\n\x0c\s*[A-Z0-9 \(\)\n/#\:-]*\.\sDATE""", '', body)
new_body

'487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               \n     AGG ASSAULT W/DEADLY WEAPON                                                                               \n512      08-10-00208-ZJU  10/21/2008  293RD DISTRICT COURT            MATA JR, MAURICIO JIMENEZ                \n     BURGLARY OF HABITATION                                                                                    \n513      09-03-00212-ZJU  03/09/2009  293RD DISTRICT COURT            BALBOA, III, PABLO                       \n     CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE                                                 06/10/2009\n488      09-07-00146-DJU  07/30/2009  293RD DISTRICT COURT            RUBIO, FERMIN                            \n     BURGLARY OF BUILDING                                                                                      \n514      09-08-00214-ZJU  08/20/2009  293RD DISTRICT COURT            GONZALEZ, JUAN CARLOS    

# Split Into Individual Cases

In [11]:
cases = new_body.split('\n')
cases

['487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               ',
 '     AGG ASSAULT W/DEADLY WEAPON                                                                               ',
 '512      08-10-00208-ZJU  10/21/2008  293RD DISTRICT COURT            MATA JR, MAURICIO JIMENEZ                ',
 '     BURGLARY OF HABITATION                                                                                    ',
 '513      09-03-00212-ZJU  03/09/2009  293RD DISTRICT COURT            BALBOA, III, PABLO                       ',
 '     CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE                                                 06/10/2009',
 '488      09-07-00146-DJU  07/30/2009  293RD DISTRICT COURT            RUBIO, FERMIN                            ',
 '     BURGLARY OF BUILDING                                                                                      ',
 '514      09-08-00214-ZJU  08/20/2009  293RD DISTRICT COURT            

# Drop the Last Case

The last line is not a case; it is just the report's total case count.

In [12]:
cases[-1]

'TOTAL JUVENILE CASES LISTED: 117'

In [13]:
#Remove the trailing 'TOTAL JUVENILE CASES LISTED' line, then show the new last element.
#NOTE: pop() mutates the list, so re-running this cell removes one more line each time.
cases.pop()
cases[-1]

''

# Find the Data

In [14]:
#Only work with the first case for now
case = cases[0]
case

'487      08-03-00134-DJU  03/20/2008  293RD DISTRICT COURT            HERNANDEZ, GABRIEL LEDESMA               '

__Key__

In [15]:
#The report is fixed-width: the KEY column is the first 9 characters of a case line
key = case[:9].strip()
key

'487'

__Cause Number__

In [16]:
#The cause number is read from characters 9-25 of a case line
cause_number = case[9:26].strip()
cause_number

'08-03-00134-DJU'

__File Date__

In [17]:
#The file date is read from characters 26-37 of a case line
file_date = case[26:38].strip()
file_date

'03/20/2008'

__Court__

In [18]:
#The court name is read from characters 38-69 of a case line
court = case[38:70].strip()
court

'293RD DISTRICT COURT'

__Respondent__

In [19]:
#The respondent's name is everything from character 70 to the end of the case line
respondent = case[70:].strip()
respondent

'HERNANDEZ, GABRIEL LEDESMA'

In [20]:
#Find the data on the next line (Offense, Docket Date, Disposition Date)
#cases[5] is the indented detail line of the third case; it is used here because it
#has a disposition date to test the slicing against.
case = cases[5]
case

'     CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE                                                 06/10/2009'

__Offense__

In [21]:
#Detail lines are indented by 5 spaces; the offense text is read from characters 5-87
offense = case[5:88].strip()
offense

'CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE'

__Docket Date__

In [22]:
#The docket date is read from characters 88-97 of a detail line (empty when not set)
docket_date = case[88:98].strip()
docket_date

''

__Disposition Date__

In [23]:
#The disposition date is everything from character 98 to the end of a detail line
dispo_date = case[98:].strip()
dispo_date

'06/10/2009'

# Build a Dataframe

In [24]:
#Establish a container list for the dictionaries (one dictionary per case)
case_list = []
#Per-case accumulators: a case can span several detail lines, so each field is a list
offense_list = []
docket_date_list = []
dispo_date_list = []
#Holds the case currently being built; an empty dict means no case has been started yet
temp_dict = {}

In [25]:
#(Re)initialize the containers here so this cell gives the same result every time it is run
case_list = []
offense_list = []
docket_date_list = []
dispo_date_list = []
temp_dict = {}

for line in cases:
    #Skip empty and whitespace-only lines
    if not line.strip():
        continue

    #A line that starts in the first column (the KEY column) begins a new case
    if not line[0].isspace():
        #If a previous case is in progress, attach its collected lists and store it
        if temp_dict:
            temp_dict['Offense'] = offense_list
            temp_dict['Docket Date'] = docket_date_list
            temp_dict['Disposition Date'] = dispo_date_list
            case_list.append(temp_dict)

        #Start fresh lists for the new case
        offense_list = []
        docket_date_list = []
        dispo_date_list = []

        #Gather the cause number and the file date.
        #Court (line[38:70]) and respondent (line[70:]) are not needed yet.
        temp_dict = {
            'Cause Number': line[9:26].strip(),
            'File Date': line[26:38].strip(),
        }

    else:
        #Indented detail line: offense text plus optional docket / disposition dates.
        #Each value is already stripped, so an empty string means the field is blank.
        offense = line[5:88].strip()
        if offense:
            offense_list.append(offense)

        docket_date = line[88:98].strip()
        if docket_date:
            docket_date_list.append(docket_date)

        dispo_date = line[98:].strip()
        if dispo_date:
            dispo_date_list.append(dispo_date)

#The loop only stores a case when the NEXT case begins, so the last case is still pending.
#Only store it if a case was actually started (avoids adding an empty record).
if temp_dict:
    temp_dict['Offense'] = offense_list
    temp_dict['Docket Date'] = docket_date_list
    temp_dict['Disposition Date'] = dispo_date_list
    case_list.append(temp_dict)

In [26]:
len(case_list)

117

In [27]:
case_list

[{'Cause Number': '08-03-00134-DJU',
  'File Date': '03/20/2008',
  'Offense': ['AGG ASSAULT W/DEADLY WEAPON'],
  'Docket Date': [],
  'Disposition Date': []},
 {'Cause Number': '08-10-00208-ZJU',
  'File Date': '10/21/2008',
  'Offense': ['BURGLARY OF HABITATION'],
  'Docket Date': [],
  'Disposition Date': []},
 {'Cause Number': '09-03-00212-ZJU',
  'File Date': '03/09/2009',
  'Offense': ['CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE'],
  'Docket Date': [],
  'Disposition Date': ['06/10/2009']},
 {'Cause Number': '09-07-00146-DJU',
  'File Date': '07/30/2009',
  'Offense': ['BURGLARY OF BUILDING'],
  'Docket Date': [],
  'Disposition Date': []},
 {'Cause Number': '09-08-00214-ZJU',
  'File Date': '08/20/2009',
  'Offense': ['BURGLARY OF HABITATION'],
  'Docket Date': [],
  'Disposition Date': ['10/05/2009']},
 {'Cause Number': '09-08-00216-ZJU',
  'File Date': '08/20/2009',
  'Offense': ['BURGLARY OF HABITATION'],
  'Docket Date': [],
  'Disposition Date': ['10/05/2009']},
 {'Cau

In [28]:
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Offense,Docket Date,Disposition Date
0,08-03-00134-DJU,03/20/2008,[AGG ASSAULT W/DEADLY WEAPON],[],[]
1,08-10-00208-ZJU,10/21/2008,[BURGLARY OF HABITATION],[],[]
2,09-03-00212-ZJU,03/09/2009,"[CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE]",[],[06/10/2009]
3,09-07-00146-DJU,07/30/2009,[BURGLARY OF BUILDING],[],[]
4,09-08-00214-ZJU,08/20/2009,[BURGLARY OF HABITATION],[],[10/05/2009]
...,...,...,...,...,...
112,23-05-0222-DJU,05/04/2023,[UNAUTH USE OF VEHICLE],[],[]
113,23-05-00590-MJU,05/08/2023,"[ASSAULT PEACE OFFICER/JUDGE, EVADING ARREST D...",[06/03/2024],[]
114,23-07-0224-DJU,07/26/2023,[SMUGGLING OF PERSONS: FIREARM],[],[]
115,23-07-00592-MJU,07/29/2023,[EVADING ARREST DET W/VEH],[04/04/2024],[]


# Build the Function

In [37]:
def extract_juvenile_cases(text):
    """
    Parse the text of a juvenile case filings report into a dataframe with one row per case.

    The report is a fixed-width listing. A line starting in the first column begins a case
    (key, cause number, file date, court, respondent). The indented lines that follow hold
    that case's offense(s), docket date(s) and disposition date(s).

    Parameter:
        -text: A string consisting of the text of the entire juvenile cases PDF document.

    Returns:
        -df: A dataframe with the columns 'Cause Number', 'File Date', 'Offense',
             'Docket Date', 'Disposition Date' (the last three hold lists of strings),
             plus 'Original As Of Date', 'Last As Of Date' and 'Report Generated Date',
             which are all set to the first MM/DD/YYYY date found in the report header.
             A report without cases yields an empty dataframe with the same columns.

    Raises:
        -IndexError: If no MM/DD/YYYY date can be found in the header.
    """

    #Column order of the per-case part of the dataframe
    case_columns = ['Cause Number', 'File Date', 'Offense', 'Docket Date', 'Disposition Date']

    #The header ends with the 'DISPO. DATE' column title. Locate it instead of assuming a fixed
    #length, so a longer/shorter title or court name does not shift the split point.
    #Fall back to the historical 450-character header when the marker is missing.
    header_end_match = re.search(r"DISPO\.\s+DATE", text)
    header_end = header_end_match.end() if header_end_match else 450

    #Get the header and the body and remove surrounding whitespace
    header = text[:header_end].strip()
    body = text[header_end:].strip()

    #Get the 'AS OF' date: the first date that appears in the header
    report_as_of_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", header)[0]

    #Set up regex to remove all subsequent headers (repeated after each page break, \x0c)
    #This regex should identify the headers even if some of the info changes later on
    body = re.sub(r"""\n\x0c\s*[A-Z0-9 \(\)\n/#\:-]*\.\sDATE""", '', body)

    #Split the text on the \n to isolate each line of the listing
    lines = body.split('\n')

    #Drop the report's total case count line. Only drop it when it really is the total line,
    #so a report without one does not lose its last line of data.
    if lines and lines[-1].strip().startswith('TOTAL'):
        lines.pop()

    #Remove lines that happen to be empty or consist of whitespace only
    lines = [line for line in lines if line.strip()]

    #Loop through each line and build one dictionary per case.
    #Court (line[38:70]) and respondent (line[70:]) are skipped because we don't need them yet.
    case_list = []
    current_case = None
    for line in lines:
        #A line that starts in the first column begins a new case
        if not line[0].isspace():
            current_case = {
                'Cause Number': line[9:26].strip(),
                'File Date': line[26:38].strip(),
                'Offense': [],
                'Docket Date': [],
                'Disposition Date': [],
            }
            case_list.append(current_case)
            continue

        #Indented detail line that appears before any case has started: nothing to attach it to
        if current_case is None:
            continue

        #Fixed-width fields of a detail line; blank fields are skipped
        detail_fields = (
            ('Offense', line[5:88]),
            ('Docket Date', line[88:98]),
            ('Disposition Date', line[98:]),
        )
        for column, raw_value in detail_fields:
            value = raw_value.strip()
            if value:
                current_case[column].append(value)

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Create dataframe. Passing the columns keeps the schema stable even when there are no cases.
    df = pd.DataFrame(case_list, columns=case_columns)

    #Add report as of date
    df["Original As Of Date"] = report_as_of_date
    df["Last As Of Date"] = report_as_of_date
    df["Report Generated Date"] = report_as_of_date

    return df

# Test Function

In [38]:
#Set up the pdf path
pdf_path = '/Users/johnathonsmith/Downloads/24_01_22_DJU MJU ZJU _ All Counties.pdf'

In [39]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory buffer the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

#try/finally guarantees the converter and buffer are closed even if the PDF
#cannot be opened or a page fails to parse
try:
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of every processed page (not just the last one) to a variable.
    #This must happen before the buffer is closed below.
    text = fake_file_handle.getvalue()
finally:
    # close open handles
    converter.close()
    fake_file_handle.close()

#print(text)

In [40]:
df = extract_juvenile_cases(text)

Collected Data From 117 Cases.


In [41]:
df

Unnamed: 0,Cause Number,File Date,Offense,Docket Date,Disposition Date,Original As Of Date,Last As Of Date,Report Generated Date
0,08-03-00134-DJU,03/20/2008,[AGG ASSAULT W/DEADLY WEAPON],[],[],01/22/2024,01/22/2024,01/22/2024
1,08-10-00208-ZJU,10/21/2008,[BURGLARY OF HABITATION],[],[],01/22/2024,01/22/2024,01/22/2024
2,09-03-00212-ZJU,03/09/2009,"[CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE]",[],[06/10/2009],01/22/2024,01/22/2024,01/22/2024
3,09-07-00146-DJU,07/30/2009,[BURGLARY OF BUILDING],[],[],01/22/2024,01/22/2024,01/22/2024
4,09-08-00214-ZJU,08/20/2009,[BURGLARY OF HABITATION],[],[10/05/2009],01/22/2024,01/22/2024,01/22/2024
...,...,...,...,...,...,...,...,...
112,23-05-0222-DJU,05/04/2023,[UNAUTH USE OF VEHICLE],[],[],01/22/2024,01/22/2024,01/22/2024
113,23-05-00590-MJU,05/08/2023,"[ASSAULT PEACE OFFICER/JUDGE, EVADING ARREST D...",[06/03/2024],[],01/22/2024,01/22/2024,01/22/2024
114,23-07-0224-DJU,07/26/2023,[SMUGGLING OF PERSONS: FIREARM],[],[],01/22/2024,01/22/2024,01/22/2024
115,23-07-00592-MJU,07/29/2023,[EVADING ARREST DET W/VEH],[04/04/2024],[],01/22/2024,01/22/2024,01/22/2024


# Prepare Dataframe

__Get County__

The juvenile case report contains cases for all three counties, so I'll need to apply a function to determine the county based on the Cause Number.

In [42]:
def get_juvenile_case_county(cause_number):
    """
    Determine the county a juvenile case belongs to from its cause number.

    The juvenile case report contains cases for all three counties, and the county is
    encoded in the letters that end the cause number: 'MJU', 'DJU' or 'ZJU'
    (e.g. '08-03-00134-DJU').

    Parameter:
        - cause_number: A string representing the case cause number.

    Returns:
        - county: 'Maverick', 'Dimmit' or 'Zavala'. None when the cause number is not a
          string or does not end in one of the known codes.
    """

    #Missing values (None, NaN from an empty cell) cannot be matched to a county
    if not isinstance(cause_number, str):
        return None

    #County code -> county name, checked in this order
    county_by_code = (
        ('MJU', 'Maverick'),
        ('DJU', 'Dimmit'),
        ('ZJU', 'Zavala'),
    )

    #Look at the END of the cause number rather than a fixed character offset:
    #cause numbers are not all the same length (e.g. '23-05-0222-DJU' vs '23-05-00590-MJU').
    cleaned = cause_number.strip()
    for code, county in county_by_code:
        if cleaned.endswith(code):
            return county

    #Unknown code
    return None

In [43]:
#Test county function
df['County'] = df['Cause Number'].apply(get_juvenile_case_county)
df

Unnamed: 0,Cause Number,File Date,Offense,Docket Date,Disposition Date,Original As Of Date,Last As Of Date,Report Generated Date,County
0,08-03-00134-DJU,03/20/2008,[AGG ASSAULT W/DEADLY WEAPON],[],[],01/22/2024,01/22/2024,01/22/2024,Dimmit
1,08-10-00208-ZJU,10/21/2008,[BURGLARY OF HABITATION],[],[],01/22/2024,01/22/2024,01/22/2024,Zavala
2,09-03-00212-ZJU,03/09/2009,"[CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE]",[],[06/10/2009],01/22/2024,01/22/2024,01/22/2024,Zavala
3,09-07-00146-DJU,07/30/2009,[BURGLARY OF BUILDING],[],[],01/22/2024,01/22/2024,01/22/2024,Dimmit
4,09-08-00214-ZJU,08/20/2009,[BURGLARY OF HABITATION],[],[10/05/2009],01/22/2024,01/22/2024,01/22/2024,Zavala
...,...,...,...,...,...,...,...,...,...
112,23-05-0222-DJU,05/04/2023,[UNAUTH USE OF VEHICLE],[],[],01/22/2024,01/22/2024,01/22/2024,Dimmit
113,23-05-00590-MJU,05/08/2023,"[ASSAULT PEACE OFFICER/JUDGE, EVADING ARREST D...",[06/03/2024],[],01/22/2024,01/22/2024,01/22/2024,Maverick
114,23-07-0224-DJU,07/26/2023,[SMUGGLING OF PERSONS: FIREARM],[],[],01/22/2024,01/22/2024,01/22/2024,Dimmit
115,23-07-00592-MJU,07/29/2023,[EVADING ARREST DET W/VEH],[04/04/2024],[],01/22/2024,01/22/2024,01/22/2024,Maverick
