In [1]:
import io
import re
from pathlib import Path

import numpy as np
import pandas as pd
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfpage import PDFPage

# Load Disposed Cases File

In [5]:
#Set up the pdf path relative to the current user's home directory so the
#notebook does not depend on one machine's absolute path
pdf_path = Path.home() / 'Downloads' / 'Disposed_Case_Reports' / 'Disposed CR Dimmit 01012019 thru 08242023.pdf'

In [113]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory buffer the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #Every page is rendered into the same buffer, so the text accumulates
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Save the text of the whole document (all pages) to a variable
    text = fake_file_handle.getvalue()
finally:
    #Close open handles even if a page fails to parse
    converter.close()
    fake_file_handle.close()

#print(text)

In [114]:
#Inspect the raw text that was extracted from the PDF
text

"                                AUGUST 24, 2023 06:31pm  DIMMIT DISTRICT CLERK OFFICE 830-876-4243\n                                           CASES DISPOSED - COURT: 293RD DISTRICT COURT\n                                                01/01/2019 TO 08/24/2023 - PAGE 1\n\nCAUSE #          DEFENDANT NAME              DIS DT/S  DISPOSITION/S            COMPLAINANT                   BONDSMAN                                   \n\n09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|\n10-03-02566-DCR  NUNN, NATALIA RIVERA        02/18/22  ALL OTHER DISPOSITIONS                                 SAENZ BAIL BOND\n10-05-02568-DCR  PERALES, JEFFREY LEE        03/08/21  DISMISSALS                                             |----|\n11-06-02622-DCR  FLORES, JESSE               06/23/21  MOTIONS TO REVOKE/ADJUD                                SAENZ BAIL BOND\n                                             03/08/23  DISMISSALS             

# Isolate Header

In [115]:
#The first 420 characters of this report hold the page-1 banner and the column-title row.
#NOTE(review): 420 is a hard-coded offset for this report; confirm it before reusing it on another file
header = text[:420]
header

'                                AUGUST 24, 2023 06:31pm  DIMMIT DISTRICT CLERK OFFICE 830-876-4243\n                                           CASES DISPOSED - COURT: 293RD DISTRICT COURT\n                                                01/01/2019 TO 08/24/2023 - PAGE 1\n\nCAUSE #          DEFENDANT NAME              DIS DT/S  DISPOSITION/S            COMPLAINANT                   BONDSMAN                                '

# Isolate Body

In [116]:
#Everything after the first 420 characters (the offset used for the header) is the case listing
body = text[420:]
body

"   \n\n09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|\n10-03-02566-DCR  NUNN, NATALIA RIVERA        02/18/22  ALL OTHER DISPOSITIONS                                 SAENZ BAIL BOND\n10-05-02568-DCR  PERALES, JEFFREY LEE        03/08/21  DISMISSALS                                             |----|\n11-06-02622-DCR  FLORES, JESSE               06/23/21  MOTIONS TO REVOKE/ADJUD                                SAENZ BAIL BOND\n                                             03/08/23  DISMISSALS             \n11-10-02632-DCR  MARTINEZ, WILLIE JOE        04/22/19  MOTIONS TO REVOKE/ADJUD                                |----|\n12-10-02696-DCR  BALDERAS, EVELYN MICHELLE   09/01/21  ALL OTHER DISPOSITIONS                                 LIBERTAD BAIL B\n12-12-02724-DCR  BREWER, RICHARD LEE         04/22/20                                                         |----|\n13-04-02756-DCR  DIAZ, BRIAN                 07/13/22  MOTI

In [117]:
#Trim the leading/trailing whitespace from both pieces in a single step
header, body = header.strip(), body.strip()

In [118]:
#Check the header after stripping the surrounding whitespace
header

'AUGUST 24, 2023 06:31pm  DIMMIT DISTRICT CLERK OFFICE 830-876-4243\n                                           CASES DISPOSED - COURT: 293RD DISTRICT COURT\n                                                01/01/2019 TO 08/24/2023 - PAGE 1\n\nCAUSE #          DEFENDANT NAME              DIS DT/S  DISPOSITION/S            COMPLAINANT                   BONDSMAN'

In [119]:
#Check the body after stripping the surrounding whitespace
body

"09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|\n10-03-02566-DCR  NUNN, NATALIA RIVERA        02/18/22  ALL OTHER DISPOSITIONS                                 SAENZ BAIL BOND\n10-05-02568-DCR  PERALES, JEFFREY LEE        03/08/21  DISMISSALS                                             |----|\n11-06-02622-DCR  FLORES, JESSE               06/23/21  MOTIONS TO REVOKE/ADJUD                                SAENZ BAIL BOND\n                                             03/08/23  DISMISSALS             \n11-10-02632-DCR  MARTINEZ, WILLIE JOE        04/22/19  MOTIONS TO REVOKE/ADJUD                                |----|\n12-10-02696-DCR  BALDERAS, EVELYN MICHELLE   09/01/21  ALL OTHER DISPOSITIONS                                 LIBERTAD BAIL B\n12-12-02724-DCR  BREWER, RICHARD LEE         04/22/20                                                         |----|\n13-04-02756-DCR  DIAZ, BRIAN                 07/13/22  MOTIONS TO 

# Remove Subsequent Headers

In [120]:
#Pattern for the banner repeated at the top of every page after the first.
#It keys on the form feed, the report date and the column-title row, so it should
#keep matching even if the name of the district clerk changes later on
page_header_pattern = re.compile(r"""\n\x0c\s*[A-Z]{3,9}\s[0-9]{1,2},\s[0-9]{4}\s[a-zA-Z0-9 \n:/-]*#\s*[A-Z /]*\n\n""")
new_body = page_header_pattern.sub('', body)
new_body

"09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|\n10-03-02566-DCR  NUNN, NATALIA RIVERA        02/18/22  ALL OTHER DISPOSITIONS                                 SAENZ BAIL BOND\n10-05-02568-DCR  PERALES, JEFFREY LEE        03/08/21  DISMISSALS                                             |----|\n11-06-02622-DCR  FLORES, JESSE               06/23/21  MOTIONS TO REVOKE/ADJUD                                SAENZ BAIL BOND\n                                             03/08/23  DISMISSALS             \n11-10-02632-DCR  MARTINEZ, WILLIE JOE        04/22/19  MOTIONS TO REVOKE/ADJUD                                |----|\n12-10-02696-DCR  BALDERAS, EVELYN MICHELLE   09/01/21  ALL OTHER DISPOSITIONS                                 LIBERTAD BAIL B\n12-12-02724-DCR  BREWER, RICHARD LEE         04/22/20                                                         |----|\n13-04-02756-DCR  DIAZ, BRIAN                 07/13/22  MOTIONS TO 

# Split Into Individual Cases

In [121]:
#Break the listing into one string per printed line of the report
cases = new_body.split('\n')
cases

['09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|',
 '10-03-02566-DCR  NUNN, NATALIA RIVERA        02/18/22  ALL OTHER DISPOSITIONS                                 SAENZ BAIL BOND',
 '10-05-02568-DCR  PERALES, JEFFREY LEE        03/08/21  DISMISSALS                                             |----|',
 '11-06-02622-DCR  FLORES, JESSE               06/23/21  MOTIONS TO REVOKE/ADJUD                                SAENZ BAIL BOND',
 '                                             03/08/23  DISMISSALS             ',
 '11-10-02632-DCR  MARTINEZ, WILLIE JOE        04/22/19  MOTIONS TO REVOKE/ADJUD                                |----|',
 '12-10-02696-DCR  BALDERAS, EVELYN MICHELLE   09/01/21  ALL OTHER DISPOSITIONS                                 LIBERTAD BAIL B',
 '12-12-02724-DCR  BREWER, RICHARD LEE         04/22/20                                                         |----|',
 '13-04-02756-DCR  DIAZ, BRIAN             

# Drop the Last Case

The last line is not a case: it is only the report's total case count, so it is dropped.

In [122]:
#Look at the final line of the listing
cases[-1]

'134 CASES LISTED'

In [123]:
#Drop the trailing "<n> CASES LISTED" summary line.
#Guarded so that re-running this cell cannot remove real case lines.
if cases and cases[-1].strip().endswith('CASES LISTED'):
    cases.pop()
cases[-1]

''

# Find the Data

In [124]:
#Only work with the first case for now.
#It is used below to work out the column positions of each field.
case = cases[0]
case

'09-05-02506-DCR  ALEXANDRAVICE, KATHERINE L  10/10/19  MOTIONS TO REVOKE/ADJUD                                |----|'

__Cause Number__

In [125]:
#Characters 0-16 of a case line hold the cause number
cause_number = case[:17].strip()
cause_number

'09-05-02506-DCR'

__Defendant Name__

In [126]:
#Characters 17-44 hold the defendant name
defendant = case[17:45].strip()
defendant

'ALEXANDRAVICE, KATHERINE L'

__Disposed Date__

In [127]:
#Characters 45-54 hold the disposed date
disposed_date = case[45:55].strip()
disposed_date

'10/10/19'

__Disposition__

In [128]:
#Characters 55-77 hold the disposition
disposition = case[55:78].strip()
disposition

'MOTIONS TO REVOKE/ADJUD'

__Complainant__

In [129]:
#Characters 78-109 hold the complainant (blank for this case)
complainant = case[78:110].strip()
complainant

''

__Bondsman__

In [130]:
#Everything from character 110 onward is the bondsman
bondsman = case[110:].strip()
bondsman

'|----|'

# Build a Dataframe

In [138]:
#Containers used while walking the report lines:
#  case_list        - one dictionary per case
#  dispo_dates_list - disposed dates collected for the case currently being read
#  temp_dict        - the case currently being read
case_list, dispo_dates_list, temp_dict = [], [], {}

In [139]:
#Rebuild the results from scratch so that re-running this cell never duplicates cases
case_list = []

#The case whose lines are currently being read (None until the first case line is seen)
current_case = None

for line in cases:
    #Skip lines that are empty or whitespace only
    if not line.strip():
        continue

    if not line[0].isspace():
        #A line that starts in the first column begins a new case.
        #Only the cause number and the disposed dates are collected for now.
        current_case = {
            'Cause Number': line[:17].strip(),
            'Disposed Dates': [line[45:55].strip()],
        }
        case_list.append(current_case)

    elif current_case is not None:
        #An indented line carries an additional disposed date for the current case
        dispo_date = line[45:55].strip()
        if dispo_date:
            current_case['Disposed Dates'].append(dispo_date)

In [140]:
#Sanity check: compare with the case count printed on the report's last line
len(case_list)

134

In [141]:
#Inspect the collected case dictionaries
case_list

[{'Cause Number': '09-05-02506-DCR', 'Disposed Dates': ['10/10/19']},
 {'Cause Number': '10-03-02566-DCR', 'Disposed Dates': ['02/18/22']},
 {'Cause Number': '10-05-02568-DCR', 'Disposed Dates': ['03/08/21']},
 {'Cause Number': '11-06-02622-DCR',
  'Disposed Dates': ['06/23/21', '03/08/23']},
 {'Cause Number': '11-10-02632-DCR', 'Disposed Dates': ['04/22/19']},
 {'Cause Number': '12-10-02696-DCR', 'Disposed Dates': ['09/01/21']},
 {'Cause Number': '12-12-02724-DCR', 'Disposed Dates': ['04/22/20']},
 {'Cause Number': '13-04-02756-DCR', 'Disposed Dates': ['07/13/22']},
 {'Cause Number': '13-10-02816-DCR', 'Disposed Dates': ['06/09/23']},
 {'Cause Number': '13-10-02817-DCR', 'Disposed Dates': ['06/09/23']},
 {'Cause Number': '13-10-02818-DCR', 'Disposed Dates': ['05/12/23']},
 {'Cause Number': '13-12-02834-DCR', 'Disposed Dates': ['06/02/23']},
 {'Cause Number': '14-05-02918-DCR',
  'Disposed Dates': ['03/06/19', '03/06/19']},
 {'Cause Number': '14-05-02920-DCR', 'Disposed Dates': ['09/10

In [142]:
#One row per case; 'Disposed Dates' holds a list because a case can have more than one date
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,Disposed Dates
0,09-05-02506-DCR,[10/10/19]
1,10-03-02566-DCR,[02/18/22]
2,10-05-02568-DCR,[03/08/21]
3,11-06-02622-DCR,"[06/23/21, 03/08/23]"
4,11-10-02632-DCR,[04/22/19]
...,...,...
129,21-11-03422-DCR,[04/14/23]
130,22-03-03432-DCR,[02/02/23]
131,22-11-03472-DCR,[07/17/23]
132,23-02-03490-DCR,[05/18/23]


# Build the Function

In [3]:
def extract_criminal_disposed_cases(text):
    """
    Parse the text of a criminal disposed-cases report into a dataframe.

    The report is a fixed-width listing. A line that starts in the first column opens a new case;
    an indented line carries an additional disposed date and/or disposition for the case above it.

    Parameter:
        -text: A string consisting of the text of the entire disposed cases PDF document.

    Returns:
        -df: A dataframe with one row per case and the columns 'County', 'Cause Number',
             'Disposed Dates' (list), 'Dispositions' (list) and 'Disposed As Of Date'.
             A report that lists no cases gives an empty dataframe with the same columns.
    """

    #Offset that is used only when the column-title row cannot be located
    fallback_header_length = 420

    #The first page header ends with the column-title row, which is the first line containing '#'
    #('CAUSE # ...'). Locating that row instead of assuming a fixed length keeps the split correct
    #when the banner of another report is longer or shorter.
    title_row = re.search(r"#[^\n]*", text)
    header_end = title_row.end() if title_row else fallback_header_length

    #Get the header and the body, and remove surrounding whitespace
    header = text[:header_end].strip()
    body = text[header_end:].strip()

    #Get the 'AS OF' date: the header holds the report period as two dates and the second one is used.
    #None is stored when the header does not contain two dates, rather than failing with an IndexError.
    header_dates = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", header)
    report_as_of_date = header_dates[1] if len(header_dates) > 1 else None

    #Check for county names inside the header info
    if 'MAVERICK' in header:
        county = 'Maverick'
    elif 'DIMMIT' in header:
        county = 'Dimmit'
    elif 'ZAVALA' in header:
        county = 'Zavala'
    else:
        county = 'Something went wrong!'

    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if some of the info changes later on
    body = re.sub(r"""\n\x0c\s*[A-Z]{3,9}\s[0-9]{1,2},\s[0-9]{4}\s[a-zA-Z0-9 \n:/-]*#\s*[A-Z /]*\n\n""", '', body)

    #Split the text on the \n to isolate each line of the listing
    cases = body.split('\n')

    #Drop the last line. It's just the total case count from the report
    cases.pop()

    #Remove lines that happen to be empty or consist of whitespace only
    cases = [line for line in cases if line.strip()]

    #One dictionary per case, in report order
    case_list = []

    #The case whose lines are currently being read (None until the first case line is seen)
    current_case = None

    #Fixed-width layout of a line: cause number [0:17], defendant [17:45], disposed date [45:55],
    #disposition [55:78], complainant [78:110], bondsman [110:].
    #Defendant, complainant and bondsman are not collected because that info is not needed yet.
    for line in cases:
        if not line[0].isspace():
            #A line that starts in the first column begins a new case.
            #Its date and disposition are always stored (even when blank) as the first list entries.
            current_case = {
                'County': county,
                'Cause Number': line[:17].strip(),
                'Disposed Dates': [line[45:55].strip()],
                'Dispositions': [line[55:78].strip()],
            }
            case_list.append(current_case)

        elif current_case is not None:
            #An indented line adds to the current case; blank fields are skipped
            dispo_date = line[45:55].strip()
            if dispo_date:
                current_case['Disposed Dates'].append(dispo_date)

            disposition = line[55:78].strip()
            if disposition:
                current_case['Dispositions'].append(disposition)

    #Count every disposed date that was collected across all of the cases
    dispo_count = sum(len(case['Disposed Dates']) for case in case_list)

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    print(f'There were {dispo_count} disposed cases.')

    #Create dataframe. The columns are named explicitly so an empty report keeps the same layout.
    df = pd.DataFrame(case_list, columns=['County', 'Cause Number', 'Disposed Dates', 'Dispositions'])

    #Add report as of date
    df['Disposed As Of Date'] = report_as_of_date

    return df

# Test Function

In [14]:
#Set up the pdf path relative to the current user's home directory so the
#notebook does not depend on one machine's absolute path
pdf_path = Path.home() / 'Downloads' / 'Disposed_Case_Reports' / 'Disposed CR Zavala 01012019 thru 08242023.pdf'

In [15]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory buffer the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #Every page is rendered into the same buffer, so the text accumulates
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Save the text of the whole document (all pages) to a variable
    text = fake_file_handle.getvalue()
finally:
    #Close open handles even if a page fails to parse
    converter.close()
    fake_file_handle.close()

#print(text)

In [16]:
#Run the function on the text of the report loaded above
df = extract_criminal_disposed_cases(text)

Collected Data From 113 Cases.
There were 113 disposed cases.


In [17]:
#Inspect the dataframe returned by the function
df

Unnamed: 0,County,Cause Number,Disposed Dates,Dispositions,Disposed As Of Date
0,Zavala,00-00-00000-ZCR,[09/16/22],[ALL OTHER DISPOSITIONS],08/24/2023
1,Zavala,05-11-02996-ZCR,[07/08/19],[DISMISSALS],08/24/2023
2,Zavala,09-02-03168-ZCR,[02/17/22],[DISMISSALS],08/24/2023
3,Zavala,12-01-03315-ZCR,[06/06/19],[ALL OTHER DISPOSITIONS],08/24/2023
4,Zavala,12-03-03336-ZCR,[02/01/19],[CONVICTIONS BY THE JURY],08/24/2023
...,...,...,...,...,...
108,Zavala,22-03-03902-ZCR,[04/06/23],[CONVICTIONS BY THE COUR],08/24/2023
109,Zavala,22-03-03906-ZCR,[04/13/23],[CONVICTIONS BY THE COUR],08/24/2023
110,Zavala,22-05-03908-ZCR,[03/09/23],[CONVICTIONS BY THE COUR],08/24/2023
111,Zavala,22-11-03972-ZCR,[05/08/23],[CONVICTIONS BY THE COUR],08/24/2023
