In [151]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

In [152]:
#Set up the pdf path
pdf_path = '/Users/johnathonsmith/Downloads/Disposed_Case_Reports/Disposed CV Maverick 01012019 thru 08242023.pdf'

In [153]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#Used to display text
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open(pdf_path, 'rb') as fh:

    for page_num, page in enumerate(PDFPage.get_pages(fh, caching=True, check_extractable=True)):
        #Process the current page
        page_interpreter.process_page(page)

    #Save the current page's text to a variable
    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

#print(text)

In [154]:
text

'            AUGUST 24, 2023 06:24pm - MAVERICK DISTRICT CLERK OFFICE - 830-773-2629\n            CIVIL DISPOSED CASES - 01/01/19 TO 08/24/23 - INCLUDE CONFIDENTIAL CASES\n                                       COURT: 293 - PAGE 1\n\nCAUSE NO.        DISPO DT/S  DISPOSITION/S                      CAUSE OF ACTION/REASON REOPENED \n                 FILE DATE   PLAINTIFF                          DEFENDANT                       \n\n18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            \n                                                                REACTIVATE                      \n                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  \n                             ATTORNEY GENERAL                                                   \n23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        \n                                                                ORIGINAL PETITION       

# Isolate Header

In [155]:
header = text[:420]
header

'            AUGUST 24, 2023 06:24pm - MAVERICK DISTRICT CLERK OFFICE - 830-773-2629\n            CIVIL DISPOSED CASES - 01/01/19 TO 08/24/23 - INCLUDE CONFIDENTIAL CASES\n                                       COURT: 293 - PAGE 1\n\nCAUSE NO.        DISPO DT/S  DISPOSITION/S                      CAUSE OF ACTION/REASON REOPENED \n                 FILE DATE   PLAINTIFF                          DEFENDANT                     '

# Isolate Body

In [156]:
body = text[420:]
body

'  \n\n18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            \n                                                                REACTIVATE                      \n                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  \n                             ATTORNEY GENERAL                                                   \n23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        \n                                                                ORIGINAL PETITION               \n                 02/14/2023  MARTINEZ JAIME NICANOR             ALVARADO CRYSTAL LEE            \n23-02-41912-MCV  06/07/2023  ALL OTHER DISPOSITIONS             23-DIVORCE - CHILDREN           \n                                                                ORIGINAL PETITION               \n                 02/23/2023  LINDA RAMOS MEDRANO                EDUARDO SILVA MEDRANO           \n                       

In [157]:
#Remove whitespace surrounding the body and header
header = header.strip()
body = body.strip()

In [158]:
header

'AUGUST 24, 2023 06:24pm - MAVERICK DISTRICT CLERK OFFICE - 830-773-2629\n            CIVIL DISPOSED CASES - 01/01/19 TO 08/24/23 - INCLUDE CONFIDENTIAL CASES\n                                       COURT: 293 - PAGE 1\n\nCAUSE NO.        DISPO DT/S  DISPOSITION/S                      CAUSE OF ACTION/REASON REOPENED \n                 FILE DATE   PLAINTIFF                          DEFENDANT'

In [159]:
body

'18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            \n                                                                REACTIVATE                      \n                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  \n                             ATTORNEY GENERAL                                                   \n23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        \n                                                                ORIGINAL PETITION               \n                 02/14/2023  MARTINEZ JAIME NICANOR             ALVARADO CRYSTAL LEE            \n23-02-41912-MCV  06/07/2023  ALL OTHER DISPOSITIONS             23-DIVORCE - CHILDREN           \n                                                                ORIGINAL PETITION               \n                 02/23/2023  LINDA RAMOS MEDRANO                EDUARDO SILVA MEDRANO           \n                             

# Remove Subsequent Headers

In [160]:
#Set up regex to remove all subsequent headers
#This regex should identify the headers even if the name of the district clerk changes later on
#Can't include the page break in this regex because the formatting is awful
new_body = re.sub(r"""\s*[A-Z]{3,9}\s[0-9]{1,2},\s[0-9]{4}\s[a-zA-Z0-9 \n/-]*:[a-zA-Z0-9 \n/-]*:[A-Za-z0-9 \n\./-]*DEFENDANT\s{1,23}\n""", '', body)
new_body

'18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            \n                                                                REACTIVATE                      \n                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  \n                             ATTORNEY GENERAL                                                   \n23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        \n                                                                ORIGINAL PETITION               \n                 02/14/2023  MARTINEZ JAIME NICANOR             ALVARADO CRYSTAL LEE            \n23-02-41912-MCV  06/07/2023  ALL OTHER DISPOSITIONS             23-DIVORCE - CHILDREN           \n                                                                ORIGINAL PETITION               \n                 02/23/2023  LINDA RAMOS MEDRANO                EDUARDO SILVA MEDRANO           \n                             

In [161]:
#Since the formatting is awful, manually remove the page break symbol '\n\x0c'
new_body = new_body.replace('\n\x0c','')
new_body

'18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            \n                                                                REACTIVATE                      \n                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  \n                             ATTORNEY GENERAL                                                   \n23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        \n                                                                ORIGINAL PETITION               \n                 02/14/2023  MARTINEZ JAIME NICANOR             ALVARADO CRYSTAL LEE            \n23-02-41912-MCV  06/07/2023  ALL OTHER DISPOSITIONS             23-DIVORCE - CHILDREN           \n                                                                ORIGINAL PETITION               \n                 02/23/2023  LINDA RAMOS MEDRANO                EDUARDO SILVA MEDRANO           \n                             

# Split Into Individual Cases

In [162]:
cases = new_body.split('\n')
cases

['18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            ',
 '                                                                REACTIVATE                      ',
 '                   AGUILERA, ALEXIS                   MORENO, ISRAEL                  ',
 '                             ATTORNEY GENERAL                                                   ',
 '23-02-41868-MCV  06/02/2023  ALL OTHER DISPOSITIONS             22-DIVORCE - NO CHILDREN        ',
 '                                                                ORIGINAL PETITION               ',
 '                 02/14/2023  MARTINEZ JAIME NICANOR             ALVARADO CRYSTAL LEE            ',
 '23-02-41912-MCV  06/07/2023  ALL OTHER DISPOSITIONS             23-DIVORCE - CHILDREN           ',
 '                                                                ORIGINAL PETITION               ',
 '                 02/23/2023  LINDA RAMOS MEDRANO                EDUARDO SILVA MEDRANO           ',


# Drop the Last Two Cases

They're just total case counts.

In [163]:
cases[-1]

'NUMBER OF DISPOSITIONS:  4376'

In [164]:
cases[-2]

'NUMBER OF CASES:  3799'

In [165]:
cases.pop()
cases.pop()

'NUMBER OF CASES:  3799'

In [166]:
cases[-1]

''

# Find the Data

In [167]:
#Only work with the first case for now
case = cases[0]
case

'18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE - CHILDREN            '

__Cause Number__

In [168]:
cause_number = case[:17].strip()
cause_number

'18-04-35458-MCV'

__Disposition Date__

In [169]:
defendant = case[17:29].strip()
defendant

'07/23/2019'

__Disposition__

In [170]:
disposition = case[29:64].strip()
disposition

'P'

__Cause of Action__

In [171]:
coa = case[64:].strip()
coa

'1-DIVORCE - CHILDREN'

# Build a Dataframe

In [173]:
#Establish a container list for the dictionaries
case_list = []
dispo_dates_list = []
disposition_list = []
coa_list = []
temp_dict = {}
#Create a var for counting the number of disposed dates
dispo_count = 0

In [174]:
for line in cases:
    if line.isspace() or len(line) == 0:
        continue
    
    #Check if line is the start of a new case
    if not line[0].isspace():
        #Check if the temp_dict is empty.
        #If not, add temp_dict data to case_list
        if bool(temp_dict) == True:
            #Save the last date as the case file date
            temp_dict['File Date'] = dispo_dates_list[-1]
            
            #Now get the length of the list and remove the last date from the dispo dates list
            file_date_starting_line = len(dispo_dates_list)
            dispo_dates_list.pop()
            dispo_count += len(dispo_dates_list)
            #Add disposed dates to temp_dict
            temp_dict['Disposed Dates'] = dispo_dates_list
            
            temp_dict['Dispositions'] = disposition_list[:file_date_starting_line - 1]
            
            #temp_dict['Causes of Action'] = coa_list[:file_date_starting_line - 1]
            
            #temp_dict['Plaintiffs'] = disposition_list[file_date_starting_line - 1:]
            
            #temp_dict['Defendants'] = coa_list[file_date_starting_line - 1:]
            
            #Add temp dict data to case_list
            case_list.append(temp_dict)
        
        #Reset temp_dict
        temp_dict = {}
        
        #Reset lists
        dispo_dates_list = []
        disposition_list = []
        #coa_list = []
        
        #Gather the cause number
        temp_dict['Cause Number'] = line[:17].strip()
        
        #Get the first dispo date
        dispo_dates_list.append(line[17:29].strip())
        
        #Get first disposition
        disposition_list.append(line[29:64].strip())
        
        #Get first coa
        #coa_list.append(line[64:].strip())
        
        #End of line, so move to next one
    
    else:
        #Get additional dispo date
        dispo_date = line[17:29].strip()
        
        #Check if dispo_date is all whitesapace. If not, strip it and add to list
        #Also check that the string is not empty
        if dispo_date.isspace() == False and len(dispo_date) > 0:
            dispo_dates_list.append(dispo_date.strip())
        
        #Get additional disposition
        disposition = line[29:64].strip()
        
        #Check if disposition is all whitesapace. If not, strip it and add to list
        #Also check that the string is not empty
        if disposition.isspace() == False and len(disposition) > 0:
            disposition_list.append(disposition.strip())
            
        #Get additional coa
        #coa = line[64:].strip()
        
        #Check if coa is all whitesapace. If not, strip it and add to list
        #Also check that the string is not empty
        #if coa.isspace() == False and len(coa) > 0:
            #coa_list.append(coa.strip())
        
        #End of line
        
#Check that the last case was added to the list
#If not, add it
#Save the last date as the case file date
temp_dict['File Date'] = dispo_dates_list[-1]

#Now get the length of the list and remove the last date from the dispo dates list
file_date_starting_line = len(dispo_dates_list)
dispo_dates_list.pop()
dispo_count += len(dispo_dates_list)
#Add disposed dates to temp_dict
temp_dict['Disposed Dates'] = dispo_dates_list

temp_dict['Dispositions'] = disposition_list[:file_date_starting_line - 1]

#temp_dict['Causes of Action'] = coa_list[:file_date_starting_line - 1]

#temp_dict['Plaintiffs'] = disposition_list[file_date_starting_line - 1:]

#temp_dict['Defendants'] = coa_list[file_date_starting_line - 1:]

#Add temp dict data to case_list
case_list.append(temp_dict)

In [175]:
len(case_list)

3799

In [176]:
dispo_count

4376

In [177]:
case_list

[{'Cause Number': '18-04-35458-MCV',
  'File Date': 'AGUILERA,',
  'Disposed Dates': ['07/23/2019'],
  'Dispositions': ['P']},
 {'Cause Number': '23-02-41868-MCV',
  'File Date': '02/14/2023',
  'Disposed Dates': ['06/02/2023'],
  'Dispositions': ['ALL OTHER DISPOSITIONS']},
 {'Cause Number': '23-02-41912-MCV',
  'File Date': '02/23/2023',
  'Disposed Dates': ['06/07/2023'],
  'Dispositions': ['ALL OTHER DISPOSITIONS']},
 {'Cause Number': '01-08-17530-CV',
  'File Date': '08/09/2001',
  'Disposed Dates': ['02/18/2020', '05/08/2023'],
  'Dispositions': ['ALL OTHER DISPOSITIONS', 'ALL OTHER DISPOSITIONS']},
 {'Cause Number': '01-07-17452-CV',
  'File Date': '07/03/2001',
  'Disposed Dates': ['03/10/2020'],
  'Dispositions': ['P']},
 {'Cause Number': '03-02-19006-MCV',
  'File Date': '02/25/2003',
  'Disposed Dates': ['05/17/2019'],
  'Dispositions': ['P']},
 {'Cause Number': '02-10-18688-MCV',
  'File Date': '10/11/2002',
  'Disposed Dates': ['08/20/2019'],
  'Dispositions': ['P']},
 {'C

In [178]:
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Disposed Dates,Dispositions
0,18-04-35458-MCV,"AGUILERA,",[07/23/2019],[P]
1,23-02-41868-MCV,02/14/2023,[06/02/2023],[ALL OTHER DISPOSITIONS]
2,23-02-41912-MCV,02/23/2023,[06/07/2023],[ALL OTHER DISPOSITIONS]
3,01-08-17530-CV,08/09/2001,"[02/18/2020, 05/08/2023]","[ALL OTHER DISPOSITIONS, ALL OTHER DISPOSITIONS]"
4,01-07-17452-CV,07/03/2001,[03/10/2020],[P]
...,...,...,...,...
3794,20-07-39038-MCV,07/21/2020,[08/15/2023],[CASES NON-SUITED OR DISMISSED BY]
3795,22-07-41232-MCV,07/25/2022,[08/15/2023],[CASES NON-SUITED OR DISMISSED BY]
3796,23-03-42040-MCV,03/31/2023,"[08/15/2023, 08/15/2023]","[CASES NON-SUITED OR DISMISSED BY, CASES NON-S..."
3797,23-04-42060-MCV,04/04/2023,"[08/18/2023, 08/18/2023]","[ALL OTHER DISPOSITIONS, ALL OTHER DISPOSITIONS]"


# Build the Function

In [197]:
def extract_civil_disposed_cases(text):
    """
    This function takes in the entire PDF document as a string of text. It will gather the info for each case
    and add the info to a dictionary. The dictionary for each case will be added to a list which will be turned into
    a dataframe.
    
    Parameter:
        -text: A string consisting of the text of the entire disposed cases PDF document.
        
    Returns:
        -df: A dataframe of the newly gathered disposed case info
    """
    
    #Initialize containers
    #Establish a container list for the dictionaries
    case_list = []
    dispo_dates_list = []
    disposition_list = []
    #coa_list = []
    temp_dict = {}
    
    #Create a var to count the number of disposed dates
    dispo_count = 0
    
    #Get the header and remove surrounding whitespace
    header = text[:420].strip()

    #Get the body and remove surrounding whitespace
    body = text[420:].strip()
    
    #Get the report 'AS OF' date:
    report_as_of_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{2}", header)[1]
    
    #Use if statement to check for county names inside the header info
    if header.count('MAVERICK') >= 1:
        county = 'Maverick'
    elif header.count('DIMMIT') >= 1:
        county = 'Dimmit'
    elif header.count('ZAVALA') >= 1:
        county = 'Zavala'
    else:
        county = 'Something went wrong!'
        
    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if some of the info changes later on
    #Can't include the page break in this regex because the formatting is awful
    body = re.sub(r"""\s*[A-Z]{3,9}\s[0-9]{1,2},\s[0-9]{4}\s[a-zA-Z0-9 \n/-]*:[a-zA-Z0-9 \n/-]*:[A-Za-z0-9 \n\./-]*DEFENDANT\s{1,23}\n""", '', body)
    
    #Since the formatting is awful, manually remove the page break symbol '\n\x0c'
    body = body.replace('\n\x0c','')
    
    #Split the text on the \n to isolate each case
    cases = body.split('\n')
    
    #Drop the last two case. They're just the total case counts from the report
    cases.pop()
    cases.pop()
    
    #Remove cases that happen to be empty or consist of whitespace only
    cases = [case for case in cases if case.isspace() == False and len(case) > 0]
    
    for line in cases:
        #Check if line is the start of a new case
        if not line[0].isspace():
            #Check if the temp_dict is empty.
            #If not, add temp_dict data to case_list
            if bool(temp_dict) == True:
                #Save the last date as the case file date
                temp_dict['File Date'] = dispo_dates_list[-1]

                #Now get the length of the list and remove the last date from the dispo dates list
                file_date_starting_line = len(dispo_dates_list)
                dispo_dates_list.pop()
                dispo_count += len(dispo_dates_list)
                
                #Add disposed dates to temp_dict
                temp_dict['Disposed Dates'] = dispo_dates_list

                temp_dict['Dispositions'] = disposition_list[:file_date_starting_line - 1]

                #temp_dict['Causes of Action'] = coa_list[:file_date_starting_line - 1]

                #temp_dict['Plaintiffs'] = disposition_list[file_date_starting_line - 1:]

                #temp_dict['Defendants'] = coa_list[file_date_starting_line - 1:]

                #Add temp dict data to case_list
                case_list.append(temp_dict)

            #Reset temp_dict
            temp_dict = {}

            #Reset lists
            dispo_dates_list = []
            disposition_list = []
            #coa_list = []
            
            temp_dict['County'] = county

            #Gather the cause number
            temp_dict['Cause Number'] = line[:17].strip()

            #Get the first dispo date
            dispo_dates_list.append(line[17:29].strip())

            #Get first disposition
            disposition_list.append(line[29:64].strip())

            #Get first coa
            #coa_list.append(line[64:].strip())

            #End of line, so move to next one

        else:
            #Get additional dispo date
            dispo_date = line[17:29].strip()

            #Check if dispo_date is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            if dispo_date.isspace() == False and len(dispo_date) > 0:
                dispo_dates_list.append(dispo_date.strip())

            #Get additional disposition
            disposition = line[29:64].strip()

            #Check if disposition is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            if disposition.isspace() == False and len(disposition) > 0:
                disposition_list.append(disposition.strip())

            #Get additional coa
            #coa = line[64:].strip()

            #Check if coa is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            #if coa.isspace() == False and len(coa) > 0:
                #coa_list.append(coa.strip())

            #End of line

    #Check that the last case was added to the list
    #If not, add it
    #Save the last date as the case file date
    temp_dict['File Date'] = dispo_dates_list[-1]

    #Now get the length of the list and remove the last date from the dispo dates list
    file_date_starting_line = len(dispo_dates_list)
    dispo_dates_list.pop()
    dispo_count += len(dispo_dates_list)
    
    #Add disposed dates to temp_dict
    temp_dict['Disposed Dates'] = dispo_dates_list

    temp_dict['Dispositions'] = disposition_list[:file_date_starting_line - 1]

    #temp_dict['Causes of Action'] = coa_list[:file_date_starting_line - 1]

    #temp_dict['Plaintiffs'] = disposition_list[file_date_starting_line - 1:]

    #temp_dict['Defendants'] = coa_list[file_date_starting_line - 1:]

    #Add temp dict data to case_list
    case_list.append(temp_dict)
    
    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')
    
    print(f'There were {dispo_count} dispositions.')
    
    #Create dataframe
    df = pd.DataFrame(case_list)
    
    #Add the as of date
    df['Disposed As Of Date'] = report_as_of_date
    
    return df

# Test Function

In [198]:
#Set up the pdf path
pdf_path = '/Users/johnathonsmith/Downloads/Disposed_Case_Reports/Disposed CV Zavala 01012019 thru 08242023.pdf'

In [199]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#Used to display text
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open(pdf_path, 'rb') as fh:

    for page_num, page in enumerate(PDFPage.get_pages(fh, caching=True, check_extractable=True)):
        #Process the current page
        page_interpreter.process_page(page)

    #Save the current page's text to a variable
    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

#print(text)

In [200]:
df = extract_civil_disposed_cases(text)

Collected Data From 657 Cases.
There were 657 dispositions.


In [201]:
df

Unnamed: 0,County,Cause Number,File Date,Disposed Dates,Dispositions,Disposed As Of Date
0,Zavala,18-07-14202-ZCV,07/06/2018,[01/02/2019],[CASES NON-SUITED OR DISMISSED BY],08/24/23
1,Zavala,03-03-10656-ZCV,06/25/2018,[01/14/2019],[AGREED JUDGMENTS],08/24/23
2,Zavala,13-12-1804-TX,12/23/2013,[01/14/2019],[CASES DISMISSED FOR WANT OF PROSE],08/24/23
3,Zavala,14-02-1850-TX,02/27/2014,[01/14/2019],[ALL OTHER DISPOSITIONS],08/24/23
4,Zavala,18-08-14226-ZCV,08/01/2018,[01/14/2019],[AGREED JUDGMENTS],08/24/23
...,...,...,...,...,...,...
652,Zavala,23-04-15150-ZCV,04/26/2023,[08/03/2023],[AGREED JUDGMENTS],08/24/23
653,Zavala,96-06-09380-ZCV,08/03/2023,[08/03/2023],[AGREED JUDGMENTS],08/24/23
654,Zavala,20-09-14634-ZCV,09/15/2020,[08/04/2023],[CASES NON-SUITED OR DISMISSED BY],08/24/23
655,Zavala,11-05-12494-ZCV,05/04/2011,[08/14/2023],[ALL OTHER DISPOSITIONS],08/24/23


In [184]:
df[df['Cause Number'] == '19-09-37842-MCV']

Unnamed: 0,County,Cause Number,File Date,Disposed Dates,Dispositions
2799,Maverick,19-09-37842-MCV,09/03/2019,"[05/09/2022, 04/11/2023]","[CASES NON-SUITED OR DISMISSED BY, CASES NON-S..."
