In [1]:
import pandas as pd
import re

from pdfminer.high_level import extract_text

In [6]:
#Testing new function that creates a new row for every disposition, not just for every cause number.
#Use cause number 14-10-30152-MCV to test if it is working. This one has 4 different dispositions.
test_file = '/Users/johnathonsmith/Downloads/CaseFilesForTesting/24_12_16_CV_Maverick_Disposed.pdf'

In [7]:
content = extract_text(test_file)
content[:500]

'           DECEMBER 17, 2024 07:25am - MAVERICK DISTRICT CLERK OFFICE - 830-773-2629\n            CIVIL DISPOSED CASES - 01/01/19 TO 12/16/24 - INCLUDE CONFIDENTIAL CASES\n                                       COURT: 293 - PAGE 1\n\nCAUSE NO.        DISPO DT/S  DISPOSITION/S                      CAUSE OF ACTION/REASON REOPENED \n                 FILE DATE   PLAINTIFF                          DEFENDANT                       \n\n18-04-35458-MCV  07/23/2019  P                                  1-DIVORCE -'

In [52]:
#This is the acquire function for civil disposed case files.
def build_civil_disposed_cases_dataframe(text):
    """
    This function takes in the entire PDF document as a string of text. It will gather the info for each case
    and add the info to a dictionary. The dictionary for each case will be added to a list which will be turned into
    a dataframe.
    
    Parameter:
        -text: A string consisting of the text of the entire disposed cases PDF document.
        
    Returns:
        -df: A dataframe of the newly gathered disposed case info
    """
    
    #Initialize containers
    case_list = []
    dispo_dates_list = []
    disposition_list = []
    temp_dict = {}
    
    #Get the header and remove surrounding whitespace
    header = text[:420].strip()

    #Get the body and remove surrounding whitespace
    body = text[420:].strip()

    #Use regex to find county. This makes it more versatile.
    county = re.findall(r"\s{1}[A-Za-z]* DISTRICT CLERK OFFICE", header)[0][:-22].strip()
    
    #Use regex to get the court.
    court = re.findall(r"COURT: [0-9]*", header)[0][6:].strip()
    
    #Get the report 'AS OF' date:
    report_as_of_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{2}", header)[1]
        
    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if some of the info changes later on
    #Can't include the page break in this regex because the formatting is awful
    body = re.sub(r"""\s*[A-Z]{3,9}\s[0-9]{1,2},\s[0-9]{4}\s[a-zA-Z0-9 \n/-]*:[a-zA-Z0-9 \n/-]*:[A-Za-z0-9 \n\./-]*DEFENDANT\s{1,23}\n""", '', body)
    
    #Since the formatting is awful, manually remove the page break symbol '\n\x0c'
    body = body.replace('\n\x0c','')
    
    #Split the text on the \n to isolate each case
    cases = body.split('\n')
    
    #Drop the last two case. They're just the total case counts from the report
    cases.pop()
    cases.pop()
    
    #Remove cases that happen to be empty or consist of whitespace only
    cases = [case for case in cases if case.isspace() == False and len(case) > 0]
    
    for line in cases:
        #Check if line is the start of a new case
        if not line[0].isspace():
            #Check if the temp_dict is empty.
            #If not, add temp_dict data to case_list
            if bool(temp_dict) == True:
                #Save the cause number and last date as the case file date
                cause_number = temp_dict['Cause Number']
                file_date = dispo_dates_list[-1]
                #temp_dict['File Date'] = dispo_dates_list[-1]

                #Loop through the dispo dates and dispostions to create a separate case for each one.
                #We want each disposition to be individual.
                #Don't iterate through the last date because that is the initial file date for the casue number.
                for i in range(0, (len(dispo_dates_list) - 1)):
                    #For testing
                    if temp_dict['Cause Number'] == '14-10-30152-MCV':
                        print("Case List At Beginning of Loop #: " + str(i))
                        print(case_list[-4:])
                        print()
                        
                    #Reset temp dict
                    temp_dict = {}
                    temp_dict['Cause Number'] = cause_number
                    temp_dict['File Date'] = file_date
                        
                    #Add disposed dates to temp_dict
                    temp_dict['Disposed Date'] = dispo_dates_list[i]
                    #For testing:
                    #if temp_dict['Cause Number'] == '14-10-30152-MCV':
                        #print("Disposed Date #" + str(i) + ": " + dispo_dates_list[i])
                        #print("Temp Dict Disposed Date: " + temp_dict['Disposed Date'])
                    
                    
                    temp_dict['Disposition'] = disposition_list[i]
                    #For testing:
                    if temp_dict['Cause Number'] == '14-10-30152-MCV':
                        #print("Disposition #" + str(i) + ": " + disposition_list[i])
                        #print("Temp Dict Disposition: " + temp_dict['Disposition'])
                        print("Case List Before Append:")
                        print(case_list[-4:])
                        print()

                    #Add temp dict data to case_list
                    case_list.append(temp_dict)
                    #For testing:
                    if temp_dict['Cause Number'] == '14-10-30152-MCV':
                        print("Disposition #" + str(i) + " dictionary:")
                        #print(case_list[-1])
                        print("Case List After Append: ")
                        print(case_list[-4:])
                        print()
                        print("----------------------------------------")
                        
                    #For testing:
                    #if temp_dict['Cause Number'] == '14-10-30152-MCV':
                        #print(pd.DataFrame(data = case_list[-4:], copy = False))

            #Reset temp_dict
            temp_dict = {}

            #Reset lists
            dispo_dates_list = []
            disposition_list = []

            #Gather the cause number
            temp_dict['Cause Number'] = line[:17].strip()

            #Get the first dispo date
            dispo_dates_list.append(line[17:29].strip())

            #Get first disposition
            disposition_list.append(line[29:64].strip())

            #End of line, so move to next one

        else:
            #Get additional dispo date
            dispo_date = line[17:29].strip()
            
            #Check if dispo_date is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            if dispo_date.isspace() == False and len(dispo_date) > 0:
                dispo_dates_list.append(dispo_date.strip())

            #Get additional disposition
            disposition = line[29:64].strip()

            #Check if disposition is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            if disposition.isspace() == False and len(disposition) > 0:
                disposition_list.append(disposition.strip())

            #End of line

    #Check that the last case was added to the list
    #If not, add it
    #Save the last date as the case file date
    temp_dict['File Date'] = dispo_dates_list[-1]
    
    #Loop through the dispo dates and dispostions to create a separate case for each one.
    #We want each disposition to be individual.
    #Don't iterate through the last date because that is the initial file date for the casue number.
    for i in range(0, (len(dispo_dates_list) - 1)):
        
        #Add disposed dates to temp_dict
        temp_dict['Disposed Date'] = dispo_dates_list[i]

        temp_dict['Disposition'] = disposition_list[i]

        #Add temp dict data to case_list
        case_list.append(temp_dict)
    
    #How many dispositions?
    print(f'Collected Data From {len(case_list)} Dispositions.')
    
    #Create dataframe
    df = pd.DataFrame(case_list, copy = False)
    
    #Add the as of date, court, and county columns
    df['As Of Date'] = report_as_of_date
    df['Court'] = court
    df['County'] = county
    
    return df

In [53]:
df = build_civil_disposed_cases_dataframe(content)

Case List At Beginning of Loop #: 0
[{'Cause Number': '21-08-40268-MCV', 'File Date': '08/16/2021', 'Disposed Date': '11/08/2021', 'Disposition': 'AGREED JUDGMENTS'}, {'Cause Number': '97-04-14426-CV', 'File Date': '12/18/2003', 'Disposed Date': '11/08/2021', 'Disposition': 'ALL OTHER DISPOSITIONS'}, {'Cause Number': '20-07-39032-MCV', 'File Date': '07/20/2020', 'Disposed Date': '11/09/2021', 'Disposition': 'CASES DISMISSED FOR WANT OF PROSE'}, {'Cause Number': '21-02-39660-MCV', 'File Date': '02/12/2021', 'Disposed Date': '11/09/2021', 'Disposition': 'ALL OTHER DISPOSITIONS'}]

Case List Before Append:
[{'Cause Number': '21-08-40268-MCV', 'File Date': '08/16/2021', 'Disposed Date': '11/08/2021', 'Disposition': 'AGREED JUDGMENTS'}, {'Cause Number': '97-04-14426-CV', 'File Date': '12/18/2003', 'Disposed Date': '11/08/2021', 'Disposition': 'ALL OTHER DISPOSITIONS'}, {'Cause Number': '20-07-39032-MCV', 'File Date': '07/20/2020', 'Disposed Date': '11/09/2021', 'Disposition': 'CASES DISMISS

In [54]:
df[df['Cause Number'] == '14-10-30152-MCV']

Unnamed: 0,Cause Number,File Date,Disposed Date,Disposition,As Of Date,Court,County
2527,14-10-30152-MCV,10/14/2014,11/10/2021,FINAL JUDGMENTS AFTER NON-JURY TR,12/16/24,293,MAVERICK
2528,14-10-30152-MCV,10/14/2014,07/22/2019,ALL OTHER DISPOSITIONS,12/16/24,293,MAVERICK
2529,14-10-30152-MCV,10/14/2014,02/12/2020,ALL OTHER DISPOSITIONS,12/16/24,293,MAVERICK
2530,14-10-30152-MCV,10/14/2014,11/10/2021,FINAL JUDGMENTS AFTER NON-JURY TR,12/16/24,293,MAVERICK


# Solution Found!

It seems that the problem occurred when I tried updating the temp_dict in each loop iteration without first resetting it. The case_list would be affected when I simply changed two of the fields. This resulted in duplicated cases in the case list. But after including logic to reset the temp_dict at the start of each loop iteration, the case list was no longer affected by temp_dict changes. I'm not really sure why this even happened in the first place, but it might be a pointer issue behind the scenes.