In [1]:
import os
import re

import pandas as pd
from pdfminer.high_level import extract_text

In [2]:
#Sample Maverick inactivity report used for the exploration below.
#'~' is expanded so the path is not tied to one user's home directory.
test_file = os.path.expanduser('~/Downloads/25_03_25CVMaverickInactive.pdf')

In [7]:
#Extract the PDF's text as a single string, then preview the first 350 characters
#(in this report that span holds the banner lines and the column titles, see the output below)
content = extract_text(test_file)
content[:350]

"                                 LEOPOLDO VIELMA - DISTRICT CLERK'S OFFICE\n                                PENDING CIVIL CASES - INACTIVITY REPORT - RAN ON 03/26/2025\n                               FILE DATES: ALL FILE DATES - INACTIVE ON: 03/25/2025 - PAGE 1\n\nCAUSE #             FILE DATE   INACTIVE ST DT  INACTIVE END DT  INACTIVE REASON         "

In [9]:
#Treat the first 350 characters as the report header and strip the surrounding padding
#NOTE(review): 350 is a fixed cut that fits this report's layout - confirm it holds for other reports
header = content[:350].strip()
header

"LEOPOLDO VIELMA - DISTRICT CLERK'S OFFICE\n                                PENDING CIVIL CASES - INACTIVITY REPORT - RAN ON 03/26/2025\n                               FILE DATES: ALL FILE DATES - INACTIVE ON: 03/25/2025 - PAGE 1\n\nCAUSE #             FILE DATE   INACTIVE ST DT  INACTIVE END DT  INACTIVE REASON"

In [10]:
#Everything after the first 350 characters is treated as the body:
#one line per case followed by the 'TOTAL CASES LISTED' line (see the output below)
body = content[350:].strip()
body

'18-11-36606-MCV    11/09/2018       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        22-09-41428-MCV    09/30/2022       10/16/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        23-10-42622-MCV    10/02/2023       02/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        24-02-43040-MCV    02/05/2024       02/28/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        24-06-43488-MCV    06/05/2024       10/24/0224                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        24-06-43494-MCV    06/07/2024       07/19/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        24-09-43842-MCV    09/27/2024       01/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n        24-10-43944-MCV    10/29/2024       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    \n\nTOTAL CASES LISTED: 8'

In [16]:
#Break the body into a list with one string per line (each case sits on its own line)
cases = body.split('\n')
cases

['18-11-36606-MCV    11/09/2018       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        22-09-41428-MCV    09/30/2022       10/16/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        23-10-42622-MCV    10/02/2023       02/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-02-43040-MCV    02/05/2024       02/28/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43488-MCV    06/05/2024       10/24/0224                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43494-MCV    06/07/2024       07/19/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-09-43842-MCV    09/27/2024       01/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-10-43944-MCV    10/29/2024       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '',
 'TOTAL CASES LISTED: 8']

In [17]:
#Keep only the lines that contain something other than whitespace
cases = [line for line in cases if line and not line.isspace()]
cases

['18-11-36606-MCV    11/09/2018       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        22-09-41428-MCV    09/30/2022       10/16/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        23-10-42622-MCV    10/02/2023       02/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-02-43040-MCV    02/05/2024       02/28/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43488-MCV    06/05/2024       10/24/0224                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43494-MCV    06/07/2024       07/19/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-09-43842-MCV    09/27/2024       01/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-10-43944-MCV    10/29/2024       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 'TOTAL CASES LISTED: 8']

In [18]:
#Drop the last part, it's just a count of total cases.
#Guarded on the line's text so that re-running this cell cannot pop a real case off the list.
if cases and cases[-1].strip().startswith('TOTAL CASES LISTED'):
    cases.pop()
cases

['18-11-36606-MCV    11/09/2018       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        22-09-41428-MCV    09/30/2022       10/16/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        23-10-42622-MCV    10/02/2023       02/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-02-43040-MCV    02/05/2024       02/28/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43488-MCV    06/05/2024       10/24/0224                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-06-43494-MCV    06/07/2024       07/19/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-09-43842-MCV    09/27/2024       01/10/2025                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ',
 '        24-10-43944-MCV    10/29/2024       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE    ']

In [20]:
#As you access each case for the info, be sure to strip it first:
#every row after the first keeps its leading indentation, which would shift the fixed-width slices used below
case = cases[0].strip()
case

'18-11-36606-MCV    11/09/2018       12/09/2024                   ALL OTHER CASES ORDER ABATED BY A JUDGE'

In [24]:
#Get cause number: the first 19 characters of the stripped row (the number plus its padding)
cause_number = case[:19].strip()
cause_number

'18-11-36606-MCV'

In [28]:
#Get file date: characters 19 through 35 of the stripped row (the date plus its padding)
file_date = case[19:36].strip()
file_date

'11/09/2018'

In [34]:
#Get inactivity start date
#Since I don't have an example for the next date in the string,
#just capture the 10 characters needed for the date (MM/DD/YYYY), nothing more
inactive_date = case[36:46].strip()
inactive_date

'12/09/2024'

In [37]:
#Get the reactivated date
#Again, no example for this, so just select everything from the end
#of the last date to the beginning of the next field and strip it
#(an empty string means the row has no reactivated date)
reactivated_date = case[46:65].strip()
reactivated_date

''

In [39]:
#Get the inactive reason: everything from character 65 to the end of the stripped row
reason = case[65:].strip()
reason

'ALL OTHER CASES ORDER ABATED BY A JUDGE'

In [74]:
def build_inactive_cases_dataframe(text):
    """
    Build a dataframe of inactive cases from the text of an inactive cases PDF report.

    The text is split into a header (report banner plus column titles) and a body (one
    fixed-width line per case, normally followed by a 'TOTAL CASES LISTED: N' line). The
    county is looked up from the clerk's name in the header, and the report's
    'INACTIVE ON' date is stamped on every row.

    Parameter:
        -text: A string consisting of the text of the entire inactive cases PDF document.

    Returns:
        -df: A dataframe of the newly gathered inactive case info with the columns
             'County', 'Cause Number', 'File Date', 'Inactive Date', 'Reactivated Date',
             'Status', 'Inactive Reason', 'Original As Of Date' and 'Last As Of Date'.
             Dates are kept as the strings found in the report (no parsing or validation).
             An empty dataframe with no columns is returned when the report lists zero
             cases or when no case lines are found.
    """

    date_pattern = r"[0-9]{2}/[0-9]{2}/[0-9]{4}"

    #Clerk name printed in the header -> county (checked in this order)
    clerk_to_county = {
        'LEOPOLDO VIELMA': 'Maverick',
        'MARICELA G. GONZALEZ': 'Dimmit',
        'RACHEL P. RAMIREZ': 'Zavala',
    }

    #Separate the first header from the body.
    #The header ends with the column-title line (the one containing 'INACTIVE REASON').
    #If that line cannot be found, fall back to the fixed cut at 350 characters.
    title_pos = text.find('INACTIVE REASON')
    if title_pos == -1:
        header_end = 350
    else:
        header_end = text.find('\n', title_pos)
        if header_end == -1:
            header_end = len(text)
    header = text[:header_end]
    body = text[header_end:].strip()

    #Report date: prefer the date labelled 'INACTIVE ON'; otherwise use the second date
    #in the header (the first one is the 'RAN ON' date). None if the header has neither.
    as_of_match = re.search(r"INACTIVE ON:\s*(" + date_pattern + r")", header)
    if as_of_match:
        report_as_of_date = as_of_match.group(1)
    else:
        header_dates = re.findall(date_pattern, header)
        report_as_of_date = header_dates[1] if len(header_dates) > 1 else None

    #For county, check the name at the beginning of the header
    county = 'Unknown'
    for clerk_name, county_name in clerk_to_county.items():
        if clerk_name in header:
            county = county_name
            break

    #Split the body on '\n' to isolate each line, skip blank lines, and pull out the
    #'TOTAL CASES LISTED' line by its text instead of assuming it is always the last line
    reported_total = None
    case_lines = []
    for line in body.split('\n'):
        line = line.strip()
        if not line:
            continue
        if line.startswith('TOTAL CASES LISTED'):
            count_match = re.search(r"[0-9]+", line)
            if count_match:
                reported_total = int(count_match.group())
            continue
        case_lines.append(line)

    #If there are zero inactive cases on the report, return
    if reported_total == 0 or not case_lines:
        return pd.DataFrame()

    #Each stripped case line is fixed-width:
    #[0:19] cause number, [19:36] file date, [36:46] inactive date,
    #[46:65] reactivated date (may be blank), [65:] inactive reason
    case_list = [
        {
            'County': county,
            'Cause Number': line[:19].strip(),
            'File Date': line[19:36].strip(),
            'Inactive Date': line[36:46].strip(),
            'Reactivated Date': line[46:65].strip(),
            'Status': 'Inactive',
            'Inactive Reason': line[65:].strip(),
        }
        for line in case_lines
    ]

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Check the case count against the report's own total
    #NOTE(review): a mismatch may mean non-case lines (e.g. repeated page headers) were parsed as cases
    if reported_total is not None and reported_total != len(case_list):
        print(f'Warning: report lists {reported_total} cases but {len(case_list)} lines were parsed.')

    #Create dataframe
    df = pd.DataFrame(case_list)

    #Add 'Original As Of Date' and 'Last As Of Date' columns
    df["Original As Of Date"] = report_as_of_date
    df["Last As Of Date"] = report_as_of_date

    return df

In [54]:
#Test function on the Maverick report text loaded above
#NOTE(review): the recorded output below has no 'County' or 'Status' columns even though the
#function adds them, so it appears to come from an earlier version of the function - re-run to refresh
df = build_inactive_cases_dataframe(content)
df

Collected Data From 8 Cases.


Unnamed: 0,Cause Number,File Date,Inactive Date,Reactivated Date,Inactive Reason,Original As Of Date,Last As Of Date
0,18-11-36606-MCV,11/09/2018,12/09/2024,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
1,22-09-41428-MCV,09/30/2022,10/16/2024,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
2,23-10-42622-MCV,10/02/2023,02/10/2025,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
3,24-02-43040-MCV,02/05/2024,02/28/2025,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
4,24-06-43488-MCV,06/05/2024,10/24/0224,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
5,24-06-43494-MCV,06/07/2024,07/19/2024,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
6,24-09-43842-MCV,09/27/2024,01/10/2025,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025
7,24-10-43944-MCV,10/29/2024,12/09/2024,,ALL OTHER CASES ORDER ABATED BY A JUDGE,03/25/2025,03/25/2025


In [55]:
import pandas as pd
import re

from pdfminer.high_level import extract_text

In [68]:
#Test with a second county's report (Dimmit)
#NOTE(review): this was meant to be a zero-case report, but the recorded run below parsed 4 cases
#'~' is expanded so the path is not tied to one user's home directory
test_file = os.path.expanduser('~/Downloads/25_03_25CVDimmitInactive.pdf')

In [72]:
#Overwrite `content` with the text extracted from the file currently assigned to test_file
content = extract_text(test_file)

In [75]:
#Test function on the text extracted in the previous cell and display the resulting dataframe
df = build_inactive_cases_dataframe(content)
df

Collected Data From 4 Cases.


Unnamed: 0,County,Cause Number,File Date,Inactive Date,Reactivated Date,Status,Inactive Reason,Original As Of Date,Last As Of Date
0,Dimmit,14-10-12387-DCVAJ,10/07/2014,07/26/2017,,Inactive,CASES IN WHICH A NOTICE OR SUGGESTION OF BA,03/25/2025,03/25/2025
1,Dimmit,19-04-13643-DCVAJ,04/10/2019,05/10/2019,,Inactive,CASES IN WHICH A NOTICE OR SUGGESTION OF BA,03/25/2025,03/25/2025
2,Dimmit,20-08-03124-DTX,08/28/2020,03/03/2025,,Inactive,"CASES PLACED IN ""HOLD FOR JUDGMENT/SETTLEME",03/25/2025,03/25/2025
3,Dimmit,20-11-03148-DTX,11/28/2020,09/22/2022,,Inactive,,03/25/2025,03/25/2025
