In [1]:
import pandas as pd
import fitz #PyMuPDF
import re

from pdfminer.high_level import extract_text

In [2]:
#Path to the PDF report to parse (absolute, machine-specific path; change it to run elsewhere)
test_file = '/Users/johnathonsmith/Downloads/400TH PENDING CRIMINAL.pdf'

In [3]:
#Open the PDF report with PyMuPDF (imported as fitz)
#`document` is used below to load single pages and is passed to parse_criminal_report
document = fitz.open(test_file)

In [4]:
#Load the first page (index 0) to prototype the parsing on a single page
page = document.load_page(0)

In [5]:
#Extract the page as plain text; each value of the report's table ends up on its own line
text = page.get_text('text')
#Display the raw text to inspect its layout
text

'Case Number\nLocation\nJudicial Officer\nCase\nCategory\nCase Type\nStyle\nCase Status\nCase Subtype\nOffense\n00-DCR-033629\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Michael Samuel\nPending - Inactive\nINDECENCY WITH A CHILD/F-2 (03/25/2000)\n01-DCR-035105\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Jesus Garcia Aguilar\nPending - Inactive\nINDECENCY WITH A CHILD/F-2 (06/17/2001)\n04-DCR-039827\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Robert Penaloza\nPending - Inactive\nSEXUAL ASSAULT OF A CHILD/F-2 (11/15/2003)\n05-DCR-043199\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Jaime Martell Ponce\nPending - Inactive\nAGG ASSAULT BY/AGAINST PUB\nSERV/WITNESS/INFORMENT / F1 (10/13/200

In [24]:
#Cut off the end of the page after the last case's info
#str.find returns -1 when the footer is not present (for example when this cell is re-run
#after the footer was already removed); slicing with -1 would silently drop the last character
footer_start = text.find('\nCase Filed Date\nDate Range Type:\n')
if footer_start != -1:
    text = text[:footer_start]
text

'Case Number\nLocation\nJudicial Officer\nCase\nCategory\nCase Type\nStyle\nCase Status\nCase Subtype\nOffense\n00-DCR-033629\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Michael Samuel\nPending - Inactive\nINDECENCY WITH A CHILD/F-2 (03/25/2000)\n01-DCR-035105\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Jesus Garcia Aguilar\nPending - Inactive\nINDECENCY WITH A CHILD/F-2 (06/17/2001)\n04-DCR-039827\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Robert Penaloza\nPending - Inactive\nSEXUAL ASSAULT OF A CHILD/F-2 (11/15/2003)\n05-DCR-043199\n400th Judicial\nDistrict Court\nKrenek, Edward\nM.\nCriminal\nAdult Felony - Filed by Indictment\nThe State of Texas vs Jaime Martell Ponce\nPending - Inactive\nAGG ASSAULT BY/AGAINST PUB\nSERV/WITNESS/INFORMENT / F1 (10/13/200

In [25]:
#Collect every cause number on the page, in the order they appear
#Pattern: two digits, a dash, three letters drawn from D/C/V/R, a dash, then digits or capital letters
cause_num_pattern = re.compile(r'[0-9]{2}-[DCVR]{3}-[0-9A-Z]*')
cause_nums = cause_num_pattern.findall(text)
cause_nums

['00-DCR-033629',
 '01-DCR-035105',
 '04-DCR-039827',
 '05-DCR-043199',
 '08-DCR-048854',
 '08-DCR-050226',
 '09-DCR-051462',
 '09-DCR-053307',
 '11-DCR-058000']

In [8]:
#Isolate the first case: everything from its cause number up to the next cause number
first_case_start = text.find(cause_nums[0])
second_case_start = text.find(cause_nums[1])
case = text[first_case_start:second_case_start]

#Skip ahead to the 'Criminal' category line...
category_start = case.find('\nCriminal')
case = case[category_start:].strip()

#...and then drop that line, leaving case type, style, status and offense
category_line_end = case.find('\n')
case = case[category_line_end:].strip()
case

'Adult Felony - Filed by Indictment\nThe State of Texas vs Michael Samuel\nPending - Inactive\nINDECENCY WITH A CHILD/F-2 (03/25/2000)'

In [26]:
#Number of cases found on this page (one per cause number match)
num_cases = len(cause_nums)
num_cases

9

In [40]:
#Create empty lists to store info
#The parsing loop appends to each of these lists once per case,
#so re-run this cell before re-running the loop to avoid duplicated entries
case_types = []
styles = []
statuses = []
offenses = []

#Case types that fit on a single line of the extracted text
#Any other case type is treated by the loop as spanning two lines
single_line_case_types = [
    'Adult Felony - Filed by Indictment',
    'Unindicted Filing'
]

#Status values the loop searches for, in this order
#'Pending -\nUnindicted' contains a line break because that status spans two lines in the extracted text
possible_statuses = [
    'Pending - Inactive',
    'Pending -\nUnindicted',
    'Active',
    'Re-Opened',
    'Appealed'
]

In [41]:
#Now loop through each cases info by segmenting the text with the locations of each cause num
#Locate each cause number starting from the end of the previous one, so a cause number that
#is listed more than once on the page still gets its own segment
segment_starts = []
search_from = 0
for cause_num in cause_nums:
    segment_start = text.find(cause_num, search_from)
    segment_starts.append(segment_start)
    search_from = segment_start + len(cause_num)

for i, cause_num in enumerate(cause_nums):
    #The last case runs to the end of the text; every other case runs up to the next cause number
    if i == len(cause_nums) - 1:
        case_info = text[segment_starts[i]:].strip()
    else:
        case_info = text[segment_starts[i]:segment_starts[i+1]].strip()
    
    #We already have cause numbers
    #Location, Officer, and Category are all static in this report
    #So those will be hardcoded and added later
    case_info = case_info[case_info.find('\nCriminal'):].strip()
    case_info = case_info[case_info.find('\n'):].strip()
    
    #If the string up to the first '\n' matches any of the single line case types, then that is the entire case type
    #If it does not, then the case type is comprised of the strings from the first TWO '\n' characters
    case_type = case_info[:case_info.find('\n')]
    case_info = case_info[case_info.find('\n') + 1:]
    
    if case_type in single_line_case_types:
        case_types.append(case_type.strip())
    else:
        case_types.append(case_type + ' ' + case_info[:case_info.find('\n')].strip())
        case_info = case_info[case_info.find('\n') + 1:]
    
    #Use the first status (in list order) that is found, and only that one,
    #so every case adds exactly one status and one style and the lists stay aligned
    for ps in possible_statuses:
        status_start = case_info.find(ps)
        if status_start != -1:
            #Grab the status
            statuses.append(ps)
            #Everything before the status is the style
            styles.append(case_info[:status_start].strip())
            #The offense starts on the line after the status text
            #(skipping past the whole status also covers the two-line 'Pending -\nUnindicted')
            case_info = case_info[status_start + len(ps):].partition('\n')[2]
            break
    else:
        #No known status was found: add placeholders so the lists keep one entry per case
        statuses.append(None)
        styles.append(None)
    
    #At this point, the rest of the string should be the offense
    offenses.append(case_info.strip())

In [42]:
#Inspect the parsed case types (one entry per cause number)
case_types

['Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment',
 'Adult Felony - Filed by Indictment']

In [43]:
#Inspect the parsed statuses (one entry per cause number)
statuses

['Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Pending - Inactive',
 'Active']

In [44]:
#Inspect the parsed offenses; at this stage each one still includes the date in parentheses
offenses

['INDECENCY WITH A CHILD/F-2 (03/25/2000)',
 'INDECENCY WITH A CHILD/F-2 (06/17/2001)',
 'SEXUAL ASSAULT OF A CHILD/F-2 (11/15/2003)',
 'AGG ASSAULT BY/AGAINST PUB\nSERV/WITNESS/INFORMENT / F1 (10/13/2005)',
 'AGG ASSAULT W/ DEADLY WEAPON/ F2 (02/20/2008)',
 'INDECENCY W/CHILD SEXUAL CONTACT/ F2\n(09/05/2008)',
 'MAN DEL CS PG 1>=400G/ F* (01/07/2009)',
 'SEX OFFENDERS DUTY TO REGISTER\nLIFE/ANNUALLY (02/17/2009)',
 'THEFT PROP >=$20K<$100K (01/01/2009']

In [45]:
#Inspect the parsed styles (the "<plaintiff> vs <defendant>" line of each case)
styles

['The State of Texas vs Michael Samuel',
 'The State of Texas vs Jesus Garcia Aguilar',
 'The State of Texas vs Robert Penaloza',
 'The State of Texas vs Jaime Martell Ponce',
 'The State of Texas vs Baldomero Chavez Garcia',
 'The State of Texas vs Juan Fabian Campos-Estrada',
 'The State of Texas vs Victor Hugo Morales',
 'The State of Texas vs Alvaro Javier Silveira',
 'State of Texas vs Rafaila Soto Huerta']

In [61]:
#Two digits, a dash, three letters drawn from D/C/V/R, a dash, then digits or capital letters
_CAUSE_NUM_PATTERN = re.compile(r'[0-9]{2}-[DCVR]{3}-[0-9A-Z]*')

#Case types that fit on one line of the extracted text; any other case type is read from two lines
_SINGLE_LINE_CASE_TYPES = (
    'Adult Felony - Filed by Indictment',
    'Unindicted Filing',
)

#Searched in this order; the first one found in a case's text is used
_POSSIBLE_STATUSES = (
    'Pending - Inactive',
    'Pending -\nUnindicted',
    'Active',
    'Re-Opened',
    'Appealed',
)

#Tried in this order against the lower-cased style
_PARTY_SEPARATORS = (r'\svs\s', r'\svs\.\s', r'\sv\s', r'\sv\.\s')

_FOOTER_MARKER = '\nCase Filed Date\nDate Range Type:\n'
_LAST_PAGE_FOOTER_MARKER = '\nTotal Report Count\nCase Filed Date\nDate Range Type:\n'

_PARSED_FIELDS = (
    'Case Type',
    'Style',
    'Status',
    'Plaintiff',
    'Defendant',
    'Offense',
    'Offense Date',
)

_REPORT_COLUMNS = ['Cause Number', *_PARSED_FIELDS]


def _strip_page_footer(text, is_last_page):
    """Return the page text with everything after the last case's info removed."""
    if is_last_page:
        cut = text.find(_LAST_PAGE_FOOTER_MARKER)
        if cut != -1:
            text = text[:cut]
            #Also drop the line right before the marker (the value that goes with 'Total Report Count')
            last_newline = text.rfind('\n')
            return text[:last_newline] if last_newline != -1 else text
    
    #str.find returns -1 when the marker is missing; slicing with -1 would drop the last character,
    #so the text is returned unchanged in that case
    cut = text.find(_FOOTER_MARKER)
    return text[:cut] if cut != -1 else text


def _split_parties(style):
    """Split a style into (plaintiff, defendant) in title case, or (None, None) if no separator is found."""
    lowered = style.lower()
    for separator in _PARTY_SEPARATORS:
        if re.search(separator, lowered) is not None:
            #Split with the same pattern that matched, so a line break next to 'vs' is handled too
            parties = re.split(separator, lowered)
            return parties[0].title().strip(), parties[1].title().strip()
    return None, None


def _parse_case_segment(case_info):
    """Parse one case's text (from its cause number up to the next one).

    Returns (fields, problems): `fields` maps every name in _PARSED_FIELDS to the parsed
    value or None, and `problems` lists a message for each value that could not be parsed.
    """
    fields = dict.fromkeys(_PARSED_FIELDS)
    problems = []
    
    #We already have cause numbers
    #Location, Officer, and Category are all static in this report
    #So everything through the 'Criminal' category line is skipped
    _, category_marker, after_category = case_info.partition('\nCriminal')
    if not category_marker:
        problems.append("Category line 'Criminal' not found")
        return fields, problems
    case_info = after_category.partition('\n')[2].strip()
    if not case_info:
        problems.append('No text after the category line')
        return fields, problems
    
    #If the first line matches any of the single line case types, then that is the entire case type
    #If it does not, then the case type is comprised of the first TWO lines
    first_line, _, case_info = case_info.partition('\n')
    if first_line in _SINGLE_LINE_CASE_TYPES:
        fields['Case Type'] = first_line.strip()
    else:
        second_line, _, case_info = case_info.partition('\n')
        fields['Case Type'] = first_line + ' ' + second_line.strip()
    
    #Use the first status (in list order) that is found, and only that one
    for ps in _POSSIBLE_STATUSES:
        status_start = case_info.find(ps)
        if status_start != -1:
            fields['Status'] = ps
            #Everything before the status is the style
            fields['Style'] = case_info[:status_start].strip()
            #The offense starts on the line after the status text
            #(skipping past the whole status also covers the two-line 'Pending -\nUnindicted')
            case_info = case_info[status_start + len(ps):].partition('\n')[2]
            break
    else:
        problems.append('No known status found; style could not be separated from the offense')
    
    #At this point, the rest of the string should be the offense
    offense = case_info.strip()
    
    #The offense date is the entire string after the last '('
    date_start = offense.rfind('(')
    if date_start == -1:
        problems.append('No offense date found')
        fields['Offense'] = offense
    else:
        fields['Offense Date'] = offense[date_start:].replace('(', '').replace(')', '').strip()
        fields['Offense'] = offense[:date_start].strip()
    
    #The style should contain some variation of 'vs' between the plaintiff and the defendant
    if fields['Style'] is not None:
        fields['Plaintiff'], fields['Defendant'] = _split_parties(fields['Style'])
        if fields['Plaintiff'] is None:
            problems.append("No 'vs' separator found in the style")
    
    return fields, problems


def parse_criminal_report(document):
    """
    Parse every page of the pending criminal report into a single df.
    
    Parameters:
        document: the opened report. It is iterated page by page, each page must provide
            get_text('text'), and the document must expose page_count (used to recognise
            the last page, whose footer differs). In this notebook it is the PyMuPDF document.
    
    Returns:
        A df with one row per cause number found, in report order, with the columns
        'Cause Number', 'Case Type', 'Style', 'Status', 'Plaintiff', 'Defendant',
        'Offense' and 'Offense Date'. Plaintiff and Defendant are title-cased.
        A document without any cases gives an empty df that still has these columns.
    
    A value that cannot be parsed is left as None and a warning naming the page and cause
    number is printed; parsing continues so later pages are not lost.
    
    The df will still need to be cleaned up, and other static columns added.
    """
    #Collect one record per case and build the df once at the end
    records = []
    
    #Start at one so page_num can be compared with page_count and shown in warnings
    for page_num, page in enumerate(document, start=1):
        text = _strip_page_footer(page.get_text('text'), page_num == document.page_count)
        
        #Segment the page by the position of each cause number match
        #Using match positions (not text.find) keeps repeated cause numbers apart
        matches = list(_CAUSE_NUM_PATTERN.finditer(text))
        for i, match in enumerate(matches):
            segment_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            case_info = text[match.start():segment_end].strip()
            
            fields, problems = _parse_case_segment(case_info)
            for problem in problems:
                print(f"Warning on Page {page_num}, {match.group()}: {problem}")
            
            records.append({'Cause Number': match.group(), **fields})
    
    return pd.DataFrame(records, columns=_REPORT_COLUMNS)

In [62]:
#Parse every page of the opened report into a single dataframe
df = parse_criminal_report(document)
df

Unnamed: 0,Cause Number,Case Type,Style,Status,Plaintiff,Defendant,Offense,Offense Date
0,00-DCR-033629,Adult Felony - Filed by Indictment,The State of Texas vs Michael Samuel,Pending - Inactive,The State Of Texas,Michael Samuel,INDECENCY WITH A CHILD/F-2,03/25/2000
1,01-DCR-035105,Adult Felony - Filed by Indictment,The State of Texas vs Jesus Garcia Aguilar,Pending - Inactive,The State Of Texas,Jesus Garcia Aguilar,INDECENCY WITH A CHILD/F-2,06/17/2001
2,04-DCR-039827,Adult Felony - Filed by Indictment,The State of Texas vs Robert Penaloza,Pending - Inactive,The State Of Texas,Robert Penaloza,SEXUAL ASSAULT OF A CHILD/F-2,11/15/2003
3,05-DCR-043199,Adult Felony - Filed by Indictment,The State of Texas vs Jaime Martell Ponce,Pending - Inactive,The State Of Texas,Jaime Martell Ponce,AGG ASSAULT BY/AGAINST PUB\nSERV/WITNESS/INFOR...,10/13/2005
4,08-DCR-048854,Adult Felony - Filed by Indictment,The State of Texas vs Baldomero Chavez Garcia,Pending - Inactive,The State Of Texas,Baldomero Chavez Garcia,AGG ASSAULT W/ DEADLY WEAPON/ F2,02/20/2008
...,...,...,...,...,...,...,...,...
846,25-DCR-113379,Unindicted Filing,State of Texas vs Adam Elie Gale,Pending -\nUnindicted,State Of Texas,Adam Elie Gale,SOLICIT PROSTIT/OTHER PAYOR,10/30/2025
847,25-DCR-113385,Unindicted Filing,State of Texas vs Carlos Alberto Padron,Pending -\nUnindicted,State Of Texas,Carlos Alberto Padron,SOLICIT PROSTIT/OTHER PAYOR,10/30/2025
848,86-DCR-016183,Adult Felony - Filed by Indictment,The State of Texas vs Gloria Jean Nelson,Active,The State Of Texas,Gloria Jean Nelson,MURDER,11/06/1985
849,87-DCR-017681,Adult Felony - Filed by Indictment,The State of Texas vs Mark Steven Hartsell,Pending - Inactive,The State Of Texas,Mark Steven Hartsell,"THEFT PROP>=$1,500<$20K",06/09/1987


In [63]:
#Add the columns that hold the same value for every case in this report
df['Officer'] = 'Krenek, Edward M.'
df['Category'] = 'Criminal'
#The report text reads '400th Judicial' / 'District Court', so the location is '400th Judicial District Court'
df['Location'] = '400th Judicial District Court'

In [64]:
#Put the columns in their final order: the static columns follow the cause number, the status goes last
column_order = [
    'Cause Number',
    'Location',
    'Officer',
    'Category',
    'Case Type',
    'Style',
    'Plaintiff',
    'Defendant',
    'Offense',
    'Offense Date',
    'Status',
]
df = df[column_order]

In [65]:
#Write the final dataframe to a CSV file; index=False leaves the row index out of the file
#The destination is an absolute, machine-specific path; change it to run elsewhere
df.to_csv('/Users/johnathonsmith/Downloads/400th_pending_criminal_report.csv', index=False)