In [1]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

#Custom module
import jsmith_acquire

# Read In Criminal Case PDF

Start with just the first page. Break it down and gather the available info.

In [2]:
#Set up the pdf path
#NOTE(review): this is an absolute, machine-specific path to the Maverick report. The output of the
#"Test Function" section further down lists Dimmit records, so this value was presumably pointed at
#another file between runs -- confirm which report each cell was executed against.
pdf_path = '/Users/johnathonsmith/Downloads/CR.PEND_Maverick_1-21-2022.pdf'

In [3]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer; the converter writes every extracted character into it
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:

        #maxpages=1 -> only the first page is interpreted
        for page in PDFPage.get_pages(fh, caching=True, maxpages=1, check_extractable=True):
            page_interpreter.process_page(page)

    #Everything the converter wrote (here: the first page) as a single string
    text = fake_file_handle.getvalue()
finally:
    #Release the handles even when opening or parsing the PDF fails
    converter.close()
    fake_file_handle.close()

print(text)

                                                  MAVERICK COUNTY CRIMINAL PENDING REPORT -- PAGE: 1
                                                           LEOPOLDO VIELMA, DISTRICT CLERK
                                                              RUN ON 01/21/2022  09:52am
                                                                   AS OF 01/21/2022

CAUSE #           FILE DATE  DEFENDANT NAME             ATTORNEY         BONDSMAN NAME    OFFENSE DESCRIPTION                  CASE STATUS            
00-05-05160-CR   05/09/2000  LOZANO JR., JUAN JESUS     HERNANDEZ, FELI                   DEADLY CONDUCT DISCHARGE FIREARM     ACTIVE                 
02-02-05382-MCR  02/08/2002  FLORES, MARIA DEL ROSARIO  RHODES, CRAWFOR                   POSS MARIJ >5LBS<=50LBS              ACTIVE                 
03-03-05484-MCR  03/27/2003  FUENTES-GARCIA, OSCAR      RAMON, ALBERTO                    BURGLARY OF HABITATION               ACTIVE                 
03-03-05485-MCR  03/27/2003  F

In [4]:
print(repr(text))

"                                                  MAVERICK COUNTY CRIMINAL PENDING REPORT -- PAGE: 1\n                                                           LEOPOLDO VIELMA, DISTRICT CLERK\n                                                              RUN ON 01/21/2022  09:52am\n                                                                   AS OF 01/21/2022\n\nCAUSE #           FILE DATE  DEFENDANT NAME             ATTORNEY         BONDSMAN NAME    OFFENSE DESCRIPTION                  CASE STATUS            \n00-05-05160-CR   05/09/2000  LOZANO JR., JUAN JESUS     HERNANDEZ, FELI                   DEADLY CONDUCT DISCHARGE FIREARM     ACTIVE                 \n02-02-05382-MCR  02/08/2002  FLORES, MARIA DEL ROSARIO  RHODES, CRAWFOR                   POSS MARIJ >5LBS<=50LBS              ACTIVE                 \n03-03-05484-MCR  03/27/2003  FUENTES-GARCIA, OSCAR      RAMON, ALBERTO                    BURGLARY OF HABITATION               ACTIVE                 \n03-03-05485-MCR  03/

In [5]:
#Separate the header from the body
#517 is a hard-coded offset: for this page it lands right after the newline that ends the
#column-title row (see the output below).
#NOTE(review): presumably only valid for this report's header text -- confirm before reusing elsewhere.
header = text[:517]
header

'                                                  MAVERICK COUNTY CRIMINAL PENDING REPORT -- PAGE: 1\n                                                           LEOPOLDO VIELMA, DISTRICT CLERK\n                                                              RUN ON 01/21/2022  09:52am\n                                                                   AS OF 01/21/2022\n\nCAUSE #           FILE DATE  DEFENDANT NAME             ATTORNEY         BONDSMAN NAME    OFFENSE DESCRIPTION                  CASE STATUS            \n'

In [6]:
#body
#Everything after the same hard-coded 517-character offset that was used to cut out the header
body = text[517:]
body

"00-05-05160-CR   05/09/2000  LOZANO JR., JUAN JESUS     HERNANDEZ, FELI                   DEADLY CONDUCT DISCHARGE FIREARM     ACTIVE                 \n02-02-05382-MCR  02/08/2002  FLORES, MARIA DEL ROSARIO  RHODES, CRAWFOR                   POSS MARIJ >5LBS<=50LBS              ACTIVE                 \n03-03-05484-MCR  03/27/2003  FUENTES-GARCIA, OSCAR      RAMON, ALBERTO                    BURGLARY OF HABITATION               ACTIVE                 \n03-03-05485-MCR  03/27/2003  FUENTES-GARCIA, OSCAR                                        POSS CS PG 1 >=4G<200G               ACTIVE                 \n04-09-05658-MCR  09/30/2004  MENCHACA, MARTHA           HARPER, JAD P.                    THEFT PROP>=$1500<$20K               ACTIVE                 \n04-10-05690-MCR  10/28/2004  TREVINO, CARLOS JESUS      GONZALEZ, DANIE                   THEFT PROP>=$1500<$20K               ACTIVE                 \n05-05-05774-MCR  05/12/2005  NAVARRO, CARMEN            TORRES, GREGORY                

In [7]:
#Remove leading and trailing whitespace
body = body.strip()

In [8]:
#Try splitting the cases up on the new lines
cases = body.split('\n')

In [9]:
#How many cases per page?
len(cases)

48

In [10]:
cases

['00-05-05160-CR   05/09/2000  LOZANO JR., JUAN JESUS     HERNANDEZ, FELI                   DEADLY CONDUCT DISCHARGE FIREARM     ACTIVE                 ',
 '02-02-05382-MCR  02/08/2002  FLORES, MARIA DEL ROSARIO  RHODES, CRAWFOR                   POSS MARIJ >5LBS<=50LBS              ACTIVE                 ',
 '03-03-05484-MCR  03/27/2003  FUENTES-GARCIA, OSCAR      RAMON, ALBERTO                    BURGLARY OF HABITATION               ACTIVE                 ',
 '03-03-05485-MCR  03/27/2003  FUENTES-GARCIA, OSCAR                                        POSS CS PG 1 >=4G<200G               ACTIVE                 ',
 '04-09-05658-MCR  09/30/2004  MENCHACA, MARTHA           HARPER, JAD P.                    THEFT PROP>=$1500<$20K               ACTIVE                 ',
 '04-10-05690-MCR  10/28/2004  TREVINO, CARLOS JESUS      GONZALEZ, DANIE                   THEFT PROP>=$1500<$20K               ACTIVE                 ',
 '05-05-05774-MCR  05/12/2005  NAVARRO, CARMEN            TORRES, GREG

In [11]:
#Just work with a case that has information in all fields for now
case = cases[8]
case

'08-12-06344-MCR  12/03/2008  HERNANDEZ, MANUEL ANGEL    JUAREZ, EDGAR    ACES BONDING CO  AGG ASSAULT W/DEADLY WEAPON          ACTIVE                 '

In [12]:
#Strip whitespace
case = case.strip()

In [13]:
case

'08-12-06344-MCR  12/03/2008  HERNANDEZ, MANUEL ANGEL    JUAREZ, EDGAR    ACES BONDING CO  AGG ASSAULT W/DEADLY WEAPON          ACTIVE'

# Gather Info For a Single Case

__Cause Number__

In [14]:
#Gather the cause number
cause_num = case[:17].strip()
cause_num

'08-12-06344-MCR'

__File Date__

In [15]:
#Gather the file date
file_date = case[17:29].strip()
file_date

'12/03/2008'

__Defendant Name__

In [16]:
defendant_name = case[29:56].strip()
defendant_name

'HERNANDEZ, MANUEL ANGEL'

__Attorney__

In [17]:
attorney = case[56:72].strip()
attorney

'JUAREZ, EDGAR'

__Bondsman Name__

In [18]:
bondsman = case[72:90].strip()
bondsman

'ACES BONDING CO'

__Offense Description__

In [19]:
offense = case[90:127].strip()
offense

'AGG ASSAULT W/DEADLY WEAPON'

__Case Status__

Not sure we're going to need this, but capture it anyway for now.

In [20]:
status = case[127:].strip()
status

'ACTIVE'

# Load The First Two Pages

Using the first two pages, set up a loop to gather all the info for each case and create a dataframe.

In [21]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer; the converter writes every extracted character into it
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:

        #maxpages=2 -> only the first two pages are interpreted
        for page in PDFPage.get_pages(fh, caching=True, maxpages=2, check_extractable=True):
            page_interpreter.process_page(page)

    #Text of both pages as a single string
    text = fake_file_handle.getvalue()
finally:
    #Release the handles even when opening or parsing the PDF fails
    converter.close()
    fake_file_handle.close()

In [22]:
#Separate the first header from the body
#We'll use this to identify the county later
header = text[:517]

In [23]:
#body
#NOTE: this overwrites `text` with a slice of itself, so running the cell a second time without
#re-running the extraction cell first drops a further 517 characters.
text = text[517:]

In [24]:
#Remove leading and trailing whitespaces from the body text
text = text.strip()

In [25]:
#Set up regex to remove all subsequent headers
#This regex should identify the headers even if the name of the district clerk changes later on.
#Centring whitespace is matched with ' *' / '.*' instead of fixed runs of spaces, and the page title
#may be preceded by blank lines and a form feed (pdfminer's text converter writes one per page break),
#so the pattern no longer depends on the exact width of the county or clerk name.
text = re.sub(
    r" *\n[\n\f ]*.{6,8} COUNTY CRIMINAL PENDING REPORT -- PAGE: \d{1,3}\n"
    r".*, DISTRICT CLERK\n"
    r" *RUN ON .{19}\n"
    r" *AS OF .{10}\s+"
    r"CAUSE # +FILE DATE +DEFENDANT NAME +ATTORNEY +BONDSMAN NAME +OFFENSE DESCRIPTION +CASE STATUS",
    '',
    text,
)

In [32]:
#Split the text on the '\n' to isolate each case
cases = text.split('\n')

In [33]:
#How many? Should be 48 per page. 48 * 2 = 96
len(cases)

96

In [34]:
#Establish a container list for the dictionaries
case_list = []

In [35]:
#Fixed-width layout of a case line: one slice per column, in the order the columns are reported
field_slices = (
    slice(None, 17),    #cause number
    slice(17, 29),      #file date
    slice(29, 56),      #defendant name
    slice(56, 72),      #attorney
    slice(72, 90),      #bondsman
    slice(90, 127),     #offense description
    slice(127, None),   #case status
)

#Dataframe column names, aligned one-to-one with field_slices
field_labels = (
    'Cause Number',
    'File Date',
    'Defendant Name',
    'Attorney Name',
    'Bondsman Name',
    'Offense',
    'Status',
)

#Turn every case line into a dict of its stripped fields and collect the dicts in case_list
for case in cases:
    #Strip leading and trailing whitespace before slicing
    case = case.strip()

    #Cut the line into its seven fields in one pass
    (cause_num, file_date, defendant_name,
     attorney, bondsman, offense, status) = [case[span].strip() for span in field_slices]

    #Pair each label with its value; insertion order matches the column order above
    temp_dict = dict(zip(
        field_labels,
        (cause_num, file_date, defendant_name, attorney, bondsman, offense, status),
    ))

    #Append to case list
    case_list.append(temp_dict)

In [36]:
#Create dataframe
df = pd.DataFrame(case_list)

In [37]:
df

Unnamed: 0,Cause Number,File Date,Defendant Name,Attorney Name,Bondsman Name,Offense,Status
0,00-05-05160-CR,05/09/2000,"LOZANO JR., JUAN JESUS","HERNANDEZ, FELI",,DEADLY CONDUCT DISCHARGE FIREARM,ACTIVE
1,02-02-05382-MCR,02/08/2002,"FLORES, MARIA DEL ROSARIO","RHODES, CRAWFOR",,POSS MARIJ >5LBS<=50LBS,ACTIVE
2,03-03-05484-MCR,03/27/2003,"FUENTES-GARCIA, OSCAR","RAMON, ALBERTO",,BURGLARY OF HABITATION,ACTIVE
3,03-03-05485-MCR,03/27/2003,"FUENTES-GARCIA, OSCAR",,,POSS CS PG 1 >=4G<200G,ACTIVE
4,04-09-05658-MCR,09/30/2004,"MENCHACA, MARTHA","HARPER, JAD P.",,THEFT PROP>=$1500<$20K,ACTIVE
...,...,...,...,...,...,...,...
91,20-02-08082-MCR,02/13/2020,"MARTINEZ-DAVILA, EDGAR OM","LOS SANTOS, LUI",AMIGO BAIL BOND,THEFT PROP >=$30K<$150K,ACTIVE
92,20-02-08084-MCR,02/13/2020,"CASSIAS JR., RICHARD",,CANTU'S BAIL BO,POSS CS PG 1 <1G,ACTIVE
93,20-06-08090-MCR,06/19/2020,"VIGIL, SAMUEL","JUAREZ, EDGAR H",ACES BONDING CO,INJURY CHILD/ELDERLY/DISABLE W/INT,ACTIVE
94,20-06-08092-MCR,06/19/2020,"RODRIGUEZ, ALFREDO","LOS SANTOS, LUI",CANTU'S BAIL BO,"THEFT OF SERV >=$2,500<$30K",ACTIVE


# Build Function

Create a function to gather the info from the entire document.

In [160]:
def extract_criminal_pdf_data(text):
    """
    Parse the text of a county "CRIMINAL PENDING REPORT" PDF into a dataframe with one row per case.

    The first page header is split off and used to detect the county. Repeated page headers,
    the "MTR/MTA CASES FILED" separator and the trailing case-count summary are removed, and every
    remaining line that has content in its first 17 characters is sliced into fixed-width fields.

    Parameter:
        -text: A string consisting of the text of the entire PDF document.

    Returns:
        -df: A dataframe with the columns County, Cause Number, File Date, Defendant Name,
             Attorney Name, Bondsman Name, Offense and Status. The columns are present even when
             no case was found. County is 'Something went wrong!' when the header names none of
             the known counties.

    Side effect:
        Prints the number of cases collected.
    """

    #Fixed-width layout of a case line: (dataframe column, slice start, slice end)
    field_layout = [
        ('Cause Number', 0, 17),
        ('File Date', 17, 29),
        ('Defendant Name', 29, 56),
        ('Attorney Name', 56, 72),
        ('Bondsman Name', 72, 90),
        ('Offense', 90, 127),
        ('Status', 127, None),
    ]
    columns = ['County'] + [label for label, _, _ in field_layout]

    #Initialize container list
    case_list = []

    #Separate the first header from the body at the end of the first column-title row.
    #The header length depends on the county and clerk names, so it is located rather than assumed;
    #the previous fixed offset of 517 is kept only as a fallback when no title row is found.
    title_row = re.search(r'^CAUSE #.*\n?', text, flags=re.MULTILINE)
    split_at = title_row.end() if title_row else 517

    #We'll use the header to identify the county
    header = text[:split_at]

    #Body, without leading and trailing whitespace
    body = text[split_at:].strip()

    #Check for county names inside the header info
    if 'MAVERICK' in header:
        county = 'Maverick'
    elif 'DIMMIT' in header:
        county = 'Dimmit'
    elif 'ZAVALA' in header:
        county = 'Zavala'
    else:
        county = 'Something went wrong!'

    #Remove all subsequent page headers.
    #The match starts at the newline that ends the last case line of the previous page. The page
    #title may be preceded by blank lines, a form feed (pdfminer's text converter writes one per page
    #break) and a varying amount of centring whitespace, so none of those are matched by fixed counts.
    #The clerk's name is matched loosely so the pattern survives a change of district clerk.
    body = re.sub(
        r"\n[\n\f ]*.{6,8} COUNTY CRIMINAL PENDING REPORT -- PAGE: \d{1,3}\n"
        r".*, DISTRICT CLERK\n"
        r" *RUN ON .{19}\n"
        r" *AS OF .{10}\s+"
        r"CAUSE # +FILE DATE +DEFENDANT NAME +ATTORNEY +BONDSMAN NAME +OFFENSE DESCRIPTION +CASE STATUS",
        '',
        body,
    )

    #Remove the MTR/MTA separation (raw string: \d is a regex escape, not a Python escape)
    body = re.sub(r"TOTAL FILED CASES: \d{1,4}\s+MTR/MTA CASES FILED\n\n", '', body)

    #Remove the case count section at the end
    body = re.sub(
        r"\nNUMBER OF MTR/MTA CASES: \d{1,4}\n\n"
        r".*\d{1,4}\n"
        r".*\d{1,4}\n"
        r"-+\n"
        r".*\d{1,4}",
        '',
        body,
    )

    #Split the text on the '\n' to isolate each case
    for line in body.split('\n'):
        #Skip lines that are empty or consist of whitespace only
        if not line or line.isspace():
            continue

        #Verify this is a valid case: the cause number column must not be blank
        if line[:17].isspace():
            continue

        #Strip leading and trailing whitespace
        line = line.strip()

        #Slice out every field and strip its column padding
        record = {'County': county}
        for label, start, end in field_layout:
            record[label] = line[start:end].strip()

        #Append to case list
        case_list.append(record)

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Create dataframe; naming the columns keeps the schema stable when case_list is empty
    df = pd.DataFrame(case_list, columns=columns)

    return df

# Test Function

In [161]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer; the converter writes every extracted character into it
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:

        #No maxpages -> every page of the document is interpreted
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Text of the whole document as a single string
    text = fake_file_handle.getvalue()
finally:
    #Release the handles even when opening or parsing the PDF fails
    converter.close()
    fake_file_handle.close()

In [162]:
df = extract_criminal_pdf_data(text)

Collected Data From 78 Cases.


In [163]:
df

Unnamed: 0,County,Cause Number,File Date,Defendant Name,Attorney Name,Bondsman Name,Offense,Status
0,Dimmit,02-10-02242-DCR,10/03/2005,"MONTOYA, ROBERT","CABELLO, RUFINO",,POSS CS PG 1 <1G,
1,Dimmit,10-09-02599-DCR,09/17/2010,"LOZANO, JR., REYMUNDO",,,DRIVING WHILE INTOXICATED 3RD OR MO,
2,Dimmit,12-11-02706-DCR,11/14/2012,"FLOYD, SARAH KRISTINE",,,POSS CS PG 1 <1G,
3,Dimmit,13-10-02816-DCR,10/09/2013,"GONZALEZ, JOSE ANTONIO","RODRIGUEZ, JR.,",,INTOXICATED MANSLAUGHTER W/VEHICLE,
4,Dimmit,13-10-02817-DCR,10/09/2013,"GONZALEZ, JOSE ANTONIO","RODRIGUEZ, JR.,",,INTOXICATED MANSLAUGHTER W/VEHICLE,
...,...,...,...,...,...,...,...,...
73,Dimmit,19-02-03270-DCR,11/06/2019,"CARREON JR., JUAN JOSE","RODRIGUEZ, JR.,",SAENZ BAIL BOND,POSS CS PG 1 <1G,
74,Dimmit,19-08-03300-DCR,11/01/2021,"LONGORIA, SAMUEL SCOTT","PONCE, FRANCISC",,POSS CS PG 1 <1G,
75,Dimmit,19-08-03303-DCR,11/01/2021,"LONGORIA, SAMUEL SCOTT","PONCE, FRANCISC",,POSS CS PG 1 <1G,
76,Dimmit,19-08-03304-DCR,11/01/2021,"LONGORIA, SAMUEL SCOTT","PONCE, FRANCISC",,POSS CS PG 1 <1G,


In [147]:
def build_criminal_cases_dataframe(pdf_path):
    """
    This function reads in the criminal cases pdf document and extracts all available information for each case.
    It then returns a dataframe.

    The text of every page is extracted with pdfminer3 and the resulting string is handed to
    extract_criminal_pdf_data, which does the parsing.

    Parameter:
        - pdf_path: The file path for the pdf to be read.

    Returns:
        - df: A dataframe of the resulting case information
    """

    #Set up resource manager to handle pdf content. text, images, etc.
    resource_manager = PDFResourceManager()

    #In-memory text buffer; the converter writes every extracted character into it
    text_buffer = io.StringIO()

    #Set up converter
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())

    #Set up page interpreter
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    try:
        with open(pdf_path, 'rb') as fh:

            #No maxpages -> every page of the document is interpreted
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)

        #Text of the whole document as a single string
        text = text_buffer.getvalue()
    finally:
        #Release the handles even when opening or parsing the PDF fails
        converter.close()
        text_buffer.close()

    #Collect criminal case info and get the df
    return extract_criminal_pdf_data(text)