In [1]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

#Custom module
import jsmith_acquire

# Read In Criminal Case PDF

Start with just the first page. Break it down and gather the available info.

In [2]:
#Set up the pdf path
#NOTE(review): this is an absolute path on one user's machine -- point it at your own copy of the report before running
pdf_path = '/Users/johnathonsmith/Downloads/Files/June 5_Zavala Criminal _ DETAILED Pending Rpt.pdf'

In [3]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #maxpages=1 limits the extraction to the first page
        for page in PDFPage.get_pages(fh, caching=True, maxpages=1, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save everything the converter wrote to the buffer
    text = fake_file_handle.getvalue()
finally:
    #Close the handles even if a page fails to process
    converter.close()
    fake_file_handle.close()

print(text)

                                     CRIMINAL DETAILED PENDING CASES REPORT - 06/07/2023
                                    ZAVALA COUNTY - DISTRICT CLERK - 293RD DISTRICT COURT
                              FILED PENDING CASES AS OF 06/05/2023 - SORTED BY FILE DATE - PAGE 1

CAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS
   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       

96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      
                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       
07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    
   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         
09-0

In [4]:
print(repr(text))

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 06/07/2023\n                                    ZAVALA COUNTY - DISTRICT CLERK - 293RD DISTRICT COURT\n                              FILED PENDING CASES AS OF 06/05/2023 - SORTED BY FILE DATE - PAGE 1\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       \n\n96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD   

In [5]:
#Separate the header from the body
#517 is the character length of this report's first-page header block: the three centered
#title lines, a blank line and the two column-heading lines (see the output of this cell)
#NOTE(review): the offset only holds while the header keeps this exact length -- confirm for other reports
header = text[:517]
header

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 06/07/2023\n                                    ZAVALA COUNTY - DISTRICT CLERK - 293RD DISTRICT COURT\n                              FILED PENDING CASES AS OF 06/05/2023 - SORTED BY FILE DATE - PAGE 1\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       \n"

In [6]:
#body 
body = text[517:]
body

'\n96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009 

In [7]:
#Remove leading and trailing whitespace
body = body.strip()

In [8]:
#Try splitting the cases up on the new lines
cases = body.split('\n')

In [9]:
#How many cases per page? (in the detailed report, there are more new lines than cases)
len(cases)

38

In [10]:
cases

['96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      ',
 '                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       ',
 '07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    ',
 '   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         ',
 '09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      ',
 '                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     ',
 '09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      ',
 '                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    ',
 '03-12-0290

In [11]:
#Just work with the first line for now
case = cases[0]
case

'96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      '

# Gather Info For a Single Case

__Cause Number__

In [12]:
#Gather the cause number
cause_num = case[:22].strip()
cause_num

'96-04-02608-ZCR'

__File Date__

In [13]:
#Gather the file date
file_date = case[22:34].strip()
file_date

'04/19/1996'

__Defendant Name__

In [14]:
defendant_name = case[34:72].strip()
defendant_name

'POMPA-CONTRERAS, JESUS'

__Court__

In [15]:
court = case[72:79].strip()
court

'293'

__Docket Date__

In [16]:
docket_date = case[79:89].strip()
docket_date

'07/12/2023'

__Outstanding Warrants__

In [17]:
#Everything after the docket date
warrants = case[89:].strip()
warrants

''

__Attorney__

In [18]:
#The following information is found on subsequent lines due to the way the report is formatted.
#Use a case with all 3 pieces of info
case = cases[3]
case

'   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         '

In [19]:
attorney = case[:35].strip()
attorney

'PADILLA, ALFREDO Z.'

__First Offense__

In [20]:
offense = case[35:74].strip()
offense

'INDECENCY W/CHILD SEXUAL CONTACT'

__ST RPT Column__

In [21]:
st_rpt = case[74:].strip()
st_rpt

'INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD'

# Load The First Two Pages

Using the first two pages, set up a loop to gather all the info for each case and create a dataframe.

In [22]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #maxpages=2 limits the extraction to the first two pages
        for page in PDFPage.get_pages(fh, caching=True, maxpages=2, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of both pages (everything the converter wrote to the buffer)
    text = fake_file_handle.getvalue()
finally:
    #Close the handles even if a page fails to process
    converter.close()
    fake_file_handle.close()

In [23]:
#Separate the first header from the body
#We'll use this to identify the county later
header = text[:517]

In [24]:
#body
body = text[517:]

In [25]:
#Remove leading and trailing whitespaces from the body text
body = body.strip()

In [26]:
header

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 06/07/2023\n                                    ZAVALA COUNTY - DISTRICT CLERK - 293RD DISTRICT COURT\n                              FILED PENDING CASES AS OF 06/05/2023 - SORTED BY FILE DATE - PAGE 1\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       \n"

In [27]:
body

"96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009  P

In [28]:
#Set up regex to remove all subsequent headers
#This regex should identify the headers even if the name of the district clerk changes later on
#The pattern starts at a newline followed by a form feed (\x0c, presumably the page break written
#between pages -- confirm), then matches the three title lines (the first and third must contain an
#MM/DD/YYYY date), a blank line, the "#" column-heading line, the second column-heading line and a blank line
#NOTE(review): inside [A-Z -'] the sequence space-hyphen-apostrophe is read as a RANGE (space through
#apostrophe), not three literal characters, so a hyphen is not matched there. The current heading line
#has no hyphen after the "#", so it still matches
body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)
body

'96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009  P

In [29]:
#Split the text on the '\n'
cases = body.split('\n')

In [30]:
#Establish a container list for the dictionaries
case_list = []
attorney_names = []
offense_list = []
st_rpt_list = []
temp_dict = {}

In [38]:
#Start from an empty list every time this cell runs, so re-running it cannot
#carry cases over from an earlier run and duplicate rows
case_list = []

#The case that indented continuation lines currently belong to (None until the first case starts)
current_case = None

for line in cases:
    #Blank or whitespace-only lines hold no data (and line[0] would fail on an empty string)
    if not line.strip():
        continue

    #A line that starts in the first column is the start of a new case
    if not line[0].isspace():
        current_case = {
            #Fixed-width columns of the case line
            'Cause Number': line[:22].strip(),
            'File Date': line[22:34].strip(),
            'Defendant': line[34:72].strip(),
            'Court': line[72:79].strip(),
            'Docket Date': line[79:89].strip(),
            'Outstanding Warrants': line[89:].strip(),
            #Filled in by the indented lines that follow this case
            'Attorney': [],
            'First Offense': [],
            'ST RPT Column': [],
        }

        #Add the case right away; the lists above keep growing as more lines are read
        case_list.append(current_case)
        continue

    #Indented line before any case has started: there is no case to attach it to
    if current_case is None:
        continue

    #Indented line: attorney, first offense and ST RPT column sit in fixed-width columns
    continuation_fields = (
        ('Attorney', line[:35]),
        ('First Offense', line[35:74]),
        ('ST RPT Column', line[74:]),
    )

    for key, raw_value in continuation_fields:
        value = raw_value.strip()

        #Only keep the columns that actually hold text on this line
        if value:
            current_case[key].append(value)

In [37]:
cases[-2]

'22-11-03963-ZCR       11/09/2022  JIMENEZ, VICTOR HUGO                  293    07/12/2023                      '

In [39]:
case_list

[{'Cause Number': '96-04-02608-ZCR',
  'File Date': '04/19/1996',
  'Defendant': 'POMPA-CONTRERAS, JESUS',
  'Court': '293',
  'Docket Date': '07/12/2023',
  'Outstanding Warrants': '',
  'Attorney': [],
  'First Offense': ['AGG ROBBERY'],
  'ST RPT Column': ['AGGRAVATED ROBBERY OR ROBBERY']},
 {'Cause Number': '07-04-03066-ZCR',
  'File Date': '04/30/2007',
  'Defendant': 'HERRERA, JOSE NICOLAS',
  'Court': '293',
  'Docket Date': '',
  'Outstanding Warrants': '',
  'Attorney': ['PADILLA, ALFREDO Z.'],
  'First Offense': ['INDECENCY W/CHILD SEXUAL CONTACT'],
  'ST RPT Column': ['INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD']},
 {'Cause Number': '09-02-03164-ZCR',
  'File Date': '03/04/2009',
  'Defendant': 'MIRABAL, JOSE LUIS',
  'Court': '293',
  'Docket Date': '12/01/2023',
  'Outstanding Warrants': '',
  'Attorney': [],
  'First Offense': ['POSS MARIJ >2,000LBS'],
  'ST RPT Column': ['DRUG POSSESSION']},
 {'Cause Number': '09-02-03170-ZCR',
  'File Date': '03/05/2009',
  'Defendant'

In [40]:
#Create dataframe
df = pd.DataFrame(case_list)

In [41]:
df

Unnamed: 0,Cause Number,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
0,96-04-02608-ZCR,04/19/1996,"POMPA-CONTRERAS, JESUS",293,07/12/2023,,[],[AGG ROBBERY],[AGGRAVATED ROBBERY OR ROBBERY]
1,07-04-03066-ZCR,04/30/2007,"HERRERA, JOSE NICOLAS",293,,,"[PADILLA, ALFREDO Z.]",[INDECENCY W/CHILD SEXUAL CONTACT],[INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD]
2,09-02-03164-ZCR,03/04/2009,"MIRABAL, JOSE LUIS",293,12/01/2023,,[],"[POSS MARIJ >2,000LBS]",[DRUG POSSESSION]
3,09-02-03170-ZCR,03/05/2009,"GONZALEZ, GISEL ALEJANDRA",293,12/01/2023,,[],[POSS MARIJ <2OZ],[ALL MISDEMEANORS]
4,03-12-02906-ZCR,12/30/2009,"PUENTE, LISA MARIE",293,07/12/2023,,[],[BURGLARY OF HABITATION],[BURGLARY]
...,...,...,...,...,...,...,...,...,...
67,22-08-03922-ZCR,08/10/2022,"MONCADA, GEOFFREY",293,07/12/2023,,"[RODRIGUEZ JR., TED]",[POSS CS PG 1 >=4G<200G],[DRUG POSSESSION]
68,22-08-03923-ZCR,08/10/2022,"MONCADA, GEOFFREY",293,07/12/2023,,[],[EVADING ARREST DET W/VEH],[OTHER FELONIES]
69,22-11-03956-ZCR,11/09/2022,"SANCHEZ, PEDRO",293,07/10/2023,,[],[EVADING ARREST DET W/VEH],[OTHER FELONIES]
70,22-11-03962-ZCR,11/09/2022,"JIMENEZ, VICTOR HUGO",293,07/12/2023,,[],[AGG ASSAULT CAUSES SERIOUS BODILY INJ],[AGGRAVATED ASSAULT OR ATTEMPTED MURDER]


__Remove MTR Case Counts__

The bottom of the report has a count of all cases and includes more info on MTR-A cases.

We don't need it, so use regex to remove it.

In [19]:
#Read in the whole document

#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #No maxpages limit here, unlike the earlier cells
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of every processed page (everything the converter wrote to the buffer)
    text = fake_file_handle.getvalue()
finally:
    #Close the handles even if a page fails to process
    converter.close()
    fake_file_handle.close()

In [20]:
#Split the first-page header (the first 517 characters) from the rest of the text
#NOTE(review): 517 is the header length observed for this report -- confirm it before reusing on other reports
header = text[:517]
body = text[517:]

In [21]:
#Remove subsequent headers using regex
body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)
body

'\n96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009 

In [22]:
#Remove leading and trailing whitespaces from the body text
body = body.strip()

In [23]:
#Now remove the closing summary with regex: the "TOTAL NUMBER OF CASES FILED: " label, the
#digits/newlines/hyphens after it, the "MTR-A" text that follows and any trailing run of hyphens
#The result goes into new_body, so body still holds the untrimmed text
new_body = re.sub(r"""\nTOTAL NUMBER OF CASES FILED: [0-9\n-]*MTR-A[A-Z\n ]*[-]*""", '', body)
new_body

'96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009  P

In [24]:
new_body = re.sub(r"""\nTOTAL NUMBER OF MTR-A FILINGS: [0-9\n-]*ALL OTHER CASES ADDED/APPEALED[\n-]*""", '', new_body)
new_body

'96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009  P

In [25]:
new_body = re.sub(r"""\nTOTAL NUMBER OF CASES ADDED/APPEALED: [0-9]*""", '', new_body)
new_body

'96-04-02608-ZCR       04/19/1996  POMPA-CONTRERAS, JESUS                293    07/12/2023                      \n                                   AGG ROBBERY                            AGGRAVATED ROBBERY OR ROBBERY                       \n07-04-03066-ZCR       04/30/2007  HERRERA, JOSE NICOLAS                 293                                    \n   PADILLA, ALFREDO Z.             INDECENCY W/CHILD SEXUAL CONTACT       INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD         \n09-02-03164-ZCR       03/04/2009  MIRABAL, JOSE LUIS                    293    12/01/2023                      \n                                   POSS MARIJ >2,000LBS                   DRUG POSSESSION                                     \n09-02-03170-ZCR       03/05/2009  GONZALEZ, GISEL ALEJANDRA             293    12/01/2023                      \n                                   POSS MARIJ <2OZ                        ALL MISDEMEANORS                                    \n03-12-02906-ZCR       12/30/2009  P

In [26]:
#Split the text on the '\n'
cases = new_body.split('\n')

In [27]:
#Establish a container list for the dictionaries
case_list = []
attorney_names = []
offense_list = []
st_rpt_list = []
temp_dict = {}

In [28]:
len(cases[0])

111

In [29]:
#Start from an empty list every time this cell runs, so re-running it cannot
#carry cases over from an earlier run and duplicate rows
case_list = []

#The case that indented continuation lines currently belong to (None until the first case starts)
current_case = None

for line in cases:
    #Blank or whitespace-only lines hold no data (and line[0] would fail on an empty string)
    if not line.strip():
        continue

    #A line that starts in the first column is the start of a new case
    if not line[0].isspace():
        current_case = {
            #Fixed-width columns of the case line
            'Cause Number': line[:22].strip(),
            'File Date': line[22:34].strip(),
            'Defendant': line[34:72].strip(),
            'Court': line[72:79].strip(),
            'Docket Date': line[79:89].strip(),
            'Outstanding Warrants': line[89:].strip(),
            #Filled in by the indented lines that follow this case
            'Attorney': [],
            'First Offense': [],
            'ST RPT Column': [],
        }

        #Add the case right away; the lists above keep growing as more lines are read
        case_list.append(current_case)
        continue

    #Indented line before any case has started: there is no case to attach it to
    if current_case is None:
        continue

    #Indented line: attorney, first offense and ST RPT column sit in fixed-width columns
    continuation_fields = (
        ('Attorney', line[:35]),
        ('First Offense', line[35:74]),
        ('ST RPT Column', line[74:]),
    )

    for key, raw_value in continuation_fields:
        value = raw_value.strip()

        #Only keep the columns that actually hold text on this line
        if value:
            current_case[key].append(value)

In [30]:
#Create dataframe
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
0,96-04-02608-ZCR,04/19/1996,"POMPA-CONTRERAS, JESUS",293,07/12/2023,,[],[AGG ROBBERY],[AGGRAVATED ROBBERY OR ROBBERY]
1,07-04-03066-ZCR,04/30/2007,"HERRERA, JOSE NICOLAS",293,,,"[PADILLA, ALFREDO Z.]",[INDECENCY W/CHILD SEXUAL CONTACT],[INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD]
2,09-02-03164-ZCR,03/04/2009,"MIRABAL, JOSE LUIS",293,12/01/2023,,[],"[POSS MARIJ >2,000LBS]",[DRUG POSSESSION]
3,09-02-03170-ZCR,03/05/2009,"GONZALEZ, GISEL ALEJANDRA",293,12/01/2023,,[],[POSS MARIJ <2OZ],[ALL MISDEMEANORS]
4,03-12-02906-ZCR,12/30/2009,"PUENTE, LISA MARIE",293,07/12/2023,,[],[BURGLARY OF HABITATION],[BURGLARY]
5,04-06-02918-ZCR,01/26/2010,"RIOS, OSVALDO",293,,,[],[BURGLARY OF BUILDING],[BURGLARY]
6,04-06-02920-ZCR,01/29/2010,"RIOS, OSVALDO",293,,,"[BAGLEY, MICHAEL J.]",[BURGLARY OF BUILDING],[BURGLARY]
7,15-03-03456-ZCR,03/25/2015,"MONTANEZ, ROSALLINDA",293,,,[],[EVADING ARREST DET W/VEH],[OTHER FELONIES]
8,16-04-03622-ZCR,04/22/2016,"JIMENEZ, VICTOR HUGO",293,08/01/2023,,"[TORRES, GREGORY D.]",[SEXUAL ASSAULT],[SEXUAL ASSAULT OF AN ADULT]
9,16-04-03623-ZCR,04/22/2016,"JIMENEZ, VICTOR HUGO",293,08/01/2023,,"[TORRES, GREGORY D.]",[AGG SEXUAL ASSAULT],[SEXUAL ASSAULT OF AN ADULT]


# Build Function

Create a function to gather the info from the entire document.

In [73]:
def extract_criminal_pdf_data(text, header_length=517):
    """
    This function takes in the entire PDF document as a string of text and returns one dataframe row per case.

    The first header_length characters are treated as the first page's header and are only used to
    identify the county. Repeated page headers and the closing "TOTAL NUMBER OF ..." sections are
    removed with regex. The remaining lines are read by fixed column position: a line that starts in
    the first column begins a new case, and the indented lines that follow add to that case's
    first offense and ST RPT column lists.

    As of 13 June 2023 names are no longer collected, so the defendant and attorney columns are skipped.

    Parameter:
        -text: A string consisting of the text of the entire PDF document.
        -header_length: Number of leading characters that make up the first page's header.
                        Defaults to 517, the value this function previously hard-coded.

    Returns:
        -df: A dataframe of the newly gathered case info with the columns County, Cause Number,
             File Date, Court, Docket Date, Outstanding Warrants, First Offense and ST RPT Column.
             The last two columns hold lists of strings. When the text contains no cases the
             dataframe has the same columns and zero rows.
    """

    #Column order of the returned dataframe (also gives an empty result the same columns)
    columns = ['County', 'Cause Number', 'File Date', 'Court', 'Docket Date',
               'Outstanding Warrants', 'First Offense', 'ST RPT Column']

    #Separate the first header from the body
    #The header is only used to identify the county
    header = text[:header_length]

    #Body, without leading and trailing whitespace
    body = text[header_length:].strip()

    #Check for county names inside the header info
    if 'MAVERICK' in header:
        county = 'Maverick'
    elif 'DIMMIT' in header:
        county = 'Dimmit'
    elif 'ZAVALA' in header:
        county = 'Zavala'
    else:
        #No known county found: the marker below ends up in the County column of every row
        county = 'Something went wrong!'

    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if the name of the district clerk changes later on
    #NOTE(review): inside [A-Z -'] the sequence space-hyphen-apostrophe is read as a range
    #(space through apostrophe), so a hyphen is not matched by that class
    body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)

    #########################################################################################################
    #Now remove the last divider sections using regex
    body = re.sub(r"""\nTOTAL NUMBER OF CASES FILED: [0-9\n-]*MTR-A[A-Z\n ]*[-]*""", '', body)

    body = re.sub(r"""\nTOTAL NUMBER OF MTR-A FILINGS: [0-9\n-]*ALL OTHER CASES ADDED/APPEALED[\n-]*""", '', body)

    #This last pattern also consumes the text after the label for as long as it is made up of the listed characters
    body = re.sub(r"""\nTOTAL NUMBER OF CASES ADDED/APPEALED: [0-9- a-zA-Z\.#;,:'=\n]*""", '', body)

    #########################################################################################################

    #One dictionary per case
    case_list = []

    #The case that indented lines currently belong to (None until the first case starts)
    current_case = None

    #Split the text on the '\n' and loop through each line
    for line in body.split('\n'):
        #Skip lines that are empty or consist of whitespace only
        if not line.strip():
            continue

        #A line that starts in the first column is the start of a new case
        if not line[0].isspace():
            current_case = {
                'County': county,
                'Cause Number': line[:22].strip(),
                'File Date': line[22:34].strip(),
                #As of 13 June 2023, we are no longer collecting names, so line[34:72] (defendant) is skipped
                'Court': line[72:79].strip(),
                'Docket Date': line[79:89].strip(),
                'Outstanding Warrants': line[89:].strip(),
                #Filled in by the indented lines that follow this case
                'First Offense': [],
                'ST RPT Column': [],
            }

            #Add the case right away; its lists keep growing as more lines are read.
            #This also means a text without any cases adds nothing to case_list
            case_list.append(current_case)
            continue

        #Indented line before any case has started: there is no case to attach it to
        if current_case is None:
            continue

        #As of 13 June 2023, we are no longer collecting names, so line[:35] (attorney) is skipped

        #Get first offense, keeping it only when the column holds text
        offense = line[35:74].strip()
        if offense:
            current_case['First Offense'].append(offense)

        #Get ST RPT Column, keeping it only when the column holds text
        st_rpt = line[74:].strip()
        if st_rpt:
            current_case['ST RPT Column'].append(st_rpt)

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Create dataframe
    df = pd.DataFrame(case_list, columns=columns)

    return df

# Test Function

In [92]:
#Set up the pdf path
#NOTE(review): this is an absolute path on one user's machine -- point it at your own copy of the report before running
pdf_path = '/Users/johnathonsmith/Downloads/June 7_Zavala Criminal Detailed.pdf'

In [93]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #No maxpages limit here, unlike the earlier single-page and two-page cells
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of every processed page (everything the converter wrote to the buffer)
    text = fake_file_handle.getvalue()
finally:
    #Close the handles even if a page fails to process
    converter.close()
    fake_file_handle.close()

In [94]:
df = extract_criminal_pdf_data(text)

Collected Data From 51 Cases.


In [95]:
df.iloc[-1]['First Offense']

['DRIVING WHILE INTOXICATED 3RD OR MORE', 'IAT']

In [96]:
df.iloc[-1]['ST RPT Column']

['FELONY DWI']

In [97]:
df

Unnamed: 0,County,Cause Number,File Date,Court,Docket Date,Outstanding Warrants,First Offense,ST RPT Column
0,Zavala,96-04-02608-ZCR,04/19/1996,293,07/12/2023,,[AGG ROBBERY],[AGGRAVATED ROBBERY OR ROBBERY]
1,Zavala,07-04-03066-ZCR,04/30/2007,293,,,[INDECENCY W/CHILD SEXUAL CONTACT],[INDECENCY WITH OR SEXUAL ASSAULT OF A CHILD]
2,Zavala,09-02-03164-ZCR,03/04/2009,293,12/01/2023,,"[POSS MARIJ >2,000LBS]",[DRUG POSSESSION]
3,Zavala,09-02-03170-ZCR,03/05/2009,293,12/01/2023,,[POSS MARIJ <2OZ],[ALL MISDEMEANORS]
4,Zavala,03-12-02906-ZCR,12/30/2009,293,07/12/2023,,[BURGLARY OF HABITATION],[BURGLARY]
5,Zavala,04-06-02918-ZCR,01/26/2010,293,,,[BURGLARY OF BUILDING],[BURGLARY]
6,Zavala,04-06-02920-ZCR,01/29/2010,293,,,[BURGLARY OF BUILDING],[BURGLARY]
7,Zavala,15-03-03456-ZCR,03/25/2015,293,,,[EVADING ARREST DET W/VEH],[OTHER FELONIES]
8,Zavala,16-04-03622-ZCR,04/22/2016,293,08/01/2023,,[SEXUAL ASSAULT],[SEXUAL ASSAULT OF AN ADULT]
9,Zavala,16-04-03623-ZCR,04/22/2016,293,08/01/2023,,[AGG SEXUAL ASSAULT],[SEXUAL ASSAULT OF AN ADULT]
