# Testing Connection to Google Sheets

In [1]:
import io
import re
from pathlib import Path

import numpy as np
import pandas as pd
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfpage import PDFPage

#Custom module
#import jsmith_acquire

# Read In OLS Case PDF

Start with just a single page (here the page at index 67, which is page 68 of the report). Break it down and gather the available info.

In [35]:
#Set up the pdf path
#Built from the current user's home directory instead of a hard-coded '/Users/<name>' prefix,
#so the notebook is not tied to one machine. Kept as a plain string.
pdf_path = str(Path.home() / 'Downloads' / 'MAV_OLS_Pending.pdf')

In [38]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

#Zero-based index of the page to read (index 67 is the page numbered 68 in the report)
target_page_index = 67

try:
    with open(pdf_path, 'rb') as fh:
        #maxpages only needs to reach the target page
        pages = PDFPage.get_pages(fh, caching=True, maxpages=target_page_index + 1, check_extractable=True)

        for page_number, page in enumerate(pages):
            #Only the target page is run through the interpreter
            if page_number == target_page_index:
                page_interpreter.process_page(page)
                #Nothing after the target page is needed
                break

    #Save the processed page's text to a variable
    text = fake_file_handle.getvalue()
finally:
    #Close the open handles, even if reading the PDF failed
    converter.close()
    fake_file_handle.close()

print(text)

                                     CRIMINAL DETAILED PENDING CASES REPORT - 09/12/2024
                                             MAVERICK - COUNTY CLERK - OLS COURT
                              FILED PENDING CASES AS OF 09/12/2024 - SORTED BY FILE DATE - PAGE 68

CAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS
   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       

36624                 08/16/2024  RIOS ROBLES, WILMARY CAROLINA         OLS                                    
   NEUBIG, EMILY                   CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   
36625                 08/16/2024  GARCES CANCHINGRE, ALEX SEBASTION     OLS    09/30/2024                      
   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   
36626       

In [39]:
#Print the repr of the text so newlines and other escape characters are visible
print(repr(text))

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 09/12/2024\n                                             MAVERICK - COUNTY CLERK - OLS COURT\n                              FILED PENDING CASES AS OF 09/12/2024 - SORTED BY FILE DATE - PAGE 68\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                       \n\n36624                 08/16/2024  RIOS ROBLES, WILMARY CAROLINA         OLS                                    \n   NEUBIG, EMILY                   CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n36625                 08/16/2024  GARCES CANCHINGRE, ALEX SEBASTION     OLS    09/30/2024                      \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n

In [40]:
#Separate the header from the body
#The header is taken to be the first 500 characters of the page text (hard-coded length)
header = text[:500]
header

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 09/12/2024\n                                             MAVERICK - COUNTY CLERK - OLS COURT\n                              FILED PENDING CASES AS OF 09/12/2024 - SORTED BY FILE DATE - PAGE 68\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                               "

In [41]:
#body
#Everything after the first 500 characters, i.e. the part not assigned to the header
body = text[500:]
body

'        \n\n36624                 08/16/2024  RIOS ROBLES, WILMARY CAROLINA         OLS                                    \n   NEUBIG, EMILY                   CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n36625                 08/16/2024  GARCES CANCHINGRE, ALEX SEBASTION     OLS    09/30/2024                      \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n36626                 08/16/2024  MEDINA BETANCOURT, JUNIOR ALEXANDER   OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n36627                 08/16/2024  DA COSTA, FELIPE DAVIDSON             OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n36628                 0

In [42]:
#Remove leading and trailing whitespace
#(the slice at character 500 leaves padding spaces and blank lines at the start of the body)
body = body.strip()

In [43]:
#Try splitting the body up on the new lines
#Each element is one line of the report, not necessarily one whole case
cases = body.split('\n')

In [44]:
#How many lines on this page?
#(in the detailed report there are more new lines than cases, so this is higher than the case count)
len(cases)

38

In [45]:
#Display the individual lines of the page
cases

['36624                 08/16/2024  RIOS ROBLES, WILMARY CAROLINA         OLS                                    ',
 '   NEUBIG, EMILY                   CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   ',
 '36625                 08/16/2024  GARCES CANCHINGRE, ALEX SEBASTION     OLS    09/30/2024                      ',
 '   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   ',
 '36626                 08/16/2024  MEDINA BETANCOURT, JUNIOR ALEXANDER   OLS                                    ',
 '   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   ',
 '36627                 08/16/2024  DA COSTA, FELIPE DAVIDSON             OLS                                    ',
 '   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   ',
 '36628     

In [47]:
#Just work with a single line for now (index 8 holds the first line of a case)
case = cases[8]
case

'36628                 08/16/2024  QUIROZ BAUTISTA, YULISA AIDEE         OLS    09/30/2024                      '

# Gather Info For a Single Case

__Cause Number__

In [48]:
#Gather the cause number (first 22 characters of the case line, padding stripped)
cause_num = case[:22].strip()
cause_num

'36628'

__File Date__

In [49]:
#Gather the file date (characters 22 up to, not including, 34)
file_date = case[22:34].strip()
file_date

'08/16/2024'

__Defendant Name__

In [50]:
#Gather the defendant name (characters 34 up to, not including, 72)
defendant_name = case[34:72].strip()
defendant_name

'QUIROZ BAUTISTA, YULISA AIDEE'

__Court__

In [52]:
#Gather the court (characters 72 up to, not including, 79)
court = case[72:79].strip()
court

'OLS'

__Docket Date__

In [53]:
#Gather the docket date (characters 79 up to, not including, 89); empty when the case has none
docket_date = case[79:89].strip()
docket_date

'09/30/2024'

__Outstanding Warrants__

In [54]:
#Everything after the docket date (character 89 to the end of the line)
warrants = case[89:].strip()
warrants

''

__Attorney__

In [56]:
#The following information is found on subsequent lines due to the way the report is formatted.
#Use the indented line at index 9 (it has an attorney and an offense; its ST RPT column is empty)
case = cases[9]
case

'   CHAVEZ, JONATHAN                EVADE ARR DET W/VEH/WTRCFT W/PREV                                                          '

In [57]:
#Gather the attorney name (first 35 characters of the indented line, padding stripped)
attorney = case[:35].strip()
attorney

'CHAVEZ, JONATHAN'

__First Offense__

In [58]:
#Gather the first offense (characters 35 up to, not including, 74)
offense = case[35:74].strip()
offense

'EVADE ARR DET W/VEH/WTRCFT W/PREV'

__ST RPT Column__

In [59]:
#Gather the ST RPT column (character 74 to the end of the line); empty for this line
st_rpt = case[74:].strip()
st_rpt

''

# Load The First Two Pages

Using the first two pages, set up a loop to gather all the info for each case and create a dataframe.

In [60]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #maxpages=2 limits the read to the first two pages
        for page in PDFPage.get_pages(fh, caching=True, maxpages=2, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of all processed pages to a variable
    text = fake_file_handle.getvalue()
finally:
    #Close the open handles, even if reading the PDF failed
    converter.close()
    fake_file_handle.close()

In [61]:
#Separate the first header from the body (the first 500 characters, hard-coded length)
#We'll use this to identify the county later
header = text[:500]

In [62]:
#body: everything after the first 500 characters
body = text[500:]

In [63]:
#Remove leading and trailing whitespaces from the body text
#so that the text starts directly with the first case line
body = body.strip()

In [64]:
#Inspect the first page's header
header

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 09/12/2024\n                                             MAVERICK - COUNTY CLERK - OLS COURT\n                              FILED PENDING CASES AS OF 09/12/2024 - SORTED BY FILE DATE - PAGE 1\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                "

In [65]:
#Inspect the body text
body

"30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [66]:
#Set up regex to remove all subsequent headers
#This regex should identify the headers even if the name of the district clerk changes later on
#Pattern, piece by piece:
#  \n\x0c                               a newline followed by a form feed character
#  \s*[A-Z -]*dd/dd/dddd\n              a line of capitals/spaces/hyphens that ends in a date
#  \s*\w{6,8}[A-Z0-9 -]*\n              a line that starts with 6 to 8 word characters (e.g. 'MAVERICK')
#  \s*[A-Z ]*dd/dd/dddd[A-Z0-9 -]*\n\n  a line containing a date, followed by a blank line
#  [A-Z ]*#\s*[A-Z -']*\n               the column title line that contains '#'
#  [A-Z0-9/ -]*\n\n                     the second column title line, followed by a blank line
#NOTE(review): inside [A-Z -'] the part " -'" is a character range from space to apostrophe,
#not a literal hyphen. It still covers the spaces and the apostrophe seen in the header shown above.
#Every match is replaced with an empty string
body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)
body

'30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [67]:
#Split the text on the '\n'
#Each element is one line of the report; a case is spread over more than one line
cases = body.split('\n')

In [68]:
#Fresh, empty containers: one list for the finished case dictionaries, three lists for the
#attorney / offense / ST RPT values of the case being read, and a dict for that case
case_list, attorney_names, offense_list, st_rpt_list = [], [], [], []
temp_dict = dict()

In [69]:
#Start from empty containers so that re-running this cell never duplicates cases
case_list = []
attorney_names = []
offense_list = []
st_rpt_list = []
temp_dict = {}

for line in cases:
    #Skip empty or whitespace-only lines: they carry no data and
    #line[0] would raise an IndexError on an empty string
    if not line.strip():
        continue

    #A line that starts with a non-whitespace character is the first line of a new case
    if not line[0].isspace():
        #Store the previous case (if there is one) before starting the new one
        if temp_dict:
            case_list.append(temp_dict)

        #Fresh lists for the multi-valued columns of the new case
        attorney_names = []
        offense_list = []
        st_rpt_list = []

        #Slice the fixed-width columns out of the case line
        temp_dict = {
            'Cause Number': line[:22].strip(),
            'File Date': line[22:34].strip(),
            'Defendant': line[34:72].strip(),
            'Court': line[72:79].strip(),
            'Docket Date': line[79:89].strip(),
            'Outstanding Warrants': line[89:].strip(),
            #These lists are filled in place by the indented lines that follow
            'Attorney': attorney_names,
            'First Offense': offense_list,
            'ST RPT Column': st_rpt_list,
        }

    else:
        #Indented line: attorney, first offense and ST RPT column of the current case
        attorney_name = line[:35].strip()
        offense = line[35:74].strip()
        st_rpt = line[74:].strip()

        #Only keep the values that are not empty after stripping
        if attorney_name:
            attorney_names.append(attorney_name)
        if offense:
            offense_list.append(offense)
        if st_rpt:
            st_rpt_list.append(st_rpt)

#The last case has no following case line to trigger the append, so add it here
#(skipped when no case was found at all)
if temp_dict:
    case_list.append(temp_dict)

In [72]:
#Display the first line of the split text
cases[0]

'30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    '

In [71]:
#Display the list of parsed case dictionaries
case_list

[{'Cause Number': '30778',
  'File Date': '03/28/2022',
  'Defendant': 'HERNANDEZ-GARCIA, EDUARDO',
  'Court': 'OLS',
  'Docket Date': '',
  'Outstanding Warrants': '',
  'Attorney': ['NEIGHBORHOOD DEFENDER SERVICES'],
  'First Offense': ['CRIMINAL TRESPASS'],
  'ST RPT Column': ['OTHER MISDEMEANOR']},
 {'Cause Number': '30868',
  'File Date': '04/25/2022',
  'Defendant': 'AGUILLEN-ARTEAGA, JOSE PEDRO',
  'Court': 'OLS',
  'Docket Date': '',
  'Outstanding Warrants': '',
  'Attorney': ['WINFREY, JACKSON'],
  'First Offense': ['CRIMINAL TRESPASS'],
  'ST RPT Column': ['OTHER MISDEMEANOR']},
 {'Cause Number': '30902',
  'File Date': '05/02/2022',
  'Defendant': 'GONZALEZ-RAMIREZ, JOSUE',
  'Court': 'OLS',
  'Docket Date': '',
  'Outstanding Warrants': '',
  'Attorney': ['EHRENBERG, BRIAN'],
  'First Offense': ['CRIMINAL TRESPASS'],
  'ST RPT Column': ['OTHER MISDEMEANOR']},
 {'Cause Number': '30906',
  'File Date': '05/05/2022',
  'Defendant': 'MOLINA VALENCIA, JUAN ANTONIO',
  'Court': 

In [73]:
#Create dataframe from the list of case dictionaries (one row per dictionary)
df = pd.DataFrame(case_list)

In [74]:
#Display the dataframe
df

Unnamed: 0,Cause Number,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
0,30778,03/28/2022,"HERNANDEZ-GARCIA, EDUARDO",OLS,,,[NEIGHBORHOOD DEFENDER SERVICES],[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1,30868,04/25/2022,"AGUILLEN-ARTEAGA, JOSE PEDRO",OLS,,,"[WINFREY, JACKSON]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
2,30902,05/02/2022,"GONZALEZ-RAMIREZ, JOSUE",OLS,,,"[EHRENBERG, BRIAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
3,30906,05/05/2022,"MOLINA VALENCIA, JUAN ANTONIO",OLS,,,"[TEXAS RIO GRANDE LEGAL AID,]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
4,30931,05/12/2022,"TORRES GALVAN, JOSE GUADALUPE",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
5,30932,05/12/2022,"DE LA CRUZ PALACIOS, FRANCISCO",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
6,30933,05/12/2022,"ALCUDIA VENTURA, EUGENIO",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
7,30944,05/12/2022,"TADEO GOMEZ, GUILLERMO",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
8,30945,05/12/2022,"CAC POP, LUIS FERNANDO",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
9,30946,05/12/2022,"RIVERA NUNEZ, JOSE MEDARDO",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]


__Remove MTR Case Counts__

The bottom of the report has a count of all cases and includes more info on MTR-A cases.

We don't need it, so use regex to remove it.

In [75]:
#Read in the whole document

#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #No maxpages argument is passed here, unlike the earlier reads
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            #Process the current page
            page_interpreter.process_page(page)

    #Save the text of all processed pages to a variable
    text = fake_file_handle.getvalue()
finally:
    #Close the open handles, even if reading the PDF failed
    converter.close()
    fake_file_handle.close()

In [76]:
#Separate the first header from the body
#The first 500 characters (hard-coded length) are the header, the rest is the body
header = text[:500]
body = text[500:]

In [77]:
#Inspect the first page's header
header

"                                     CRIMINAL DETAILED PENDING CASES REPORT - 09/12/2024\n                                             MAVERICK - COUNTY CLERK - OLS COURT\n                              FILED PENDING CASES AS OF 09/12/2024 - SORTED BY FILE DATE - PAGE 1\n\nCAUSE #                  FILE DT  DEFENDANT'S NAME                      COURT   DOCKET DT  OUTSTANDING WARRANTS\n   ACTIVE ATTORNEY/S               1ST OFFENSE                            ST RPT COLUMN                                "

In [78]:
#Remove subsequent headers using regex
#A match starts at a newline followed by a form feed character and ends after the blank line
#that follows the two column title lines; every match is replaced with an empty string
#NOTE(review): inside [A-Z -'] the part " -'" is a character range from space to apostrophe,
#not a literal hyphen
body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)
body

"       \n\n30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05

In [79]:
#Remove leading and trailing whitespaces from the body text
#so that the text starts directly with the first case line
body = body.strip()

In [80]:
#Now try removing the last section with regex
#Removes the text from '\nTOTAL NUMBER OF CASES FILED: ' through the 'MTR-A' heading,
#including the capital letters, spaces, newlines and dashes that directly follow it
new_body = re.sub(r"""\nTOTAL NUMBER OF CASES FILED: [0-9\n-]*MTR-A[A-Z\n ]*[-]*""", '', body)
new_body

"30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [81]:
#Removes the text from '\nTOTAL NUMBER OF MTR-A FILINGS: ' through the
#'ALL OTHER CASES ADDED/APPEALED' heading and the newlines/dashes that directly follow it
new_body = re.sub(r"""\nTOTAL NUMBER OF MTR-A FILINGS: [0-9\n-]*ALL OTHER CASES ADDED/APPEALED[\n-]*""", '', new_body)
new_body

"30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [82]:
#Removes the '\nTOTAL NUMBER OF CASES ADDED/APPEALED: ' label and the digits that directly follow it
new_body = re.sub(r"""\nTOTAL NUMBER OF CASES ADDED/APPEALED: [0-9]*""", '', new_body)
new_body

"30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [92]:
#Removes the '\nTHE FOLLOWING IS A LIST OF ERRORS FOUND:' line together with the run of
#digits, capital letters, newlines, spaces and the listed punctuation characters that follows it
new_body = re.sub(r"""\nTHE FOLLOWING IS A LIST OF ERRORS FOUND:\n [0-9A-Z\n\-#=;'\., ]*""", '', new_body)
new_body

"30778                 03/28/2022  HERNANDEZ-GARCIA, EDUARDO             OLS                                    \n   NEIGHBORHOOD DEFENDER SERVICES  CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30868                 04/25/2022  AGUILLEN-ARTEAGA, JOSE PEDRO          OLS                                    \n   WINFREY, JACKSON                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30902                 05/02/2022  GONZALEZ-RAMIREZ, JOSUE               OLS                                    \n   EHRENBERG, BRIAN                CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30906                 05/05/2022  MOLINA VALENCIA, JUAN ANTONIO         OLS                                    \n   TEXAS RIO GRANDE LEGAL AID,     CRIMINAL TRESPASS                      OTHER MISDEMEANOR                                   \n30931                 05/12/2022  T

In [93]:
#Split the text on the '\n'
#Each element is one line of the report; a case is spread over more than one line
cases = new_body.split('\n')

In [94]:
#Fresh, empty containers: one list for the finished case dictionaries, three lists for the
#attorney / offense / ST RPT values of the case being read, and a dict for that case
case_list, attorney_names, offense_list, st_rpt_list = [], [], [], []
temp_dict = dict()

In [95]:
#Length (in characters) of the first line
len(cases[0])

111

In [96]:
#Drop the lines that are empty or contain nothing but whitespace
cases = [line for line in cases if line.strip()]

In [97]:
#Start from empty containers so that re-running this cell never duplicates cases
case_list = []
attorney_names = []
offense_list = []
st_rpt_list = []
temp_dict = {}

for line in cases:
    #Skip empty or whitespace-only lines: they carry no data and
    #line[0] would raise an IndexError on an empty string
    if not line.strip():
        continue

    #A line that starts with a non-whitespace character is the first line of a new case
    if not line[0].isspace():
        #Store the previous case (if there is one) before starting the new one
        if temp_dict:
            case_list.append(temp_dict)

        #Fresh lists for the multi-valued columns of the new case
        attorney_names = []
        offense_list = []
        st_rpt_list = []

        #Slice the fixed-width columns out of the case line
        temp_dict = {
            'Cause Number': line[:22].strip(),
            'File Date': line[22:34].strip(),
            'Defendant': line[34:72].strip(),
            'Court': line[72:79].strip(),
            'Docket Date': line[79:89].strip(),
            'Outstanding Warrants': line[89:].strip(),
            #These lists are filled in place by the indented lines that follow
            'Attorney': attorney_names,
            'First Offense': offense_list,
            'ST RPT Column': st_rpt_list,
        }

    else:
        #Indented line: attorney, first offense and ST RPT column of the current case
        attorney_name = line[:35].strip()
        offense = line[35:74].strip()
        st_rpt = line[74:].strip()

        #Only keep the values that are not empty after stripping
        if attorney_name:
            attorney_names.append(attorney_name)
        if offense:
            offense_list.append(offense)
        if st_rpt:
            st_rpt_list.append(st_rpt)

#The last case has no following case line to trigger the append, so add it here
#(skipped when no case was found at all)
if temp_dict:
    case_list.append(temp_dict)

In [98]:
#Create dataframe from the list of case dictionaries (one row per dictionary), then display it
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
0,30778,03/28/2022,"HERNANDEZ-GARCIA, EDUARDO",OLS,,,[NEIGHBORHOOD DEFENDER SERVICES],[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1,30868,04/25/2022,"AGUILLEN-ARTEAGA, JOSE PEDRO",OLS,,,"[WINFREY, JACKSON]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
2,30902,05/02/2022,"GONZALEZ-RAMIREZ, JOSUE",OLS,,,"[EHRENBERG, BRIAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
3,30906,05/05/2022,"MOLINA VALENCIA, JUAN ANTONIO",OLS,,,"[TEXAS RIO GRANDE LEGAL AID,]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
4,30931,05/12/2022,"TORRES GALVAN, JOSE GUADALUPE",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
...,...,...,...,...,...,...,...,...,...
1306,36671,09/11/2024,"BENITEZ AGUILAR, JOSE ALEJANDRO",OLS,,,"[BUTTS, SONYA J.]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1307,36672,09/11/2024,"MONTERROS, RANDY ARIEL",OLS,,,"[CARABALLO, BARBARA]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1308,36673,09/11/2024,"PAIVA, RAFAEL",OLS,,,"[CHAVEZ, JONATHAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1309,36674,09/11/2024,"DE SANTANA, GEMERSON",OLS,,,"[CHAVEZ, JONATHAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]


# Build Function

Create a function to gather the info from the entire document.

In [18]:
def extract_criminal_ols_pdf_data(text):
    """
    Parse the text of a criminal OLS pending cases report into a dataframe.

    The first 500 characters of the text are treated as the first page header and are only used to
    identify the county. In the remaining text, the repeated page headers and the closing divider
    sections are removed with regex, and what is left is read line by line. A line that starts with
    a non-whitespace character begins a new case. A line that starts with whitespace is a detail
    line that adds attorney, offense, and ST RPT values to the case that is currently open.

    Parameter:
        -text: A string consisting of the text of the entire PDF document.

    Returns:
        -df: A dataframe with one row per case. The 'Attorney', 'First Offense', and 'ST RPT Column'
             columns hold lists of strings. If no case lines are found, the dataframe has no rows
             but still has every column.
    """

    #Column order of the returned dataframe
    #Passing it to the dataframe explicitly keeps the columns present even when no cases are found
    df_columns = [
        'County', 'Cause Number', 'Case Type', 'Status', 'File Date', 'Defendant',
        'Court', 'Docket Date', 'Outstanding Warrants', 'Attorney', 'First Offense', 'ST RPT Column',
    ]

    #Fixed-width character positions of each field on a case line
    cause_number_cols = slice(0, 22)
    file_date_cols = slice(22, 34)
    defendant_cols = slice(34, 72)
    court_cols = slice(72, 79)
    docket_date_cols = slice(79, 89)
    warrants_cols = slice(89, None)

    #Fixed-width character positions of each field on a detail line
    #Each entry pairs the dataframe column with the slice it is read from
    detail_fields = [
        ('Attorney', slice(0, 35)),
        ('First Offense', slice(35, 74)),
        ('ST RPT Column', slice(74, None)),
    ]

    #Separate the first header from the body
    #The header is only used to identify the county
    header = text[:500]

    #Body, without leading and trailing whitespace
    body = text[500:].strip()

    #Check for county names inside the header info
    if 'MAVERICK' in header:
        county = 'Maverick'
    elif 'DIMMIT' in header:
        county = 'Dimmit'
    elif 'ZAVALA' in header:
        county = 'Zavala'
    else:
        #No known county found. The placeholder is kept so the problem is visible in the output
        county = 'Something went wrong!'

    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if the name of the district clerk changes later on
    #NOTE(review): inside [A-Z -'] the " -'" part is a range from space to apostrophe, not three
    #separate characters. It matches the current headers; confirm before changing it.
    body = re.sub(r"""\n\x0c\s*[A-Z -]*\d{2}/\d{2}/\d{4}\n\s*\w{6,8}[A-Z0-9 -]*\n\s*[A-Z ]*\d{2}/\d{2}/\d{4}[A-Z0-9 -]*\n\n[A-Z ]*#\s*[A-Z -']*\n[A-Z0-9/ -]*\n\n""", '', body)

    #########################################################################################################
    #Now remove the last divider sections using regex
    #The patterns are applied in this order, each one on the result of the previous one
    divider_patterns = (
        r"""\nTOTAL NUMBER OF CASES FILED: [0-9\n-]*MTR-A[A-Z\n ]*[-]*""",
        r"""\nTOTAL NUMBER OF MTR-A FILINGS: [0-9\n-]*ALL OTHER CASES ADDED/APPEALED[\n-]*""",
        r"""\nTOTAL NUMBER OF CASES ADDED/APPEALED: [0-9- a-zA-Z\.#;,:'=\n]*""",
        r"""\nTHE FOLLOWING IS A LIST OF ERRORS FOUND:\n [0-9A-Z\n\-#=;'\., ]*""",
    )

    for divider_pattern in divider_patterns:
        body = re.sub(divider_pattern, '', body)

    #########################################################################################################

    #Split the text on the '\n' and drop lines that are empty or consist of whitespace only
    lines = [line for line in body.split('\n') if line.strip()]

    #Loop through each line and build one dict per case
    case_list = []
    current_case = None

    for line in lines:
        #A line that does not start with whitespace is the start of a new case
        if not line[0].isspace():
            #NOTE(review): an earlier comment said names are no longer collected as of 13 June 2023,
            #but the defendant name is still read here. Confirm whether it should be dropped.
            current_case = {
                'County': county,
                'Cause Number': line[cause_number_cols].strip(),
                'Case Type': 'Criminal',
                'Status': 'Open',
                'File Date': line[file_date_cols].strip(),
                'Defendant': line[defendant_cols].strip(),
                'Court': line[court_cols].strip(),
                'Docket Date': line[docket_date_cols].strip(),
                'Outstanding Warrants': line[warrants_cols].strip(),
                'Attorney': [],
                'First Offense': [],
                'ST RPT Column': [],
            }

            #The dict is added right away. Detail lines that follow append to its lists in place,
            #so the last case never needs a separate step to be saved
            case_list.append(current_case)
            continue

        #A detail line that shows up before any case line has no case to belong to, so skip it
        if current_case is None:
            continue

        #Add every non-empty detail field to the lists of the current case
        for column_name, field_cols in detail_fields:
            value = line[field_cols].strip()
            if value:
                current_case[column_name].append(value)

    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')

    #Create dataframe
    df = pd.DataFrame(case_list, columns=df_columns)

    return df

# Test Function

In [19]:
#Set up the pdf path
#'~' is expanded to the home directory of whoever runs the notebook,
#so the path is not tied to one user's machine
import os

pdf_path = os.path.expanduser('~/Downloads/MAV_OLS_Pending.pdf')

In [20]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#In-memory text buffer that the converter writes the extracted text into
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

try:
    with open(pdf_path, 'rb') as fh:
        #Process every page of the document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    #Save the text of the whole document to a variable
    #This has to happen before the buffer is closed
    text = fake_file_handle.getvalue()
finally:
    #Close open handles, even if a page failed to parse
    converter.close()
    fake_file_handle.close()

In [21]:
#Parse the text of the whole document into a dataframe with one row per case
df = extract_criminal_ols_pdf_data(text)

Collected Data From 1311 Cases.


In [22]:
#Preview the parsed cases
df

Unnamed: 0,County,Cause Number,Case Type,Status,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
0,Maverick,30778,Criminal,Open,03/28/2022,"HERNANDEZ-GARCIA, EDUARDO",OLS,,,[NEIGHBORHOOD DEFENDER SERVICES],[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1,Maverick,30868,Criminal,Open,04/25/2022,"AGUILLEN-ARTEAGA, JOSE PEDRO",OLS,,,"[WINFREY, JACKSON]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
2,Maverick,30902,Criminal,Open,05/02/2022,"GONZALEZ-RAMIREZ, JOSUE",OLS,,,"[EHRENBERG, BRIAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
3,Maverick,30906,Criminal,Open,05/05/2022,"MOLINA VALENCIA, JUAN ANTONIO",OLS,,,"[TEXAS RIO GRANDE LEGAL AID,]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
4,Maverick,30931,Criminal,Open,05/12/2022,"TORRES GALVAN, JOSE GUADALUPE",OLS,,,"[STARLING, KATHY]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
...,...,...,...,...,...,...,...,...,...,...,...,...
1306,Maverick,36671,Criminal,Open,09/11/2024,"BENITEZ AGUILAR, JOSE ALEJANDRO",OLS,,,"[BUTTS, SONYA J.]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1307,Maverick,36672,Criminal,Open,09/11/2024,"MONTERROS, RANDY ARIEL",OLS,,,"[CARABALLO, BARBARA]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1308,Maverick,36673,Criminal,Open,09/11/2024,"PAIVA, RAFAEL",OLS,,,"[CHAVEZ, JONATHAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]
1309,Maverick,36674,Criminal,Open,09/11/2024,"DE SANTANA, GEMERSON",OLS,,,"[CHAVEZ, JONATHAN]",[CRIMINAL TRESPASS],[OTHER MISDEMEANOR]


__Convert Lists To Strings__

In [23]:
def convert_name_list_to_string(name_list):
    """
    This function takes in a list of strings from one cell of a dataframe, such as the attorney names or
    offenses of a case. It will join the entries in the list with a new line character. This is necessary to
    upload the dataframe to a google sheet.

    Parameter:
        - name_list: The list of strings to join. A value that is already a string is returned unchanged,
                     so running the conversion again on a converted column does not alter it.

    Returns:
        - string: A single string consisting of all the entries in the list joined by new lines.
    """

    #A string is itself iterable, so joining it would put a new line between every character
    if isinstance(name_list, str):
        return name_list

    name_string = '\n'.join(name_list)

    return name_string

In [24]:
#Flatten every list column into a single new line separated string
for list_column in ['First Offense', 'ST RPT Column', 'Attorney']:
    df[list_column] = df[list_column].map(convert_name_list_to_string)

In [25]:
#Spot check a single case by its cause number
is_selected_case = df['Cause Number'] == '36649'
df.loc[is_selected_case]

Unnamed: 0,County,Cause Number,Case Type,Status,File Date,Defendant,Court,Docket Date,Outstanding Warrants,Attorney,First Offense,ST RPT Column
1284,Maverick,36649,Criminal,Open,08/30/2024,"MOHENO SANCHEZ, YANS",OLS,,,"MARES, ALFREDO",CRIM TRESPASS\nHABIT/SHLTR/SUPRFUND/INFSTRT,OTHER MISDEMEANOR


# Send Data To Google Sheets

In [26]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [27]:
#Set up credentials
gc = gspread.service_account(filename='credentials.json')

In [28]:
#Open Google Sheet By Name
gsheet = gc.open("Pending Reports")

In [29]:
#OLS cases go to the 'OLS Cases' tab
ols_sheet = gsheet.worksheet('OLS Cases')

In [30]:
#Clear what's currently on the sheet
ols_sheet.clear()

{'spreadsheetId': '1b3fmZrbfwZWMvu4kUGJSSGsp61utlE0Ny-ebozZ5aBk',
 'clearedRange': "'OLS Cases'!A1:Z2312"}

In [31]:
#Upload the data: the column names go in the first row, followed by one row per case
ols_sheet.update([df.columns.values.tolist()] + df.values.tolist())

{'spreadsheetId': '1b3fmZrbfwZWMvu4kUGJSSGsp61utlE0Ny-ebozZ5aBk',
 'updatedRange': "'OLS Cases'!A1:L1312",
 'updatedRows': 1312,
 'updatedColumns': 12,
 'updatedCells': 15744}