In [1]:
import numpy as np
import pandas as pd
import re
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

In [4]:
#Set up the pdf path
pdf_path = '/Users/johnathonsmith/Downloads/Juvenile_Case_Reports/MJU as of 8 24 23.pdf'

In [5]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#Used to display text
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open(pdf_path, 'rb') as fh:

    for page_num, page in enumerate(PDFPage.get_pages(fh, caching=True, check_extractable=True)):
        #Process the current page
        page_interpreter.process_page(page)

    #Save the current page's text to a variable
    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

#print(text)

In [6]:
text

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                                                \n\n453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS           

# Isolate Header

In [7]:
header = text[:420]
header

'                         COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)                                                                    '

# Isolate Body

In [9]:
body = text[420:]
body

'                            \n\n453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT     

In [10]:
#Remove whitespace surrounding the body and header
header = header.strip()
body = body.strip()

In [11]:
header

'COURT COORDINATOR - JUVENILE CASE FILINGS\n              FILE DATES: ON OR AFTER 08/24/2023 - COURT: 293RD DISTRICT COURT\n              FILING COUNTY: MAVERICK - FILING OFFICE: DISTRICT CLERK - PAGE 1\n\nKEY              CAUSE #   FILE DATE  COURT                           RESPONDENT                              \n     OFFENSE(S)'

In [12]:
body

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            PEDRO MARTINEZ           

# Remove Subsequent Headers

In [19]:
#Set up regex to remove all subsequent headers
#This regex should identify the headers even if the name of the district clerk changes later on
#Since all current juvenile case reports are only one page, make it work on the header and then add in
# a page break at the beginning
new_body = re.sub(r"""\n\x0c\s*[A-Z0-9 \n/#\:-]*\(S\)\s*\n\n""", '', body)
new_body

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   \n454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       \n455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 \n456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                \n457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                \n458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  \n459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         \n2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  \n3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            PEDRO MARTINEZ           

# Split Into Individual Cases

In [20]:
cases = new_body.split('\n')
cases

['453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   ',
 '454      19-02-00556-MJU  02/07/2019  293RD DISTRICT COURT            RODRIGUEZ, ANTHONY                       ',
 '455      19-03-00558-MJU  03/20/2019  293RD DISTRICT COURT            CASTILLO                                 ',
 '456      19-04-00560-MJU  04/08/2019  293RD DISTRICT COURT            RODRIGUEZ                                ',
 '457      19-07-00562-MJU  07/09/2019  293RD DISTRICT COURT            CONTRERAS                                ',
 '458      19-08-00564-MJU  08/29/2019  293RD DISTRICT COURT            SANCHEZ                                  ',
 '459      20-01-00566-MJU  01/09/2020  293RD DISTRICT COURT            JIMENEZ, BRAULIO                         ',
 '2        20-02-00568-MJU  02/26/2020  293RD DISTRICT COURT            JUSTIN ANDREW RODRIGUEZ                  ',
 '3        20-05-00570-MJU  05/14/2020  293RD DISTRICT COURT            

# Drop the Last Case

It's just the total case count

In [21]:
cases[-1]

'TOTAL JUVENILE CASES LISTED: 24'

In [22]:
cases.pop()
cases[-1]

''

# Find the Data

In [23]:
#Only work with the first case for now
case = cases[0]
case

'453      19-01-00554-MJU  01/10/2019  293RD DISTRICT COURT            GUERRA                                   '

__Key__

In [27]:
key = case[:9].strip()
key

'453'

__Cause Number__

In [32]:
cause_number = case[9:26].strip()
cause_number

'19-01-00554-MJU'

__File Date__

In [36]:
file_date = case[26:38].strip()
file_date

'01/10/2019'

__Court__

In [44]:
court = case[38:70].strip()
court

'293RD DISTRICT COURT'

__Respondent__

In [46]:
respondent = case[70:].strip()
respondent

'GUERRA'

# Build a Dataframe

In [47]:
#Establish a container list for the dictionaries
case_list = []
offense_list = []
temp_dict = {}

In [52]:
for line in cases:
    if line.isspace() or len(line) == 0:
        continue
    
    #Check if line is the start of a new case
    if not line[0].isspace():
        #Check if the temp_dict is empty.
        #If not, add temp_dict data to case_list
        if bool(temp_dict) == True:
            #Add list info to temp_dict
            temp_dict['Offense'] = offense_list
            
            #Add temp dict data to case_list
            case_list.append(temp_dict)
        
        #Reset temp_dict
        temp_dict = {}
        
        #Reset lists
        offense_list = []
        
        #Gather the cause number
        temp_dict['Cause Number'] = line[9:26].strip()
        
        #Gather the file date
        temp_dict['File Date'] = line[26:38].strip()
        
        #Get court
        #temp_dict['Court'] = line[38:70].strip()
        
        #Get respondent
        #temp_dict['Respondent'] = line[70:].strip()
        
        #End of line, so move to next one
    
    else:
        #Get offenses
        offense = line.strip()
        
        #Check if offense is all whitesapace. If not, strip it and add to list
        #Also check that the string is not empty
        if offense.isspace() == False and len(offense) > 0:
            offense_list.append(offense.strip())
        
        #End of line
        
#Check that the last case was added to the list
#If not, add it
#Add list info to temp_dict
temp_dict['Offense'] = offense_list

#Add temp dict data to case_list
case_list.append(temp_dict)

In [54]:
len(case_list)

24

In [55]:
case_list

[{'Cause Number': '19-01-00554-MJU', 'File Date': '01/10/2019', 'Offense': []},
 {'Cause Number': '19-02-00556-MJU', 'File Date': '02/07/2019', 'Offense': []},
 {'Cause Number': '19-03-00558-MJU', 'File Date': '03/20/2019', 'Offense': []},
 {'Cause Number': '19-04-00560-MJU', 'File Date': '04/08/2019', 'Offense': []},
 {'Cause Number': '19-07-00562-MJU', 'File Date': '07/09/2019', 'Offense': []},
 {'Cause Number': '19-08-00564-MJU', 'File Date': '08/29/2019', 'Offense': []},
 {'Cause Number': '20-01-00566-MJU', 'File Date': '01/09/2020', 'Offense': []},
 {'Cause Number': '20-02-00568-MJU', 'File Date': '02/26/2020', 'Offense': []},
 {'Cause Number': '20-05-00570-MJU', 'File Date': '05/14/2020', 'Offense': []},
 {'Cause Number': '21-05-00572-MJU', 'File Date': '05/12/2021', 'Offense': []},
 {'Cause Number': '21-09-00574-MJU', 'File Date': '09/24/2021', 'Offense': []},
 {'Cause Number': '21-12-00576-MJU', 'File Date': '12/07/2021', 'Offense': []},
 {'Cause Number': '22-03-00578-MJU',
  '

In [56]:
df = pd.DataFrame(case_list)
df

Unnamed: 0,Cause Number,File Date,Offense
0,19-01-00554-MJU,01/10/2019,[]
1,19-02-00556-MJU,02/07/2019,[]
2,19-03-00558-MJU,03/20/2019,[]
3,19-04-00560-MJU,04/08/2019,[]
4,19-07-00562-MJU,07/09/2019,[]
5,19-08-00564-MJU,08/29/2019,[]
6,20-01-00566-MJU,01/09/2020,[]
7,20-02-00568-MJU,02/26/2020,[]
8,20-05-00570-MJU,05/14/2020,[]
9,21-05-00572-MJU,05/12/2021,[]


# Build the Function

In [62]:
def extract_juvenile_cases(text):
    """
    This function takes in the entire PDF document as a string of text. It will gather the info for each case
    and add the info to a dictionary. The dictionary for each case will be added to a list which will be turned into
    a dataframe.
    
    Parameter:
        -text: A string consisting of the text of the entire juvenile cases PDF document.
        
    Returns:
        -df: A dataframe of the newly gathered juvenile case info
    """
    
    #Initialize containers
    #Establish a container list for the dictionaries
    case_list = []
    offense_list = []
    temp_dict = {}
    
    #Get the header and remove surrounding whitespace
    header = text[:420].strip()

    #Get the body and remove surrounding whitespace
    body = text[420:].strip()
    
    #Get the 'AS OF' date:
    report_as_of_date = re.findall(r"[0-9]{2}/[0-9]{2}/[0-9]{4}", header)[0]
    
    #Use if statement to check for county names inside the header info
    if header.count('MAVERICK') >= 1:
        county = 'Maverick'
    elif header.count('DIMMIT') >= 1:
        county = 'Dimmit'
    elif header.count('ZAVALA') >= 1:
        county = 'Zavala'
    else:
        county = 'Something went wrong!'
        
    #Set up regex to remove all subsequent headers
    #This regex should identify the headers even if some of the info changes later on
    body = re.sub(r"""\n\x0c\s*[A-Z0-9 \n/#\:-]*\(S\)\s*\n\n""", '', body)
    
    #Split the text on the \n to isolate each case
    cases = body.split('\n')
    
    #Drop the last case. It's just the total case count from the report
    cases.pop()
    
    #Remove cases that happen to be empty or consist of whitespace only
    cases = [case for case in cases if case.isspace() == False and len(case) > 0]
    
    #Loop through each line. Add case info to temp dict, and then add that to the case list
    #Some fields are commented out because we don't need that info yet.
    for line in cases:
        #Check if line is the start of a new case
        if not line[0].isspace():
            #Check if the temp_dict is empty.
            #If not, add temp_dict data to case_list
            if bool(temp_dict) == True:
                #Add list info to temp_dict
                temp_dict['Offense'] = offense_list

                #Add temp dict data to case_list
                case_list.append(temp_dict)

            #Reset temp_dict
            temp_dict = {}

            #Reset lists
            offense_list = []
            
            #Assign the county
            temp_dict['County'] = county

            #Gather the cause number
            temp_dict['Cause Number'] = line[9:26].strip()

            #Gather the file date
            temp_dict['File Date'] = line[26:38].strip()

            #Get court
            #temp_dict['Court'] = line[38:70].strip()

            #Get respondent
            #temp_dict['Respondent'] = line[70:].strip()

            #End of line, so move to next one

        else:
            #Get offenses
            offense = line.strip()

            #Check if offense is all whitesapace. If not, strip it and add to list
            #Also check that the string is not empty
            if offense.isspace() == False and len(offense) > 0:
                offense_list.append(offense.strip())

            #End of line

    #Check that the last case was added to the list
    #If not, add it
    #Add list info to temp_dict
    temp_dict['Offense'] = offense_list

    #Add temp dict data to case_list
    case_list.append(temp_dict)
    
    #How many?
    print(f'Collected Data From {len(case_list)} Cases.')
    
    #Create dataframe
    df = pd.DataFrame(case_list)
    
    #Add report as of date
    df["Original As Of Date"] = report_as_of_date
    df["Last As Of Date"] = report_as_of_date
    
    return df

# Test Function

In [75]:
#Set up the pdf path
pdf_path = '/Users/johnathonsmith/Downloads/Juvenile_Case_Reports/MJU as of 8 24 23.pdf'

In [76]:
#Set up resource manager to handle pdf content. text, images, etc.
resource_manager = PDFResourceManager()

#Used to display text
fake_file_handle = io.StringIO()

#Set up converter
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

#Set up page interpreter
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open(pdf_path, 'rb') as fh:

    for page_num, page in enumerate(PDFPage.get_pages(fh, caching=True, check_extractable=True)):
        #Process the current page
        page_interpreter.process_page(page)

    #Save the current page's text to a variable
    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

#print(text)

In [77]:
df = extract_juvenile_cases(text)

Collected Data From 24 Cases.


In [78]:
df

Unnamed: 0,County,Cause Number,File Date,Offense,Original As Of Date,Last As Of Date
0,Maverick,19-01-00554-MJU,01/10/2019,[],08/24/2023,08/24/2023
1,Maverick,19-02-00556-MJU,02/07/2019,[],08/24/2023,08/24/2023
2,Maverick,19-03-00558-MJU,03/20/2019,[],08/24/2023,08/24/2023
3,Maverick,19-04-00560-MJU,04/08/2019,[],08/24/2023,08/24/2023
4,Maverick,19-07-00562-MJU,07/09/2019,[],08/24/2023,08/24/2023
5,Maverick,19-08-00564-MJU,08/29/2019,[],08/24/2023,08/24/2023
6,Maverick,20-01-00566-MJU,01/09/2020,[],08/24/2023,08/24/2023
7,Maverick,20-02-00568-MJU,02/26/2020,[],08/24/2023,08/24/2023
8,Maverick,20-05-00570-MJU,05/14/2020,[],08/24/2023,08/24/2023
9,Maverick,21-05-00572-MJU,05/12/2021,[],08/24/2023,08/24/2023


# Create Logic for CSV Files

In [56]:
#Since it's a csv file, theres not much I need to do to acquire the data
df = pd.read_csv('/Users/johnathonsmith/Downloads/JU All Counties_20230919.csv')
df.head()

Unnamed: 0,Cause #,Respondent,Court,Offense,File<br>Date,Disposition<br>Date,MTR<br>File Date,MTR<br>Disposition Date,Next Docket Date
0,<b></b>,,293RD DISTRICT COURT,,,,,,
1,<b>08-03-00134-DJU</b>,"HERNANDEZ, GABRIEL LEDESMA",293RD DISTRICT COURT,AGG ASSAULT W/DEADLY WEAPON,3/20/2008,,,,
2,<b>08-10-00208-ZJU</b>,"MATA JR, MAURICIO JIMENEZ",293RD DISTRICT COURT,BURGLARY OF HABITATION,10/21/2008,,,,
3,<b>09-03-00212-ZJU</b>,"BALBOA, III, PABLO",293RD DISTRICT COURT,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",3/9/2009,6/10/2009,,,
4,<b>09-07-00146-DJU</b>,"RUBIO, FERMIN",293RD DISTRICT COURT,BURGLARY OF BUILDING,7/30/2009,,,,


In [57]:
len(df)

165

In [58]:
#Try removing the formatting from the cause numbers
string = df[df['Cause #'] == '<b><font color=red>12-05-00156-DJU</fon></b>']['Cause #']
string

31    <b><font color=red>12-05-00156-DJU</fon></b>
Name: Cause #, dtype: object

In [59]:
#Convert it to string
string = str(string)

In [60]:
#First remove the number and spaces at the beginning
string = string[string.find('<'):]
string

'<b><font color=red>12-05-00156-DJU</fon></b>\nName: Cause #, dtype: object'

In [61]:
new_string = re.sub(r'<b>', '', string)
new_string

'<font color=red>12-05-00156-DJU</fon></b>\nName: Cause #, dtype: object'

In [62]:
new_string = re.sub(r'<font color=red>', '', new_string)
new_string

'12-05-00156-DJU</fon></b>\nName: Cause #, dtype: object'

In [63]:
new_string = re.sub(r'</fon>', '', new_string)
new_string

'12-05-00156-DJU</b>\nName: Cause #, dtype: object'

In [64]:
new_string = re.sub(r'</b>', '', new_string)
new_string

'12-05-00156-DJU\nName: Cause #, dtype: object'

In [65]:
new_string = new_string.split('\n')[0]
new_string

'12-05-00156-DJU'

In [66]:
df['Cause #']

0                     <b></b>
1      <b>08-03-00134-DJU</b>
2      <b>08-10-00208-ZJU</b>
3      <b>09-03-00212-ZJU</b>
4      <b>09-07-00146-DJU</b>
                ...          
160     <b>23-07-0224-DJU</b>
161    <b>23-07-00592-MJU</b>
162    <b>23-07-CESAR-ZJU</b>
163    <b>23-07-JULIA-DJU</b>
164    <b>23-09-MICHA-MJU</b>
Name: Cause #, Length: 165, dtype: object

In [67]:
#Create function for getting cause numbers
def remove_formatting(val):
    """
    This function takes in the formatted cause numbers from the juvenile CSV file and removes the formatting.
    
    Parameter:
        - val: The formatted cause number
        
    Returns:
        - new_string: The cause number without formatting or extra characters
    """
    
    #Convert it to string
    string = str(val)
    
    #First remove the number and spaces at the beginning
    string = string[string.find('<'):]
    
    #Remove the rest of the formatting in order of possible occurrence
    new_string = re.sub(r'<b>', '', string)
    new_string = re.sub(r'<font color=red>', '', new_string)
    new_string = re.sub(r'</fon>', '', new_string)
    new_string = re.sub(r'</b>', '', new_string)
    
    #Now split on the newline and remove the rest of the extra characters after it
    new_string = new_string.split('\n')[0]
    
    return new_string

In [68]:
df['Cause #'] = df['Cause #'].apply(remove_formatting)

In [69]:
df['Cause #'][1]

'08-03-00134-DJU'

In [70]:
#Build function to determine county
def get_county_name(val):
    """
    This function takes in the juvenile cases cause number and determines the county of each case.
    
    Parameter:
        -val: The cause number as a string
    
    Returns:
        -county: The name of the county
    """
    
    if val.count("DJU") == 1:
        return 'Dimmit'
    elif val.count("ZJU") == 1:
        return 'Zavala'
    else:
        #Assume all others are Maverick county since it is most likely
        return 'Maverick'

In [71]:
df['County'] = df['Cause #'].apply(get_county_name)

In [72]:
df

Unnamed: 0,Cause #,Respondent,Court,Offense,File<br>Date,Disposition<br>Date,MTR<br>File Date,MTR<br>Disposition Date,Next Docket Date,County
0,,,293RD DISTRICT COURT,,,,,,,Maverick
1,08-03-00134-DJU,"HERNANDEZ, GABRIEL LEDESMA",293RD DISTRICT COURT,AGG ASSAULT W/DEADLY WEAPON,3/20/2008,,,,,Dimmit
2,08-10-00208-ZJU,"MATA JR, MAURICIO JIMENEZ",293RD DISTRICT COURT,BURGLARY OF HABITATION,10/21/2008,,,,,Zavala
3,09-03-00212-ZJU,"BALBOA, III, PABLO",293RD DISTRICT COURT,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",3/9/2009,6/10/2009,,,,Zavala
4,09-07-00146-DJU,"RUBIO, FERMIN",293RD DISTRICT COURT,BURGLARY OF BUILDING,7/30/2009,,,,,Dimmit
...,...,...,...,...,...,...,...,...,...,...
160,23-07-0224-DJU,JULIAN GABRIEL MAGALLAN,293RD DISTRICT COURT,SMUGGLING OF PERSONS: FIREARM,7/26/2023,,,,,Dimmit
161,23-07-00592-MJU,ASHLEY BERNABE LEYVA,293RD DISTRICT COURT,EVADING ARREST DET W/VEH,7/29/2023,,,,10/4/2023,Maverick
162,23-07-CESAR-ZJU,CESAR ROMERO NAVA,293RD DISTRICT COURT,,,,,,,Zavala
163,23-07-JULIA-DJU,JULIAN CARRIVALES,293RD DISTRICT COURT,,,,,,,Dimmit


In [73]:
#Remove all empty cause number rows
df = df[df['Cause #'] != '']
df

Unnamed: 0,Cause #,Respondent,Court,Offense,File<br>Date,Disposition<br>Date,MTR<br>File Date,MTR<br>Disposition Date,Next Docket Date,County
1,08-03-00134-DJU,"HERNANDEZ, GABRIEL LEDESMA",293RD DISTRICT COURT,AGG ASSAULT W/DEADLY WEAPON,3/20/2008,,,,,Dimmit
2,08-10-00208-ZJU,"MATA JR, MAURICIO JIMENEZ",293RD DISTRICT COURT,BURGLARY OF HABITATION,10/21/2008,,,,,Zavala
3,09-03-00212-ZJU,"BALBOA, III, PABLO",293RD DISTRICT COURT,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",3/9/2009,6/10/2009,,,,Zavala
4,09-07-00146-DJU,"RUBIO, FERMIN",293RD DISTRICT COURT,BURGLARY OF BUILDING,7/30/2009,,,,,Dimmit
5,09-08-00214-ZJU,"GONZALEZ, JUAN CARLOS",293RD DISTRICT COURT,BURGLARY OF HABITATION,8/20/2009,10/5/2009,,,,Zavala
...,...,...,...,...,...,...,...,...,...,...
160,23-07-0224-DJU,JULIAN GABRIEL MAGALLAN,293RD DISTRICT COURT,SMUGGLING OF PERSONS: FIREARM,7/26/2023,,,,,Dimmit
161,23-07-00592-MJU,ASHLEY BERNABE LEYVA,293RD DISTRICT COURT,EVADING ARREST DET W/VEH,7/29/2023,,,,10/4/2023,Maverick
162,23-07-CESAR-ZJU,CESAR ROMERO NAVA,293RD DISTRICT COURT,,,,,,,Zavala
163,23-07-JULIA-DJU,JULIAN CARRIVALES,293RD DISTRICT COURT,,,,,,,Dimmit


In [77]:
df['Court'] = '293'
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,Cause #,Respondent,Court,Offense,File<br>Date,Disposition<br>Date,MTR<br>File Date,MTR<br>Disposition Date,Next Docket Date,County
1,08-03-00134-DJU,"HERNANDEZ, GABRIEL LEDESMA",293,AGG ASSAULT W/DEADLY WEAPON,3/20/2008,,,,,Dimmit
2,08-10-00208-ZJU,"MATA JR, MAURICIO JIMENEZ",293,BURGLARY OF HABITATION,10/21/2008,,,,,Zavala
3,09-03-00212-ZJU,"BALBOA, III, PABLO",293,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",3/9/2009,6/10/2009,,,,Zavala
4,09-07-00146-DJU,"RUBIO, FERMIN",293,BURGLARY OF BUILDING,7/30/2009,,,,,Dimmit
5,09-08-00214-ZJU,"GONZALEZ, JUAN CARLOS",293,BURGLARY OF HABITATION,8/20/2009,10/5/2009,,,,Zavala
...,...,...,...,...,...,...,...,...,...,...
160,23-07-0224-DJU,JULIAN GABRIEL MAGALLAN,293,SMUGGLING OF PERSONS: FIREARM,7/26/2023,,,,,Dimmit
161,23-07-00592-MJU,ASHLEY BERNABE LEYVA,293,EVADING ARREST DET W/VEH,7/29/2023,,,,10/4/2023,Maverick
162,23-07-CESAR-ZJU,CESAR ROMERO NAVA,293,,,,,,,Zavala
163,23-07-JULIA-DJU,JULIAN CARRIVALES,293,,,,,,,Dimmit


In [80]:
df.rename(columns = {
    'Cause #':'Cause Number',
    'File<br>Date':'File Date',
    'Disposition<br>Date':'Disposed Dates',
    'Next Docket Date':'Docket Date'}
    ,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [81]:
df

Unnamed: 0,Cause Number,Respondent,Court,Offense,File Date,Disposed Dates,MTR<br>File Date,MTR<br>Disposition Date,Docket Date,County
1,08-03-00134-DJU,"HERNANDEZ, GABRIEL LEDESMA",293,AGG ASSAULT W/DEADLY WEAPON,3/20/2008,,,,,Dimmit
2,08-10-00208-ZJU,"MATA JR, MAURICIO JIMENEZ",293,BURGLARY OF HABITATION,10/21/2008,,,,,Zavala
3,09-03-00212-ZJU,"BALBOA, III, PABLO",293,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",3/9/2009,6/10/2009,,,,Zavala
4,09-07-00146-DJU,"RUBIO, FERMIN",293,BURGLARY OF BUILDING,7/30/2009,,,,,Dimmit
5,09-08-00214-ZJU,"GONZALEZ, JUAN CARLOS",293,BURGLARY OF HABITATION,8/20/2009,10/5/2009,,,,Zavala
...,...,...,...,...,...,...,...,...,...,...
160,23-07-0224-DJU,JULIAN GABRIEL MAGALLAN,293,SMUGGLING OF PERSONS: FIREARM,7/26/2023,,,,,Dimmit
161,23-07-00592-MJU,ASHLEY BERNABE LEYVA,293,EVADING ARREST DET W/VEH,7/29/2023,,,,10/4/2023,Maverick
162,23-07-CESAR-ZJU,CESAR ROMERO NAVA,293,,,,,,,Zavala
163,23-07-JULIA-DJU,JULIAN CARRIVALES,293,,,,,,,Dimmit


In [82]:
df = df[[
    'County',
    'Cause Number',
    'File Date',
    'Docket Date',
    'Court',
    'Offense',
    'Disposed Dates'
]]

In [83]:
df

Unnamed: 0,County,Cause Number,File Date,Docket Date,Court,Offense,Disposed Dates
1,Dimmit,08-03-00134-DJU,3/20/2008,,293,AGG ASSAULT W/DEADLY WEAPON,
2,Zavala,08-10-00208-ZJU,10/21/2008,,293,BURGLARY OF HABITATION,
3,Zavala,09-03-00212-ZJU,3/9/2009,,293,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",6/10/2009
4,Dimmit,09-07-00146-DJU,7/30/2009,,293,BURGLARY OF BUILDING,
5,Zavala,09-08-00214-ZJU,8/20/2009,,293,BURGLARY OF HABITATION,10/5/2009
...,...,...,...,...,...,...,...
160,Dimmit,23-07-0224-DJU,7/26/2023,,293,SMUGGLING OF PERSONS: FIREARM,
161,Maverick,23-07-00592-MJU,7/29/2023,10/4/2023,293,EVADING ARREST DET W/VEH,
162,Zavala,23-07-CESAR-ZJU,,,293,,
163,Dimmit,23-07-JULIA-DJU,,,293,,


In [84]:
#Replace NaN values
df.fillna('', inplace = True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,County,Cause Number,File Date,Docket Date,Court,Offense,Disposed Dates
1,Dimmit,08-03-00134-DJU,3/20/2008,,293,AGG ASSAULT W/DEADLY WEAPON,
2,Zavala,08-10-00208-ZJU,10/21/2008,,293,BURGLARY OF HABITATION,
3,Zavala,09-03-00212-ZJU,3/9/2009,,293,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",6/10/2009
4,Dimmit,09-07-00146-DJU,7/30/2009,,293,BURGLARY OF BUILDING,
5,Zavala,09-08-00214-ZJU,8/20/2009,,293,BURGLARY OF HABITATION,10/5/2009
...,...,...,...,...,...,...,...
160,Dimmit,23-07-0224-DJU,7/26/2023,,293,SMUGGLING OF PERSONS: FIREARM,
161,Maverick,23-07-00592-MJU,7/29/2023,10/4/2023,293,EVADING ARREST DET W/VEH,
162,Zavala,23-07-CESAR-ZJU,,,293,,
163,Dimmit,23-07-JULIA-DJU,,,293,,


In [103]:
#Use filename to get date values
filename = 'JU All Counties_20230919.csv'

as_of_date = filename[-12:-4]
as_of_date

'20230919'

In [104]:
as_of_date = as_of_date[4:6] + '/' + as_of_date[6:] + '/' + as_of_date[:4]
as_of_date

'09/19/2023'

In [106]:
#Get status of each case
def determine_case_status(val):
    """
    This function takes in the disposed date column values. If the value is not an empty string it will assign
    the case status as 'Open'. Otherwise, it will be labeled as 'Disposed'.
    
    Parameter:
        - val: A string representing the disposed date of the associated case.
                If not disposed, val will be an empty string.
    
    Returns:
        - string: Either 'Open' or 'Disposed' based on the value
    """
    
    if len(val) == 0:
        return 'Open'
    else:
        return 'Disposed'

In [107]:
df['Status'] = df['Disposed Dates'].apply(determine_case_status)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,County,Cause Number,File Date,Docket Date,Court,Offense,Disposed Dates,Status
1,Dimmit,08-03-00134-DJU,3/20/2008,,293,AGG ASSAULT W/DEADLY WEAPON,,Open
2,Zavala,08-10-00208-ZJU,10/21/2008,,293,BURGLARY OF HABITATION,,Open
3,Zavala,09-03-00212-ZJU,3/9/2009,,293,"CRIM MISCH <$1,500 HAB DAMAGE FIREARM/EXPLOSIVE",6/10/2009,Disposed
4,Dimmit,09-07-00146-DJU,7/30/2009,,293,BURGLARY OF BUILDING,,Open
5,Zavala,09-08-00214-ZJU,8/20/2009,,293,BURGLARY OF HABITATION,10/5/2009,Disposed
...,...,...,...,...,...,...,...,...
160,Dimmit,23-07-0224-DJU,7/26/2023,,293,SMUGGLING OF PERSONS: FIREARM,,Open
161,Maverick,23-07-00592-MJU,7/29/2023,10/4/2023,293,EVADING ARREST DET W/VEH,,Open
162,Zavala,23-07-CESAR-ZJU,,,293,,,Open
163,Dimmit,23-07-JULIA-DJU,,,293,,,Open


In [None]:
#Build overall function
def build_juvenile_dataframe_from_csv(filename, df):
    """
    This function takes in the file name and base df created using the juvenile csv file.
    It will use the filename to get the date column values and perform several transformations
    on the base df. The end result will be a fully prepared juvenile cases dataframe.
    
    Parameters:
        - filename: The name of the juvenile csv file. Contains the date it was created.
        - df: The base df created from running pd.read_csv() on the file object.
        
    Returns:
        - df: The fully prepared version of the base df.
    """
    
    #Remove the formatting on the cause number column
    df['Cause #'] = df['Cause #'].apply(remove_formatting)
    
    #Get the county name associated with each case
    df['County'] = df['Cause #'].apply(get_county_name)
    
    #Remove all empty cause number rows
    df = df[df['Cause #'] != '']
    
    #Assign court value
    df['Court'] = '293'
    
    #Rename columns
    df.rename(columns = {
    'Cause #':'Cause Number',
    'File<br>Date':'File Date',
    'Disposition<br>Date':'Disposed Dates',
    'Next Docket Date':'Docket Date'}
    ,inplace = True)
    
    #Reorder and remove unwanted columns
    df = df[[
        'County',
        'Cause Number',
        'File Date',
        'Docket Date',
        'Court',
        'Offense',
        'Disposed Dates'
    ]]
    
    #Replace NaN values
    df.fillna('', inplace = True)
    
    #Get date values from file name
    as_of_date = filename[-12:-4]
    as_of_date = as_of_date[4:6] + '/' + as_of_date[6:] + '/' + as_of_date[:4]
    
    df['Report Generated Date'] = as_of_date
    df['Original As Of Date'] = as_of_date
    df['Last As Of Date'] = as_of_date
    
    #Add comments column
    df['Comments'] = ''
    
    #Add Status column
    df['Status'] = df['Disposed Dates'].apply(determine_case_status)
    
    return df