In [21]:
import os.path as path # used for easily finding the csvs in other directories
from PyPDF2 import PdfReader # used to read and extract text from PDFs

import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from datetime import datetime

In [5]:
# loading csv from local directory function
def load_within_directory(directory_string: str) -> str:
    """
    Build the absolute path of a file that lives inside the repo folder.

    This function does NOT read the file and does NOT return a dataframe: it
    only returns the path as a string. The caller passes that string on to a
    reader such as pandas ``read_csv`` or ``PdfReader``.

    The base directory is resolved two levels above the current working
    directory (the same place ``abspath("data/../../..")`` points to), so the
    result depends on where the notebook is executed from.

    example directory string: '/data/raw_data/CIE/client_needs_table.csv'
    :param directory_string: location of the desired file relative to the base
        directory; it is appended as-is, so it must start with its own
        leading separator
    :return: the absolute base directory followed by ``directory_string``,
        as a plain string
    """
    # "data/../../.." collapses to "../..": two levels above the working directory
    base_directory = path.abspath(path.join(path.pardir, path.pardir))

    # plain string concatenation -- no separator is inserted here
    return base_directory + directory_string

In [7]:
# open the sheriff's lockout activity PDF; the reader object gives access
# to the individual pages used by the cells below
sheriff_pdf = PdfReader(
    stream=load_within_directory(
        '/data/raw_data/sheriff_evictions_2018_jan_2023/sdso_lockout_service_activity_details_jan_2018_jan_2023.pdf'
    )
)

In [9]:
# Preview cell: walks through, on a single page, the same steps the
# PDF parsing loop performs on every page.

# take one specific page out of the pdf file (the first one) ...
temp_page = sheriff_pdf.pages[0]

# ... and pull the text out of that page so it can be inspected
temp_text = temp_page.extract_text()

# how many pages does the file have? -- should be 1184
print(len(sheriff_pdf.pages))

# raw text of the first page, exactly as the extractor returns it
print(temp_text)

1184
SAN DIEGO COUNTY SHERIFF'S OFFICE
Eviction List
TO 01/01/2018 01/31/2023
Office Phone #: (619) 544-6401
Address Status File Number Occupants Restoration Date Time
Chula Vista
2015250864 NOTFOUND Alex Luevano,All Unknown 
Occupants,Carlos 
Luevano1256 8th Street
(Front and Back House)
Imperial Beach, CA 9193204/21/2021  12:00 AM
2015250883 CANCELLED Audra Souza, Anthony A . 
Souza464 E H St #503
Chula Vista , CA 9191005/20/2020  12:00 AM
2017253065 SERVED Maria Varela ,Luis Matheu 
III1754 Via Costina
San Diego, CA 9217301/04/2018  12:00 AM
2017253067 SERVED All Unknown 
Occupants, Charvella West1357 Burgundy Dr
Chula Vista , CA 9191301/04/2018  12:00 AM
2017253088 SERVED Hildelisa Ochoa,All 
Unknown Occupants918 Tenth St , #3
Coronado, CA 9211801/03/2018  12:00 AM
2017253090 SERVED All Unknown 
Occupants,Raphael 
Vazquez ,Vanessa Rachel 
Vazquez1746 Via Capri
Chula Vista , CA 9191301/03/2018  12:00 AM
2017253147 SERVED Rogel Trucking , 
LLC,Jeremias Rogel Jr,All 
Unknown Occupants

# SDSO PDF Extraction Tool
## How this works
* First, the PDF is read by the code above. Luckily, it is fairly light on images, so it is no larger than 4 MB
* Second, the loop below will scrape every single page of text and store it into a large object (like a big text file)
    * This makes it easier to parse through and format into a dataframe
* Third, we need to make a string parser loop which will store a dictionary of each "column" for the dataframe. This is a bit difficult, as we essentially need to figure out how many lines each "row" takes up.
    * The parser divides the giant text document into rows, then adds them to each respective column piecewise
* Each column will be merged into a single dataframe with the hopes that it will be much easier to analyse.
---

In [10]:
# Extract the text of every page and stitch all pages into one big string.
#
# Pages are joined with a newline so that the last line of one page cannot
# fuse with the first line of the next page once the text is split into
# lines. A fused line such as "...12:00 AM<start of next page>" would no
# longer end in AM/PM, so two records could be merged into a single row.
#
# str.join builds the result once instead of re-copying the growing string
# for each of the pages.
raw_pdf_string = "\n".join(page.extract_text() for page in sheriff_pdf.pages)


So the format for each row should be:

Chula Vista

2015250864 NOTFOUND Alex Luevano,All Unknown

Occupants,Carlos

Luevano1256 8th Street

(Front and Back House)

Imperial Beach, CA 9193204/21/2021  12:00 AM

* First we need to remove the initial title
* Then try chunking the remaining lines into either proper rows or column headers/unneeded info

---

In [146]:
# break the extracted text into a list of lines we can index through,
# skipping the first six lines (the police header at the very top of the
# document) to make the rest easier to process
# note: the sixth skipped line is a very odd stand-alone "Chula Vista"
# entry -- was this a mistake in the report?
raw_pdf_list = raw_pdf_string.splitlines()[6:]

In [147]:
# Unwanted lines have to go before the rows can be rebuilt:
#   * any line containing "Printed:" is removed here
#     (for an example, evaluate 'Printed:' in raw_pdf_list[46] before running this cell)
#   * lines starting with "Address" should be removed as well, but this cell
#     does not filter them
# The list is filtered in a single pass and written back in place, which
# sidesteps the index-shifting issues of deleting items while looping.
raw_pdf_list[:] = [line for line in raw_pdf_list if 'Printed:' not in line]
# from here on the list is expected to hold row text only (no "Printed:" lines)

In [148]:
# Rebuild one string per record ("row cell") out of the individual lines.
# A record begins at its file no. and finishes on the line that carries the
# time, which is recognised by the trailing "AM" / "PM".
#   raw_pdf_list[0][0:2]     -> check for the leading 20 or 30
#   raw_pdf_list[4][-2::1]   -> check for AM or PM
raw_rows = []
row_parts = []

for line in raw_pdf_list:
    row_parts.append(line)
    if line.endswith(("AM", "PM")):
        # the time closes the record: glue the collected lines together
        # (no separator) and start collecting the next record
        raw_rows.append("".join(row_parts))
        row_parts = []

# side note: every recorded Time appears to be 12:00 AM...
# is that an error in the data, or was the time simply never filled in?

In [149]:
# first lets get the case no.
# The case (file) number is the first 10 characters of every row.
# It is cut off by position rather than with str.replace(): replace() would
# also delete any later occurrence of the same 10 characters inside the row,
# silently corrupting the remaining text.
# NOTE: raw_rows is rewritten in place, so this cell must run exactly once
# after raw_rows is built -- a second run would strip 10 more characters.
case_no_list = [row[:10] for row in raw_rows]
raw_rows[:] = [row[10:].lstrip() for row in raw_rows]

In [151]:
# next: the case status, which is the first whitespace-delimited word left
# in each row once the case no. is gone
# The remaining words are re-joined with single spaces, which also collapses
# any run of whitespace inside the row into one space.
status_list = []
for i, row in enumerate(raw_rows):
    words = row.split()
    status_list.append(words[0])
    raw_rows[i] = ' '.join(words[1:])

In [170]:
# next are the dates
# The last 19 characters of each row hold the date and time in the form
# "MM/DD/YYYY HH:MM AM" (10 + 1 + 8 characters); raw_rows[0][-19::1] shows
# the format. The single space between date and time comes from the status
# cell, which re-joins every row with single spaces.
# The suffix is cut off by position rather than with str.replace(), because
# replace() would also delete any other occurrence of the same text in the row.
# NOTE: raw_rows is rewritten in place, so this cell must run exactly once.
date_list = [row[-19:] for row in raw_rows]
raw_rows[:] = [row[:-19].rstrip() for row in raw_rows]

In [180]:
# Progress so far: file number, status and restoration date are split out.
# What is left in Raw_Rows are the names and addresses, still glued together;
# separating those might be a little tricky...
#
# For now, this is the current dataframe. Instead of splitting Raw_Rows here,
# we will add in the columns cleaned by the library's method.
dataframe_dict = {
    "File_Number": case_no_list,
    "Raw_Rows": raw_rows,
    "Restoration_Date": date_list,
    "Status": status_list,
}
current_df = pd.DataFrame(data=dataframe_dict)
current_df.head()

In [188]:
# read the csv exported by the EvictionPDF_altered script (the library's
# parse of the same PDF) into its own dataframe
library_df = pd.read_csv(
    filepath_or_buffer=load_within_directory(
        '/data/raw_data/sheriff_evictions_2018_jan_2023/evictions_library_export.csv'
    )
)

In [189]:
# display the dataframe loaded from evictions_library_export.csv
library_df

Unnamed: 0,File_Number,Occupants,Address,Restoration_Date,Time,Status,street,unit1,unit2,unit3,unit4,unit5,city,state,zip
0,Chula Vista\n2015250864,"Alex Luevano,All Unknown\nOccupants,Carlos\nLu...",1256 8th Street\n(Front and Back House)\nImper...,04/21/2021,12:00 AM,NOTFOUND,1256 8th Street\n(Front and Back House),,,,,,Imperial Beach,CA,91932
1,2015250883,"Audra Souza,Anthony A.\nSouza","464 E H St #503\nChula Vista, CA 91910",05/20/2020,12:00 AM,CANCELLED,464 E H St,503,503,,,,Chula Vista,CA,91910
2,2017253065,"Maria Varela,Luis Matheu\nIII","1754 Via Costina\nSan Diego, CA 92173",01/04/2018,12:00 AM,SERVED,1754 Via Costina,,,,,,San Diego,CA,92173
3,2017253067,"All Unknown\nOccupants,Charvella West","1357 Burgundy Dr\nChula Vista, CA 91913",01/04/2018,12:00 AM,SERVED,1357 Burgundy Dr,,,,,,Chula Vista,CA,91913
4,2017253088,"Hildelisa Ochoa,All\nUnknown Occupants","918 Tenth St, #3\nCoronado, CA 92118",01/03/2018,12:00 AM,SERVED,918 Tenth St,3,3,,,,Coronado,CA,92118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14831,2023450129,"All Unknown\nOccupants,John M. Caro","145 W El Norte Parkway #225\nEscondido, CA 92026",01/27/2023,12:00 AM,CANCELLED,145 W El Norte Parkway,225,225,,,,Escondido,CA,92026
14832,2023450129,"All Unknown\nOccupants,John M. Caro","145 W El Norte Parkway Garage\n#56Escondido,\n...",01/27/2023,12:00 AM,CANCELLED,145 W El Norte Parkway Garage\n,"56Escondido,","56Escondido,",,,,CA 92026,,
14833,2023450131,"Jose Agular,Johanna\nFuentes,All Unknown\nOccu...","514 N. Vine Street, Unit 1\nFallbrook, CA 92028",01/30/2023,12:00 AM,SERVED,514 N. Vine Street,1,,1,,,Fallbrook,CA,92028
14834,2023450132,"Kelly A. Celella,Eugene\nCatibog,Craig Nicholas","9986 Via Daroca\nSan Diego, CA 92129",01/27/2023,12:00 AM,OPEN,9986 Via Daroca,,,,,,San Diego,CA,92129


In [190]:
# display the dataframe built by the manual parsing cells in this notebook
current_df

Unnamed: 0,File_Number,Raw_Rows,Restoration_Date,Status
0,2015250864,"Alex Luevano,All Unknown Occupants,Carlos Luev...",04/21/2021 12:00 AM,NOTFOUND
1,2015250883,"Audra Souza, Anthony A . Souza464 E H St #503C...",05/20/2020 12:00 AM,CANCELLED
2,2017253065,"Maria Varela ,Luis Matheu III1754 Via CostinaS...",01/04/2018 12:00 AM,SERVED
3,2017253067,"All Unknown Occupants, Charvella West1357 Burg...",01/04/2018 12:00 AM,SERVED
4,2017253088,"Hildelisa Ochoa,All Unknown Occupants918 Tenth...",01/03/2018 12:00 AM,SERVED
...,...,...,...,...
13626,2023450129,"All Unknown Occupants,John M. Caro145 W El Nor...",01/27/2023 12:00 AM,CANCELLED
13627,2023450129,"All Unknown Occupants,John M. Caro145 W El Nor...",01/27/2023 12:00 AM,CANCELLED
13628,2023450131,"Jose Agular ,Johanna Fuentes,All Unknown Occup...",01/30/2023 12:00 AM,SERVED
13629,2023450132,"Kelly A . Celella,Eugene Catibog,Craig Nichola...",01/27/2023 12:00 AM,OPEN
