In [1]:
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import tempfile
import re
import logging as log

import PyPDF2
import pandas as pd
import numpy as np

# Daily incident summary PDFs to process, in chronological order.
# Two irregularities in the list are kept on purpose:
#   * there is no entry for 2022-02-13;
#   * the 2022-02-28 report is stored under the "2022-03" folder.
urls = [
    f'https://www.normanok.gov/sites/default/files/documents/{folder}/{day}_daily_incident_summary.pdf'
    for folder, day in (
        # February reports stored under the February folder (13th absent).
        [('2022-02', f'2022-02-{d:02d}') for d in range(1, 28) if d != 13]
        # Reports stored under the March folder.
        + [
            ('2022-03', '2022-02-28'),
            ('2022-03', '2022-03-01'),
            ('2022-03', '2022-03-02'),
        ]
    )
]

In [2]:
def fetch_incidents(url, filename='./data.pdf'):
    """Download the document at ``url`` and store it on disk.

    Parameters
    ----------
    url : str
        Address of the file to download.
    filename : str, optional
        Path the downloaded bytes are written to. Defaults to
        ``'./data.pdf'``; an existing file at that path is overwritten.

    Returns
    -------
    str or None
        ``filename`` on success. ``None`` if anything went wrong (the
        exception is logged, not raised), so callers must check for ``None``.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
    }

    try:
        # Read the whole body before touching the output file, so a failed
        # download never leaves a truncated file behind.
        with urlopen(Request(url, headers=headers)) as resp:
            payload = resp.read()

        with open(filename, 'wb') as fp:
            fp.write(payload)
    except Exception as e:
        # Best-effort: log the failure and signal it with None.
        log.error(e)
        return None

    return filename

In [3]:
def extract_incidents(filename, is_test=False):
    """Extract incident records from a PDF using regular expressions.

    Parameters
    ----------
    filename : str or None
        Path of the PDF to parse. ``None`` is passed straight through.
    is_test : bool, optional
        Selects the pattern set. When ``False`` (default), each match is a
        5-tuple; a second pattern also picks up records where the incident
        number is followed by a single line and then the next date/time, and
        yields empty strings in positions 3 and 4. When ``True``, each match
        is a 3-tuple (date/time, incident number, following line).

    Returns
    -------
    list of tuple of str, or None
        All matches from all pages, in page order. ``None`` if ``filename``
        is ``None``.
    """
    if filename is None:
        return None

    # The context manager closes the file even if PyPDF2 raises while parsing.
    with open(filename, 'rb') as fp:
        pdf_reader = PyPDF2.pdf.PdfFileReader(fp)
        pages_text = [
            pdf_reader.getPage(i).extractText()
            for i in range(pdf_reader.numPages)
        ]

    data = []
    for page_text in pages_text:
        if is_test:
            data += re.findall(r'(\d+/\d+/\d{4}.\d+:\d\d)\n(\d{4}-\d{8})\n([0-9A-Z,\.;#\'<>&\(\) /-]*)', page_text)
        else:
            # Full records: five consecutive lines.
            data += re.findall(r'(\d+/\d+/\d{4}.\d+:\d\d)\n(\d{4}-\d{8})\n([0-9A-Z,\.;#\'<>&\(\) /-]*)\n([A-Za-z0-9 /]*)\n([0-9A-Z /]*)', page_text)
            # Short records: the two empty groups keep the tuple 5 wide.
            data += re.findall(r'(\d+/\d+/\d{4}.\d+:\d\d)\n(\d{4}-\d{8})()()\n([0-9A-Z /]*)\n\d+/\d+/\d{4}.\d+:\d\d', page_text)

    return data

In [4]:
def generate_dataframe(data, columns):
    """Build a DataFrame of incidents sorted chronologically.

    Parameters
    ----------
    data : sequence of tuple
        Row records; each must have one value per entry in ``columns``.
        May be empty.
    columns : list of str
        Column names. Must include ``'DateTime'``.

    Returns
    -------
    pandas.DataFrame
        Frame with ``'DateTime'`` converted via ``pd.to_datetime``, rows
        sorted by ``'DateTime'`` and the index renumbered from 0.
    """
    if len(data) == 0:
        # np.array([]) is 1-D and cannot be matched to several columns,
        # so build the empty frame directly.
        df = pd.DataFrame(columns=columns)
    else:
        df = pd.DataFrame(np.array(data), columns=columns)

    df['DateTime'] = pd.to_datetime(df['DateTime'])

    # sort_values returns a new frame, so the result must be kept.
    # mergesort is stable: rows with equal timestamps keep their input order.
    return df.sort_values(by='DateTime', kind='mergesort').reset_index(drop=True)

In [5]:
def save_incidents(df, filename):
    """Write ``df`` to ``filename`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)

In [7]:
# Pipeline driver: download every report, extract its rows, then build and
# save one combined DataFrame.
is_test = False
# The three-column layout goes with the single pattern used when is_test is
# True; the five-column layout goes with the default patterns.
columns = ['DateTime','IncidentNumber','IncidentORI'] if is_test else ['DateTime','IncidentNumber','Location','Nature','IncidentORI']
out_filename = './full.csv' if is_test else './filtered.csv'

data = []
for i, url in enumerate(urls):
    print(f"Extracting {i} - {url} ...")
    data_file = fetch_incidents(url)
    incidents = extract_incidents(data_file, is_test)
    if incidents is None:
        # fetch_incidents returns None on failure and extract_incidents
        # passes it through; `data += None` would raise TypeError.
        log.warning("Skipping %s: download failed", url)
        continue
    data += incidents

df = generate_dataframe(data, columns)
save_incidents(df, out_filename)
df

Extracting 0 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-01_daily_incident_summary.pdf ...
Extracting 1 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-02_daily_incident_summary.pdf ...
Extracting 2 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-03_daily_incident_summary.pdf ...
Extracting 3 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-04_daily_incident_summary.pdf ...
Extracting 4 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-05_daily_incident_summary.pdf ...
Extracting 5 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-06_daily_incident_summary.pdf ...
Extracting 6 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-07_daily_incident_summary.pdf ...
Extracting 7 - https://www.normanok.gov/sites/default/files/documents/2022-02/2022-02-08_daily_incident_summary.pdf ...
Extracting 8 - https://www.normanok.gov/

Unnamed: 0,DateTime,IncidentNumber,IncidentORI
0,2022-02-01 00:04:00,2022-00001588,15300 E LINDSEY ST
1,2022-02-01 00:04:00,2022-00002065,15300 E LINDSEY ST
2,2022-02-01 00:04:00,2022-00005621,15300 E LINDSEY ST
3,2022-02-01 00:06:00,2022-00002066,901 N PORTER AVE
4,2022-02-01 00:18:00,2022-00005622,612 E ROCK CREEK RD
...,...,...,...
8873,2022-03-02 23:33:00,2022-00011589,3300 W MAIN ST
8874,2022-03-02 23:35:00,2022-00011590,3600 W MAIN ST
8875,2022-03-02 23:45:00,2022-00011592,1315 GARFIELD AVE
8876,2022-03-02 23:46:00,2022-00011591,2583 W MAIN ST
