## Import Libraries

In [1]:
# send requests and parse data
import requests
import json
from bs4 import BeautifulSoup
from lxml import etree


# data wrangling and organizing
import pandas as pd
import re

## Create Variables

In [2]:
# Endpoint of the report log; note that the query string already carries sfilter=0.
url = 'http://muop-mupdreports.missouri.edu/dclog.php?sfilter=0'

# Form fields submitted as the POST body.
data = dict(sfilter=0)

# Request headers: a browser User-Agent string and a form-encoded content type.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.58',
    'Content-Type': 'application/x-www-form-urlencoded',
}

In [3]:
# Accumulator for the parsed rows (one inner list per incident); it is turned
# into a DataFrame further down.
incident_list = list()

## Scrape Data

In [4]:
# POST the filter form to the report page.
# - timeout: requests applies no timeout by default, so without one this cell
#   can hang indefinitely if the server never answers.
# - raise_for_status(): stop the notebook on an HTTP error (4xx/5xx) instead of
#   letting an error page be parsed and exported as if it were data.
r = requests.post(url, headers=headers, data=data, timeout=30)
r.raise_for_status()

In [5]:
# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [6]:
# Serialize the BeautifulSoup tree back to markup and re-parse it with lxml,
# whose element tree supports the XPath query used to pull out the table rows.
normalized_markup = str(soup)
html = etree.HTML(normalized_markup)

In [7]:
# Every <tr> at any depth beneath a <div class="report_table">.
row_xpath = '//div[@class="report_table"]//tr'
incidents_raw = html.xpath(row_xpath)

## Parse Data

In [8]:
# Rows 0 and 1 are skipped (presumably header rows -- confirm against the page).
# From index 2 onward the rows alternate: every even position holds the
# incident's fields ...
info_rows = slice(2, None, 2)
# ... and the odd position that follows holds the time the incident occurred,
# which the page places in a <tr> of its own.
time_rows = slice(3, None, 2)

incidents_info_list = incidents_raw[info_rows]
time_occured_list = incidents_raw[time_rows]

In [9]:
# Build the row list from scratch in this cell. Assigning (rather than
# appending to a list created in an earlier cell) keeps the cell idempotent:
# re-running it no longer duplicates every incident.
#
# Each row is the text of every child of the info <tr>, followed by the text of
# the first grandchild of the paired "time occurred" <tr>. Elements are indexed
# directly instead of calling lxml's deprecated getchildren().
incident_list = [
    [column.text for column in incident] + [time_occured[0][0].text]
    for incident, time_occured in zip(incidents_info_list, time_occured_list)
]

## Convert to Dataframe and Export

In [10]:
# One DataFrame row per parsed incident; columns receive default integer labels.
df = pd.DataFrame(data=incident_list)

In [11]:
# Move column 7 (time_occured) so it sits directly after column 1 (time_reported).
# NOTE: this reassigns df using the integer labels, so re-running this cell after
# the columns have been renamed yields all-NaN columns; re-create df first.
column_order = [0, 1, 7, 2, 3, 4, 5, 6]
df = df.reindex(columns=column_order)

In [12]:
# Replace the integer column labels with descriptive names. The position of
# each name in this list is the integer label it replaces (0 through 7).
column_names = dict(enumerate([
    'case_number',
    'time_reported',
    'location_of_occurence',
    'domestic_relationship',
    'incident_type',
    'criminal_offense',
    'disposition',
    'time_occured',
]))
df = df.rename(columns=column_names)

In [13]:
# Write the table to disk without the DataFrame index column.
output_path = '../data/crime-log.csv'
df.to_csv(output_path, index=False)