In [4]:
import requests
import time
import json
import datetime
import pandas as pd
from bs4 import BeautifulSoup

In [9]:
#load in sensitive information
keys={}
with open("keys.json","r") as f:
    keys = json.loads(f.read())
    
slack_url = keys["url"]

#Get an authenticated cookie for searches
def getCookie():
    session = requests.Session()
    url = 'http://casesearch.courts.state.md.us/casesearch/'
    params = {
            'disclaimer' : 'Y',
            'action' : 'Continue'
    }
    r = session.post(url, data=params)

    cook = session.cookies['JSESSIONID']

    time.sleep(1)

    return "JSESSIONID=" + cook
    
def getSingleCase(cookie, caseId):

    headers = {'Cookie': cookie}

    params = {
        'caseId' : caseId,
        'action': 'Get Case',
        'locationCode': 'B'
    }

    url = 'http://casesearch.courts.state.md.us/casesearch/inquiryByCaseNum.jis'
    r = requests.post(url, params=params, headers=headers)
    return r.text

#Run search for results
def getPage(cookie, page):
    headers = {'Cookie': cookie}

    params = {
        'd-16544-p': page,
        'lastName': '%', 
        'firstName' : '',
        'middleName': '',  
        'partyType': 'DEF',
        'site': 'CRIMINAL',
        'courtSystem': 'B',
        'countyName': 'ANNE ARUNDEL COUNTY',
        'filingStart': '1/14/2019',
        'filingEnd': '1/15/2019',
        'filingDate': '',
        'company': 'N',
        'action': 'Search',
    }

    url = 'http://casesearch.courts.state.md.us/casesearch/inquirySearch.jis'
    r = requests.post(url, params=params, headers=headers)
    time.sleep(1)
    return r.text

#Get charges for one individual cases
def getCharges(cookie, caseId):
    charges = []
    text = getSingleCase(cookie, caseId)
    soup = BeautifulSoup(text)
    windows = soup.find_all("div", attrs={'class':'AltBodyWindow1'})
    for window in windows:
        tables = window.find_all("table")
        for table in tables:
            for row in table.findAll('tr'):
                cell = row.findNext('td')
                if cell.text == 'Charge Description:':
                    target = cell.next_sibling
                    spans = target.find_all("span")
                    charge = spans[0].text
                            
                    charges.append(charge)
                if cell.text == 'Charge No:':
                    target = cell.next_sibling
                    spans = target.find_all("span")
                    if spans[1].text != "CJIS Code:":
                        charge = spans[2].text
                        charges.append(charge)

    return charges
    
#Run search and return information on all current cases  
def getCases(cookie):
    
    cases_on_page = 25
    page = 1
    
    caseIds = []
    links = []
    names = []
    types = []
    dates = []
    while (cases_on_page == 25):
        
        text = getPage(cookie, page)
        soup = BeautifulSoup(text)
        table = soup.find("table", attrs={'id':'row'})
        body = table.find("tbody")
        rows = body.find_all("tr")
        cases_on_page = len(rows)

        for row in rows:
            tds = row.find_all("td")
            caseType = tds[5].text
            if caseType == "CRSCA" or caseType == "CROVA":
                links.append("http://casesearch.courts.state.md.us/casesearch/" + tds[0].find("a")['href'])
                caseId = tds[0].find("a").text
                caseIds.append(caseId)
                names.append(tds[1].text)
                types.append(caseType)
                dates.append(tds[7].text)

        cases = pd.DataFrame(
            {'caseId': caseIds,
             'name': names,
             "type": types,
             "date": dates,
             "link" : links
            })
        
        print("Scraping Page " + str(page))
        page = page+1

    print("Done Scraping")
    return cases

#Post message on slack
def send_alert(row):
    print("send alert")
    charges = ""
    all_charges = getCharges(cookie, row["caseId"])
    charge_num = 1
    for c in all_charges:
        if charges == "":
            charges = "\n1) " + c
        else:
            charges = charges + " \n" + str(charge_num) + ") " + c
        charge_num = charge_num + 1
        
    message= row['name'] + " - " + row['date'] + charges + " \n" + row['link'] +" \ n-------------------------"
    slack_data = {'text': message}
    headers={'Content-Type': 'application/json'}
    url = slack_url
    r = requests.post(url, json=slack_data, headers=headers)

#Find new cases and post them on slack
def compare_cases(new_cases):
    print(len(new_cases))
    old_cases = pd.read_json('cases.json')
    
    print(str(len(new_cases)-len(old_cases)) + " New Cases")

    for index, row in new_cases.iterrows():

        if row["caseId"] not in old_cases['caseId'].unique():
            send_alert(row)

#     new_cases.to_json('cases.json')
  
#Run bot
def runBot():
    cookie = getCookie()
    cases = getCases(cookie)
    compare_cases(cases)

runBot()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Scraping Page 1
Scraping Page 2
Scraping Page 3
Scraping Page 4
Scraping Page 5
Scraping Page 6
Done Scraping
65
0 New Cases
