In [None]:
#This code scrapes the maryland case search database and send slack messages when new interesting cases are found. To be ran on a schedule.

In [2]:
#load needed libraries
import requests
import time
import json
import datetime
from datetime import timedelta
import pandas as pd
from bs4 import BeautifulSoup

In [33]:
#load in sensitive information from seperate key file
keys={}
with open("keys.json","r") as f:
    keys = json.loads(f.read())
   
    
slack_url = keys["url"]
codes = keys["codes"]
partyType = keys['partyType']
county = keys["county"]
site = keys['site']
company = keys['company']
courtSystem = keys["courtSystem"]

#Get an authenticated cookie for searches
def getCookie():
    session = requests.Session()
    url = 'http://casesearch.courts.state.md.us/casesearch/'
    params = {
            'disclaimer' : 'Y',
            'action' : 'Continue'
    }
    r = session.post(url, data=params)

    cook = session.cookies['JSESSIONID']

    time.sleep(1)

    return "JSESSIONID=" + cook
    
#search case search for single cases results by case number and return page
def getSingleCase(cookie, caseId):

    headers = {'Cookie': cookie}

    params = {
        'caseId' : caseId,
        'action': 'Get Case',
        'locationCode': 'B'
    }

    url = 'http://casesearch.courts.state.md.us/casesearch/inquiryByCaseNum.jis'
    r = requests.post(url, params=params, headers=headers)
    return r.text

#search casesearch for possible cases and return page
def getPage(cookie, page):
    headers = {'Cookie': cookie}

    today = datetime.datetime.today().strftime('%m/%d/%Y')
    yesterday = (datetime.datetime.today() - timedelta(1)).strftime('%m/%d/%Y')
    params = {
        'd-16544-p': page,
        'lastName': '%', 
        'firstName' : '',
        'middleName': '',  
        'partyType': partyType,
        'site': site,
        'courtSystem': courtSystem,
        'countyName': county,
        'filingStart': "1/22/2019",
        'filingEnd': "1/23/2019",
        'filingDate': '',
        'company': company,
        'action': 'Search',
    }

    url = 'http://casesearch.courts.state.md.us/casesearch/inquirySearch.jis'
    r = requests.post(url, params=params, headers=headers)
    time.sleep(1)
    return r.text

#Get charges for one individual cases
def getCharges(cookie, caseId):
    
    #data we will gather from individual case page
    charges = []
    cjiss = []
    text = getSingleCase(cookie, caseId)
    soup = BeautifulSoup(text)
    windows = soup.find_all("div", attrs={'class':'AltBodyWindow1'})
    for window in windows:
        tables = window.find_all("table")
        for table in tables:
            for row in table.findAll('tr'):
                cell = row.findNext('td')
                
                #get cjis number for each charge
                if cell.text == 'Charge No:':
                    target = cell.next_sibling
                    spans = target.find_all("span")
                    cjis = spans[2].text
                    cjiss.append(cjis)

                #get charge description for each charge
                if cell.text == 'Charge Description:':
                    target = cell.next_sibling
                    spans = target.find_all("span")
                    charge = spans[0].text              
                    charges.append(charge)
     
    charge_data = {"charge": charges, "cjis" : cjiss}
    return charge_data
    
#Run search and return information on all current cases  
def getCases(cookie):
    
    end_reached = False
    page = 1
    
    #data we will collect
    caseIds = []
    links = []
    names = []
    types = []
    dates = []
    
    #keep scraping until you have reached the last page of results
    while (end_reached == False):
        
        text = getPage(cookie, page)
        soup = BeautifulSoup(text)
        
        #test if last page reached
        banner = soup.find("span", attrs={'class':'pagebanner'}).text
        splits = banner.split(" ", 6)
        if (splits[0] == splits[6][:-1]):
            end_reached = True
        table = soup.find("table", attrs={'id':'row'})
        body = table.find("tbody")
        rows = body.find_all("tr")
        cases_on_page = len(rows)

        for row in rows:
            tds = row.find_all("td")
            caseType = tds[5].text
            if caseType == "CRSCA" or caseType == "CROVA":
                if (tds[0].find("a") != None):
                    links.append("http://casesearch.courts.state.md.us/casesearch/" + tds[0].find("a")['href'])
                    caseId = tds[0].find("a").text
                    caseIds.append(caseId)
                    names.append(tds[1].text)
                    types.append(caseType)
                    dates.append(tds[7].text)

        #create dataframe from gathered info
        cases = pd.DataFrame(
            {'caseId': caseIds,
             'name': names,
             "type": types,
             "date": dates,
             "link" : links
            })
        
        print("Scraping Page " + str(page))
        page = page+1

    print("Done Scraping")
    return cases

#Post message on slack if it is qualified. We check if the charges for each case are interesting here because it is time consuming
def send_alert(row, cookie):
    print("send alert")
    charges = ""
    charge_data = getCharges(cookie, row["caseId"])
    
    #build message text if qualified
    
    qualified = False
    for charge in charges:
        if charge in codes:
            qualified = True
            
    if qualified:
        charge_num = 1
        for c,j in zip(charge_data['charge'],charge_data['cjis']):
            if charges == "":
                charges = "\n1) " + c + " : " + j
            else:
                charges = charges + " \n" + str(charge_num) + ") " + c + " : " + j
            charge_num = charge_num + 1

        message = row['name'] + " - " + row['date'] + charges + " \n" + row['link'] +" \n-------------------------"
        slack_data = {'text': message}
        headers={'Content-Type': 'application/json'}
        url = slack_url
        r = requests.post(url, json=slack_data, headers=headers)

#Find new cases and post them on slack
def compare_cases(new_cases, cookie):
    
    #load cases from last search
    old_cases = pd.read_json('cases.json')
    
    print(str(len(new_cases)-len(old_cases)) + " New Cases")

    #see if any results are new and if they are post them on slack
    for index, row in new_cases.iterrows():
          if row["caseId"] not in old_cases['caseId'].unique():
            send_alert(row, cookie)

#     new_cases.to_json('cases.json')
  
#Run bot
def runBot():
    cookie = getCookie()
    cases = getCases(cookie)
    compare_cases(cases, cookie)

runBot()

75 items found, displaying 1 to 25.
Scraping Page 1
75 items found, displaying 26 to 50.
Scraping Page 2
75 items found, displaying 51 to 75.
Scraping Page 3
75 items found, displaying 51 to 75.
Scraping Page 4
75 items found, displaying 51 to 75.
Scraping Page 5
75 items found, displaying 51 to 75.
Scraping Page 6
75 items found, displaying 51 to 75.
Scraping Page 7
75 items found, displaying 51 to 75.
Scraping Page 8


KeyboardInterrupt: 

In [45]:
banner = "1001001 items found, displaying 51 to 1001001."
splits = banner.split(" ", 6)
print(splits)
if (splits[0] == splits[6][:-1]):
    print("done")

['1001001', 'items', 'found,', 'displaying', '51', 'to', '1001001.']
done
