# Massachusetts Eviction Data and Housing Court Statistics

In [18]:
# import code block
from selenium import webdriver
from bs4 import BeautifulSoup
import zipfile
import pandas as pd
from io import StringIO
import requests
import csv
import time
import random
import matplotlib.pyplot as plt
from urllib.request import urlopen
import re

## Get all the links for dates and store them in a list

In [4]:
html = urlopen('https://masslandlords.net/policy/eviction-data/?ct=t%28Event+Webinar+2021-01-22+T-21+v1%29')
bs = BeautifulSoup(html, 'html.parser')

# collect all of the dates urls into one list
urls = bs.find_all('a',{'href':re.compile('https://masslandlords.net/policy/eviction-data/filings-week-ending-*')})

# clean urls of unnecessary attribute data - only need href
urls = [url['href'] for url in urls]

# we want the urls for the first 6 weeks of data, let's verify we have them by printing them out
# Should be weeks 10/24 to 11/28 i n2020
print(urls[0:6])

['https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-10-24/', 'https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-10-31/', 'https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-11-07/', 'https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-11-14/', 'https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-11-21/', 'https://masslandlords.net/policy/eviction-data/filings-week-ending-2020-11-28/']


## Define some functions so we don't have to rewrite code to scrape each week data

In [33]:
def getSoup(url):
    """ param: url of site to get soup for. Returns soup (AKA text) for site url"""
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    soup = bs.find('section',id='main-content').find('p', class_='monospace').get_text()
    return soup

def getSections(soup):
    """param: soup/text for url. Returns text divided into sections for easier conversion to tables"""
    # page is split into 5 sections separted by '--'
    a,b,c,d,e,f =soup.split('--')
    # section c has many different tables within, not split by '--' but rather by newlines. 
    # we'll clean it up using a regular expression (regex) and then split it into 11 sections to better disect each
    # table individually 
    # remove beginning and ending newlines
    reg = re.compile('^\r\n')
    c = reg.sub('',c,count=1)
    reg2 = re.compile('\r\n\r\n\r\n$')
    c=reg2.sub('',c,count=1)
    c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11 = c.split("\r\n\r\n\r\n")
    # we'll return all these variables
    return a,b,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,d,e,f

def convertToDf(htmlTable,cNumSection):
    """param: html table (aka one of the sections divided aboce), and a cNumSection param which is an int indicating
       which section of c (if any) we are converting. This helps us define our indexes for our table.
       returns table as pandas dataframe 
    """
    # set default start values
    startIndex = 0
    columnIndex = 0
    
    if cNumSection in [1,2,4,10,11]:
        startIndex = 2
        columnIndex = 1
    elif cNumSection in [3,5,8]:
        startIndex = 2
        columnIndex = 0
    elif cNumSection in [7,9]:
        startIndex = 1
        columnIndex = 0
    elif cNumSection == 6:
        startIndex = 4
        columnIndex = 3
    else:
        startIndex = 0
        columnIndex = 0
    
    # remove newline at the beginning of the table "block"
    reg = re.compile('^\r\n')
    htmlTable = reg.sub('',htmlTable,count=1)

    # split the table into separate columns
    htmlTable = htmlTable.split('\r\n')
    for index in range(0,len(htmlTable)):
        htmlTable[index] = re.split('  +',htmlTable[index])

    # load the data into a dataframe
    df = pd.DataFrame(htmlTable[startIndex:],columns=htmlTable[columnIndex])
    return df

## Get data for week of 10/24/2020

In [15]:
soup = getSoup(urls[0])
print(soup)



Residential summary process: Filings Report
This report examines cases recently filed, for which outcomes remain largely unknown.
Search Period Start: 2020-10-18
Search Period End:   2020-10-24
Earliest Case:       2020-10-19
Latest Case:         2020-10-23
Total Cases:                 49
Total Transfers:              7

--
High-level take-aways:
Percentage of landlords for whom attorney is optional: 65.3%
Of those, percentage pro se:                           62.5%

Most common cause:                               Cause
Least stable municipality/neighborhood: Vineyard Haven
Least stable with 10+ filings:                     n/a
Least stable with 100+ filings:                    n/a
Least stable county:                             Dukes

The number of filings this period is statistically significantly below the pre-pandemic housing crisis baseline.
--

Courts
(n)                   Count   Percent
northeast                16     32.7%
southeast             

In [16]:
a,b,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,d,e,f = getSections(soup)
print(c1)


Courts
(n)                   Count   Percent
northeast                16     32.7%
southeast                 8     16.3%
eastern                   6     12.2%
central                   6     12.2%
metro_south               6     12.2%
western                   5     10.2%
somerville district       1      2.0%
taunton district          1      2.0%


In [38]:
courts = convertToDf(c1,1)
partyTypeNum = convertToDf(c2,2)
partyTypePercent = convertToDf(c3,3)
plaintiffRepNum = convertToDf(c4,4)
plaintiffRepPercent = convertToDf(c5,5)
defendantRepNum = convertToDf(c6,6)
defendantRepPercent = convertToDf(c7,7)
numAdultsHouseholds = convertToDf(c8,8)
initiatingAction = convertToDf(c9,9)
ratePer100k = convertToDf(c10, 10)
ratePer100kRenters = convertToDf(c11,11)

# display data frames
display(courts)
display(partyTypeNum)
display(partyTypePercent)
display(plaintiffRepNum)
display(plaintiffRepPercent)
display(defendantRepNum)
display(defendantRepPercent)
display(numAdultsHouseholds)
display(initiatingAction)
display(ratePer100k)
display(ratePer100kRenters)


Unnamed: 0,(n),Count,Percent
0,northeast,16,32.7%
1,southeast,8,16.3%
2,eastern,6,12.2%
3,central,6,12.2%
4,metro_south,6,12.2%
5,western,5,10.2%
6,somerville district,1,2.0%
7,taunton district,1,2.0%


Unnamed: 0,(n),Plaintiffs,Defendants
0,Corporate Entity,17,0
1,Natural Person,32,49
2,Total,49,49


Unnamed: 0,(%),Plaintiffs,Defendants
0,Natural Person,65.3%,100.0%
1,Total,100.0%,100.0%


Unnamed: 0,(n),Has Attorney,Pro Se,Total
0,Required,16,1,17
1,Optional,12,20,32
2,Total,28,21,49


Unnamed: 0,(%),Has Attorney,Pro Se,Total
0,Optional,24.5%,40.8%,65.3%
1,Total,57.1%,42.9%,100.0%


Unnamed: 0,Required,0,0.1,0.2
0,Optional,5,44,49
1,Total,5,44,49


Unnamed: 0,(%),Has Attorney,Pro Se,Total
0,Required,0.0%,0.0%,0.0%
1,Optional,10.2%,89.8%,100.0%
2,Total,10.2%,89.8%,100.0%


Unnamed: 0,Number of Adults in Households,Count,Percent
0,2,10,20.4%
1,3,3,6.1%
2,Total,49,100.0%


Unnamed: 0,Initiating Action,Count,Percent
0,Cause,21,42.9%
1,Non-payment,18,36.7%
2,No Fault,5,10.2%
3,Foreclosure,3,6.1%
4,Other,2,4.1%


Unnamed: 0,Municipality,Residents,Count
0,Vineyard Haven,52.0,1.0
1,Osterville,28.0,1.0
2,East Falmouth,17.0,1.0
3,Topsfield,16.0,1.0
4,Shirley,13.0,1.0
5,Southbridge,11.0,2.0
6,Amesbury,6.0,1.0
7,Brockton,5.0,5.0
8,Newburyport,5.0,1.0
9,Pembroke,5.0,1.0


Unnamed: 0,County,Renter Households,Count
0,Dukes,24.0,1.0
1,Plymouth,20.0,9.0
2,Berkshire,12.0,2.0
3,Essex,11.0,13.0
4,Barnstable,10.0,2.0
5,Hampden,4.0,3.0
6,Middlesex,3.0,7.0
7,Worcester,3.0,4.0
8,Bristol,2.0,2.0
9,Suffolk,2.0,4.0


## TODO: Repeat above steps for all weeks and convert to csv