In [None]:
""" 
code to scrape list of large public company bankruptcies from web page: 
"https://lopucki.law.ucla.edu/spreadsheet.htm"

the UCLA School of Law Bankruptcy Research Database (UCLA-LoPucki BRD)

where "large" includes companies with reported assets over $100MM (in 1980 dollars)

will re-visit amending this list to include all public company bankruptcies 2014-2020
"""

In [6]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine


In [7]:
# Page that renders the BRD spreadsheet scraped by this notebook.
url = 'https://lopucki.law.ucla.edu/spreadsheet.htm'

# requests applies no timeout by default, so an unresponsive server would hang
# the kernel indefinitely; cap the wait and fail loudly on any 4xx/5xx reply
# instead of silently parsing an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [8]:
response.status_code  # HTTP status of the fetch above; 200 = success

200

In [9]:
response.text[:1000]  # peek at the first 1000 characters of the returned HTML

'\r\n\t<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r\n\t<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US">\r\n\t<head profile="http://www.w3.org/2000/08/w3c-synd/#">\r\n\t\t<meta name="google-site-verification" content="ENvYTWN0tghPS-wK9HwpN4ctD30nq95ucnmAGb_bhdc" />\r\n\t\t<meta http-equiv="content-type" content="text/html; charset=utf-8" />\r\n\t\t<meta http-equiv="language" content="english" />\r\n\t\t<meta http-equiv="dialect" content="us" />\r\n\t\t<meta http-equiv="imagetoolbar" content="no" />\r\n\t\t<meta name="robots" content="all" />\r\n\t\t<meta name="author" content="Lynn M. LoPucki" />\r\n\t\t<meta name="owner" content="UCLA School of Law" />\r\n\t\t<meta name="description" content="BRD Spreadsheet | UCLA-LoPucki Bankruptcy Research Database (BRD)" />\r\n\t\t<meta name="copyright" content="Copyright 2005-2018 Lynn M. LoPucki and UCLA School of Law" />\r\n\t\t<title>UCLA-LoPucki Bankrup

In [10]:
# Decoded HTML body of the response.
page = response.text

# Name the parser explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# resulting tree can differ from machine to machine. 'html.parser' ships with
# Python and needs no extra install.
soup = BeautifulSoup(page, 'html.parser')

In [15]:
# Collect every <td> found under the element whose id is 'xyscrolling'.
scroll_container = soup.find(id='xyscrolling')
if scroll_container is None:
    # find() returns None when nothing matches; raise a readable error rather
    # than the AttributeError the chained .find_all() call would produce.
    raise ValueError(
        "No element with id='xyscrolling' found; the page layout may have changed."
    )
table = scroll_container.find_all('td')

In [34]:
# Debug aid: uncomment to inspect the first 250 raw <td> elements.
#table[:250]

In [17]:
# Reduce each <td> element to its text with surrounding whitespace removed.
contents_table = [cell.text.strip() for cell in table]

In [19]:
len(contents_table)  # total number of <td> cell strings collected

30200

In [27]:
# Fold the flat list of cell strings into rows of 25 fields each. The row count
# is inferred (-1) rather than hard-coded, so the notebook keeps working when
# the source table gains or loses records; reshape still raises ValueError if
# the cell count is not a multiple of 25.
data_array = np.array(contents_table).reshape(-1, 25)

In [28]:
data_array[1]  # spot-check one reshaped row (index 1)

array(['SleepMaster LLC', 'D: Manufacturing', '25 Furniture And Fixtures',
       'Not tort', '11/16/2001', 'confirmed', 'no 363 sale', '433', 'NJ',
       'DE', 'Wilmington', 'Forum Shop', 'debtors', 'free fall', '1700',
       'company did not emerge', 'not emerged 5 years', 'NJ Newark',
       '1/23/2003', 'c $500 million but less than $1billion',
       'c 1,000 but less than 10,000', '446795134.036077', '636',
       'Chapter 11', 'Pachulski Stang'], dtype='<U78')

In [29]:
# Wrap the reshaped array in a DataFrame, labelling its 25 columns in order.
brd_list = pd.DataFrame(
    data_array,
    columns=[
        'debtor_name',
        'industry_division',
        'industry_group',
        'cause',
        'date_filed',
        'disposition',
        '_363_sale',
        'duration',
        'inc_state',
        'district_filed',
        'city_filed',
        'forum_shopping',
        'filing_party',
        'plan_type',
        'employees',
        'business_survival',
        'refiled',
        'hq_city',
        'date_disposed',
        'asset_size',
        'employee_size',
        'annual_sales',
        'assets_$MM',
        'chapter_filing',
        'debtor_attorney',
    ],
)

In [35]:
brd_list.head()  # preview the first rows of the labelled frame

Unnamed: 0,debtor_name,industry_division,industry_group,cause,date_filed,disposition,_363_sale,duration,inc_state,district_filed,...,business_survival,refiled,hq_city,date_disposed,asset_size,employee_size,annual_sales,assets_$MM,chapter_filing,debtor_attorney
0,"Bush Industries, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,3/31/2004,confirmed,no 363 sale,212,DE,NY WD,...,Company emerged,"emerged 5 years, did not refile",NY Buffalo,10/29/2004,d Less than $500 million,"c 1,000 but less than 10,000",472578228.388474,431,Chapter 11,Hodgson Russ
1,SleepMaster LLC,D: Manufacturing,25 Furniture And Fixtures,Not tort,11/16/2001,confirmed,no 363 sale,433,NJ,DE,...,company did not emerge,not emerged 5 years,NJ Newark,1/23/2003,c $500 million but less than $1billion,"c 1,000 but less than 10,000",446795134.036077,636,Chapter 11,Pachulski Stang
2,"Panolam Industries International, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,11/4/2009,confirmed,no 363 sale,36,DE,DE,...,Company emerged,"emerged 5 years, did not refile",CT Bridgeport,12/10/2009,d Less than $500 million,"c 1,000 but less than 10,000",441804625.652174,497,Chapter 11,Weil Gotshal
3,"Falcon Products, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,1/31/2005,confirmed,no 363 sale,259,DE,MO ED,...,Company emerged,"emerged 5 years, did not refile",MO St. Louis,10/17/2005,d Less than $500 million,"c 1,000 but less than 10,000",344202663.869953,365,Chapter 11,Thompson Coburn
4,Gaylord Container Corp,D: Manufacturing,26 Paper and Allied Products,Not tort,9/11/1992,confirmed,no 363 sale,35,DE,LA ED,...,Company emerged,"emerged 5 years, did not refile",IL Chicago,10/16/1992,b $1 billion but less than $10 billion,"c 1,000 but less than 10,000",1334261013.44657,1780,Chapter 11,Kirkland Ellis


In [36]:
brd_list.shape  # (rows, columns) of the assembled frame

(1208, 25)

In [37]:
# Parse the two date columns from strings into datetimes so they can be
# filtered and compared as dates later on.
for date_col in ('date_filed', 'date_disposed'):
    brd_list[date_col] = pd.to_datetime(brd_list[date_col])

In [38]:
brd_list.head()  # preview again now that the date columns have been converted

Unnamed: 0,debtor_name,industry_division,industry_group,cause,date_filed,disposition,_363_sale,duration,inc_state,district_filed,...,business_survival,refiled,hq_city,date_disposed,asset_size,employee_size,annual_sales,assets_$MM,chapter_filing,debtor_attorney
0,"Bush Industries, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2004-03-31,confirmed,no 363 sale,212,DE,NY WD,...,Company emerged,"emerged 5 years, did not refile",NY Buffalo,2004-10-29,d Less than $500 million,"c 1,000 but less than 10,000",472578228.388474,431,Chapter 11,Hodgson Russ
1,SleepMaster LLC,D: Manufacturing,25 Furniture And Fixtures,Not tort,2001-11-16,confirmed,no 363 sale,433,NJ,DE,...,company did not emerge,not emerged 5 years,NJ Newark,2003-01-23,c $500 million but less than $1billion,"c 1,000 but less than 10,000",446795134.036077,636,Chapter 11,Pachulski Stang
2,"Panolam Industries International, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2009-11-04,confirmed,no 363 sale,36,DE,DE,...,Company emerged,"emerged 5 years, did not refile",CT Bridgeport,2009-12-10,d Less than $500 million,"c 1,000 but less than 10,000",441804625.652174,497,Chapter 11,Weil Gotshal
3,"Falcon Products, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2005-01-31,confirmed,no 363 sale,259,DE,MO ED,...,Company emerged,"emerged 5 years, did not refile",MO St. Louis,2005-10-17,d Less than $500 million,"c 1,000 but less than 10,000",344202663.869953,365,Chapter 11,Thompson Coburn
4,Gaylord Container Corp,D: Manufacturing,26 Paper and Allied Products,Not tort,1992-09-11,confirmed,no 363 sale,35,DE,LA ED,...,Company emerged,"emerged 5 years, did not refile",IL Chicago,1992-10-16,b $1 billion but less than $10 billion,"c 1,000 but less than 10,000",1334261013.44657,1780,Chapter 11,Kirkland Ellis


In [39]:
## add table to postgres db
# Take the connection URL from the BANKRUPTCY_DB_URL environment variable so
# real credentials can stay out of the notebook; the fallback is the original
# local-development database URL.
db_url = os.environ.get(
    'BANKRUPTCY_DB_URL',
    'postgresql://amybutler:localhost@localhost:5432/bankruptcy',
)
engine = create_engine(db_url)

# to_sql defaults to if_exists='fail', which raises ValueError once the table
# exists and therefore breaks every re-run of the notebook. 'replace' drops and
# recreates the table, making this cell safe to execute repeatedly.
brd_list.to_sql('ucla_brd_list', engine, if_exists='replace', index=False)

In [40]:
# Round-trip check: read the freshly written table back and preview it.
query = 'SELECT * FROM ucla_brd_list;'
df = pd.read_sql(sql=query, con=engine)

df.head()

Unnamed: 0,debtor_name,industry_division,industry_group,cause,date_filed,disposition,_363_sale,duration,inc_state,district_filed,...,business_survival,refiled,hq_city,date_disposed,asset_size,employee_size,annual_sales,assets_$MM,chapter_filing,debtor_attorney
0,"Bush Industries, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2004-03-31,confirmed,no 363 sale,212,DE,NY WD,...,Company emerged,"emerged 5 years, did not refile",NY Buffalo,2004-10-29,d Less than $500 million,"c 1,000 but less than 10,000",472578228.388474,431,Chapter 11,Hodgson Russ
1,SleepMaster LLC,D: Manufacturing,25 Furniture And Fixtures,Not tort,2001-11-16,confirmed,no 363 sale,433,NJ,DE,...,company did not emerge,not emerged 5 years,NJ Newark,2003-01-23,c $500 million but less than $1billion,"c 1,000 but less than 10,000",446795134.036077,636,Chapter 11,Pachulski Stang
2,"Panolam Industries International, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2009-11-04,confirmed,no 363 sale,36,DE,DE,...,Company emerged,"emerged 5 years, did not refile",CT Bridgeport,2009-12-10,d Less than $500 million,"c 1,000 but less than 10,000",441804625.652174,497,Chapter 11,Weil Gotshal
3,"Falcon Products, Inc.",D: Manufacturing,25 Furniture And Fixtures,Not tort,2005-01-31,confirmed,no 363 sale,259,DE,MO ED,...,Company emerged,"emerged 5 years, did not refile",MO St. Louis,2005-10-17,d Less than $500 million,"c 1,000 but less than 10,000",344202663.869953,365,Chapter 11,Thompson Coburn
4,Gaylord Container Corp,D: Manufacturing,26 Paper and Allied Products,Not tort,1992-09-11,confirmed,no 363 sale,35,DE,LA ED,...,Company emerged,"emerged 5 years, did not refile",IL Chicago,1992-10-16,b $1 billion but less than $10 billion,"c 1,000 but less than 10,000",1334261013.44657,1780,Chapter 11,Kirkland Ellis
