In [None]:
"""
Filter the BRD list to date_filed between 1/1/2014 and 12/31/2020.

Many EINs were found to be missing from the SEC data, so the list of bankruptcies
was prepared to use the same names as the company_name field of the SEC data.

Add labeling of the target field (bankruptcy within 1 year of annual report = 1, else 0).


"""

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime

In [2]:
from pandasql import sqldf

# pandasql looks up the tables named in a query in the namespace it is handed,
# so give it this notebook's globals (namely, `df`).
def pysqldf(q):
    """Run the SQL string `q` through `sqldf` against the notebook's global variables."""
    return sqldf(q, globals())

In [3]:
# Connection settings for the Postgres database on this machine.
connection_args = dict(host='localhost', dbname='bankruptcy', port=5432)

# Open the connection once for the whole notebook, with autocommit switched on
# so no explicit commit() calls are needed.
connection = pg.connect(**connection_args)
connection.autocommit = True

In [4]:
# Load the BRD bankruptcy filings that fall inside the study window,
# 2014-01-01 through 2020-12-31 (both ends inclusive).
# The lower bound was previously '2015-01-01', which silently dropped every
# 2014 filing and disagreed with the 2014 start date used by the join query.
query = """
SELECT debtor_name, date_filed, chapter_filing 
FROM ucla_brd_list 
WHERE (date_filed >= '2014-01-01') and (date_filed  <= '2020-12-31')
;"""

brd_query =  pd_sql.read_sql(query, connection)

# Preview the first rows to confirm the date range and columns.
brd_query.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing
0,First Mariner Bancorp,2014-02-10,Chapter 11
1,Dolan Company,2014-03-23,Chapter 11
2,"Global Geophysical Services, Inc.",2014-03-25,Chapter 11
3,USEC Inc.,2014-03-05,Chapter 11
4,Coldwater Creek Inc.,2014-04-11,Chapter 11


In [5]:
# (rows, columns) of the filtered BRD list.
# The frame is named `brd_query`; `brd_2014` is not defined anywhere in this
# notebook and raised NameError on a fresh kernel.
brd_query.shape

(215, 3)

In [6]:
# Pull each distinct (company_name, ein) pair present in the SEC table.
query_sec = """
SELECT distinct company_name, ein
FROM sec_all_usd
"""
companies = pd_sql.read_sql(sql=query_sec, con=connection)

# Show a small sample of the company/EIN pairs.
companies.head(n=5)

Unnamed: 0,company_name,ein
0,NORTHSTAR ELECTRONICS INC,330803434
1,"MISTER GOODY, INC.",275414480
2,"LEVELBLOX, INC.",263748249
3,"THT HEAT TRANSFER TECHNOLOGY, INC.",205463509
4,"NUO THERAPEUTICS, INC.",233011702


In [7]:
# Spot check: fetch every SEC row stored under the exact name 'USEC INC'
# to see how a single company appears in sec_all_usd.
query_one = """
SELECT *
FROM sec_all_usd
WHERE company_name = 'USEC INC';
"""

one_query = pd_sql.read_sql(sql=query_one, con=connection)

# First few rows are enough to inspect the columns and values.
one_query.head(n=5)

Unnamed: 0,submission_number,company_name,measure_tag,period_end_date,value,units,number_of_quarters,version,central_index_key,ein,sic,fiscal_year_end,form,fiscal_year,fiscal_period_focus,date_filed,date_accepted
0,0001065059-14-000016,USEC INC,Assets,2011-12-31,3549300000.0,USD,0,us-gaap/2013,1065059,522107911,1400,1231,10-K,2013,FY,2014-03-31,2014-03-31 12:18:00 UTC
1,0001065059-14-000016,USEC INC,DeferredIncomeTaxesAndTaxCredits,2012-12-31,0.0,USD,4,us-gaap/2013,1065059,522107911,1400,1231,10-K,2013,FY,2014-03-31,2014-03-31 12:18:00 UTC
2,0001065059-14-000016,USEC INC,DeferredIncomeTaxesAndTaxCredits,2013-12-31,0.0,USD,4,us-gaap/2013,1065059,522107911,1400,1231,10-K,2013,FY,2014-03-31,2014-03-31 12:18:00 UTC
3,0001065059-14-000016,USEC INC,Depreciation,2011-12-31,42700000.0,USD,4,us-gaap/2013,1065059,522107911,1400,1231,10-K,2013,FY,2014-03-31,2014-03-31 12:18:00 UTC
4,0001065059-14-000016,USEC INC,Depreciation,2012-12-31,27000000.0,USD,4,us-gaap/2013,1065059,522107911,1400,1231,10-K,2013,FY,2014-03-31,2014-03-31 12:18:00 UTC


In [8]:
## First attempt: join the BRD list to the SEC data on upper-cased names.
# - LEFT JOIN keeps every debtor; `ein` comes back NULL when no SEC
#   company_name equals the debtor_name (case-insensitively).
# - DISTINCT collapses the many sec_all_usd rows a single company has.
# - The filing-date window is 2014-01-01 .. 2020-12-31, the same window used
#   for the BRD filter; the upper bound was previously missing here.

join_try1 = """
SELECT distinct a.debtor_name, a.date_filed, a.chapter_filing, b.ein
FROM ucla_brd_list as A
    LEFT JOIN sec_all_usd as B
ON UPPER(A.debtor_name) = UPPER(B.company_name)
WHERE a.date_filed >= '2014-01-01'
  AND a.date_filed <= '2020-12-31'
;"""

t1_join = pd_sql.read_sql(join_try1, connection)
t1_join.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,ein
0,"Cal Dive International, Inc.",2015-03-03,Chapter 11,611500501.0
1,"EV Energy Partners, L.P.",2018-04-02,Chapter 11,
2,SquareTwo Financial Corporation,2017-03-19,Chapter 11,
3,Patriot Coal Corporation (2015),2015-05-12,Chapter 11,
4,RCS Capital Corporation,2016-01-31,Chapter 11,


In [9]:
# (rows, columns) of the name-join result held in `t1_join`.
t1_join.shape

(231, 4)

In [10]:
# Per-column counts for the rows where the name join produced an EIN.
t1_join.dropna(subset=['ein']).count()

debtor_name       78
date_filed        78
chapter_filing    78
ein               78
dtype: int64

In [11]:
# Per-column counts for the rows that still have no EIN after the name join.
t1_join.loc[t1_join['ein'].isnull()].count()

debtor_name       153
date_filed        153
chapter_filing    153
ein                 0
dtype: int64

In [12]:
## Write t1_join to a spreadsheet so the EINs can be looked up and reviewed.
# index=False leaves the DataFrame's row index out of the file.

t1_join.to_excel(excel_writer='debtor_list_ein_review.xlsx', index=False)

In [13]:
# Query the SEC table to find EINs by a fragment of the company name.
# This lookup was run multiple times to fill in missing EINs where the
# names do not match exactly between the two data sources, so the search
# term is a function argument instead of a literal edited by hand each time.

# %(pattern)s is a bound parameter (psycopg2 "pyformat" style); the driver
# quotes the value, so the search term is never spliced into the SQL text.
query_co = """
SELECT distinct ein, company_name
FROM sec_all_usd
WHERE lower(company_name) LIKE %(pattern)s
;
"""


def lookup_ein_by_name(name_fragment):
    """Return the distinct (ein, company_name) rows whose name contains `name_fragment`.

    The match is case-insensitive: the fragment is lower-cased and compared
    against lower(company_name) with LIKE '%fragment%'.

    Parameters
    ----------
    name_fragment : str
        Part of a company name, e.g. 'melinta'. Any '%' or '_' inside it keeps
        its LIKE wildcard meaning.

    Returns
    -------
    pandas.DataFrame
        Columns `ein` and `company_name`, one row per distinct pair.

    Raises
    ------
    ValueError
        If `name_fragment` is empty or only whitespace, which would match
        every company in the table.
    """
    if not name_fragment or not name_fragment.strip():
        raise ValueError("name_fragment must be a non-empty string")
    pattern = '%' + name_fragment.lower() + '%'
    return pd_sql.read_sql(query_co, connection, params={'pattern': pattern})


find_ein = lookup_ein_by_name('melinta')

find_ein

Unnamed: 0,ein,company_name
0,454440364,"MELINTA THERAPEUTICS, INC. /NEW/"
