In [1]:
"""Load CSV files downloaded from the SEC public dataset
on Google Cloud Platform
into a Postgres db created on the local drive."""

'load downloaded CSV files into postgres db created in local drive'

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine


In [2]:
# The local Postgres database is named 'bankruptcy'.
# The connection URL can be overridden with the BANKRUPTCY_DB_URL environment
# variable, so real credentials never have to be saved in the notebook.
# The fallback is the original local-development URL.
DB_URL = os.environ.get(
    'BANKRUPTCY_DB_URL',
    'postgresql://amybutler:localhost@localhost:5432/bankruptcy',
)

engine = create_engine(DB_URL)

In [6]:
## add table with list of industry codes

# 'SIC Code' is read as a string column rather than being parsed as a number.
# NOTE(review): the preview of this table shows an empty 'Unnamed: 2' column,
# presumably from a trailing delimiter in the CSV - confirm whether it should
# be dropped before loading.
sic_codes = pd.read_csv('../data/SIC_codes.csv', dtype={'SIC Code': 'str'})

# if_exists='replace' keeps this cell re-runnable: with the pandas default
# ('fail'), to_sql raises ValueError once the sic_codes table already exists.
sic_codes.to_sql('sic_codes', engine, index=False, if_exists='replace')

In [7]:
# Read the sic_codes table back from the database and preview the first rows.
query = 'SELECT * FROM sic_codes;'
df = pd.read_sql(sql=query, con=engine)

df.head()

Unnamed: 0,SIC Code,Industry,Unnamed: 2
0,100,Agricultural Production-Crops,
1,200,Agricultural Prod-Livestock & Animal Specialties,
2,700,Agricultural Services,
3,800,Forestry,
4,900,"Fishing, Hunting and Trapping",


In [8]:
# add table with list of industry divisions

# 'div_code' is read as a string column rather than being parsed as a number.
sic_divisions = pd.read_csv('../data/SIC_divisions.csv', dtype={'div_code': 'str'})

# if_exists='replace' keeps this cell re-runnable: with the pandas default
# ('fail'), to_sql raises ValueError once the sic_divisions table already exists.
sic_divisions.to_sql('sic_divisions', engine, index=False, if_exists='replace')

In [9]:
# Read the sic_divisions table back from the database and preview the first rows.
query = 'SELECT * FROM sic_divisions;'
df = pd.read_sql(sql=query, con=engine)

df.head()

Unnamed: 0,div_code,Division
0,1,"Agriculture, Forestry and Fishing"
1,2,"Agriculture, Forestry and Fishing"
2,3,"Agriculture, Forestry and Fishing"
3,4,"Agriculture, Forestry and Fishing"
4,5,"Agriculture, Forestry and Fishing"


In [12]:
# Raw CSVs, one per year from 2014 through 2019, downloaded by querying the
# Google Cloud Platform SEC Public Dataset
# (SEC = U.S. Securities and Exchange Commission).

file1, file2, file3, file4, file5, file6 = [
    '../data/{}_SEC_10K_raw.csv'.format(year) for year in range(2014, 2020)
]

In [4]:
def read_sec_csv(path):
    """Read one raw SEC CSV export into a DataFrame.

    All yearly files are parsed with the same options, so the options are
    defined once here instead of being repeated for every file.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with ``sic``, ``ein`` and ``central_index_key`` read
        as strings and ``period_end_date`` and ``date_filed`` parsed as dates.
    """
    return pd.read_csv(
        path,
        dtype={'sic': 'str', 'ein': 'str', 'central_index_key': 'str'},
        parse_dates=['period_end_date', 'date_filed'],
    )


# One DataFrame per yearly file; the names are kept because later cells use them.
file1_df = read_sec_csv(file1)
file2_df = read_sec_csv(file2)
file3_df = read_sec_csv(file3)
file4_df = read_sec_csv(file4)
file5_df = read_sec_csv(file5)
file6_df = read_sec_csv(file6)

In [14]:
# Print the (rows, columns) shape of each yearly DataFrame, one line per file.
for label, frame in (
    ('file1: 2014', file1_df),
    ('file2: 2015', file2_df),
    ('file3: 2016', file3_df),
    ('file4: 2017', file4_df),
    ('file5: 2018', file5_df),
    ('file6: 2019', file6_df),
):
    print(label, frame.shape)

file1: 2014 (448772, 17)
file2: 2015 (434124, 17)
file3: 2016 (414026, 17)
file4: 2017 (355284, 17)
file5: 2018 (188761, 17)
file6: 2019 (18236, 17)


In [None]:

# The difference in the number of rows was later determined to be caused by many companies
# re-sending their data to the SEC whenever they send their most recent report.

# Also, 2019 Q4 is not available in Google Cloud; it is available as a download file from the SEC website.
# Those files will be prepared separately when preparing the test hold-out set.

In [16]:
# Write the 2014 DataFrame to its own table. if_exists='replace' keeps the cell
# re-runnable: with the pandas default ('fail'), to_sql raises ValueError once
# the table already exists.
file1_df.to_sql('sec_raw_2014', engine, index=False, if_exists='replace')

In [17]:
# Write each remaining yearly DataFrame to its own table. if_exists='replace'
# keeps the cell re-runnable: with the pandas default ('fail'), to_sql raises
# ValueError once a table already exists.
for table_name, frame in (
    ('sec_raw_2015', file2_df),
    ('sec_raw_2016', file3_df),
    ('sec_raw_2017', file4_df),
    ('sec_raw_2018', file5_df),
    ('sec_raw_2019', file6_df),
):
    frame.to_sql(table_name, engine, index=False, if_exists='replace')

In [None]:

"""
In a terminal, using psql connected to the local postgres db,
the 6 raw SEC data tables were combined with:


CREATE TABLE sec_raw_all as 
SELECT * FROM sec_raw_2014
UNION
SELECT * FROM sec_raw_2015
UNION
SELECT * FROM sec_raw_2016
UNION
SELECT * FROM sec_raw_2017
UNION
SELECT * FROM sec_raw_2018
UNION
SELECT * FROM sec_raw_2019;


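total rows = 1859118
(UNION, unlike UNION ALL, drops exact duplicate rows, so this is 85 fewer
than the 1859203 rows obtained by adding up the six file shapes printed above)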


"""