In [None]:
"""

EDA of TRAINING and VALIDATION DATA
SEC filings 2014-2018

Downloaded from Google Cloud and saved in local Postgres database

- amount of missing data points in SEC data
- records reported in non-USD


"""

In [1]:
import psycopg2 as pg
import pandas.io.sql as pd_sql
import pandas as pd
import numpy as np
import datetime


In [2]:
# Connection settings for the local Postgres database holding the SEC data.
# No user or password is supplied in these arguments.
connection_args = dict(host='localhost', dbname='bankruptcy', port=5432)

connection = pg.connect(**connection_args)
# Autocommit: each statement is committed on its own rather than left in an open transaction.
connection.autocommit = True

In [3]:
# List the distinct measure tags present among USD records
# whose period end date falls before 2019-01-01.

query = """
SELECT distinct measure_tag
FROM sec_all_raw
WHERE  period_end_date < '2019-01-01' 
    and units = 'USD'

;
"""

sec_data = pd.read_sql(query, connection)
sec_data.head()

Unnamed: 0,measure_tag
0,NetIncomeLoss
1,RepaymentsOfLongTermDebt
2,NetCashProvidedByUsedInInvestingActivities
3,DeferredIncomeTaxLiabilitiesNet
4,OperatingIncomeLoss


In [4]:
# Number of distinct measure tags returned by the query above.
len(sec_data.index)

34

In [5]:
# Last five measure tags, to eyeball the end of the list.
sec_data.tail(5)

Unnamed: 0,measure_tag
29,LiabilitiesAndStockholdersEquity
30,Revenues
31,EarningsPerShareBasic
32,NetCashProvidedByUsedInOperatingActivities
33,GrossProfit


In [6]:
# Flip the single-column tag list on its side so the tags read across as columns.
all_columns = sec_data.T



In [7]:
# Display the transposed frame: a single 'measure_tag' row with one column per tag.
all_columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
measure_tag,NetIncomeLoss,RepaymentsOfLongTermDebt,NetCashProvidedByUsedInInvestingActivities,DeferredIncomeTaxLiabilitiesNet,OperatingIncomeLoss,LiabilitiesCurrent,TotalAsset,LongTermDebtMaturitiesRepaymentsOfPrincipalInN...,LongTermDebt,Assets,...,DeferredIncomeTaxExpenseBenefit,NetCashProvidedByUsedInFinancingActivities,Liabilities,CommonStockValue,LongTermDebtNoncurrent,LiabilitiesAndStockholdersEquity,Revenues,EarningsPerShareBasic,NetCashProvidedByUsedInOperatingActivities,GrossProfit


In [None]:
#########

In [8]:
# Identify companies whose filings contain values reported in a unit other than USD,
# together with the unit they used.
# NOTE(review): `units != 'USD'` will not match rows where units is NULL - confirm none exist.

query = """
SELECT distinct company_name, units
FROM sec_all_raw
WHERE  period_end_date < '2019-01-01' 
    and units != 'USD'

;
"""

sec_data = pd.read_sql(query, connection)
sec_data.head()


Unnamed: 0,company_name,units
0,ALGAE DYNAMICS CORP.,CAD
1,"ALTIMMUNE, INC.",EUR
2,AMC NETWORKS INC.,EUR
3,AMERICAN GRAPHITE TECHNOLOGIES INC.,CAD
4,BI-OPTIC VENTURES INC,CAD


In [9]:
# Number of distinct (company_name, units) pairs reported in a non-USD unit.
len(sec_data.index)

68

In [10]:
# Last five non-USD (company_name, units) pairs.
sec_data.tail(5)

Unnamed: 0,company_name,units
63,UNIVERSAL BIOSENSORS INC,AUD
64,VISCOUNT SYSTEMS INC,CAD
65,VYCOR MEDICAL INC,EUR
66,WABCO HOLDINGS INC.,EUR
67,"YOSEN GROUP, INC.",CNY


In [None]:
## After researching: non-U.S. public companies can and do file the 10-K.
## Although they are supposed to use a different SEC form (the 20-F),
## there doesn't seem to be a requirement that they not use the 10-K.

## Decision: remove the companies with non-USD units from the data set.

In [None]:
#################

In [11]:
# Revenue can be reported under a few different tags, so pull the candidate
# revenue/profit tags alongside Assets for USD records.
# More data may need to be pulled from Google Cloud if these tags are not enough.

query = """
SELECT company_name, period_end_date, measure_tag, value
FROM sec_all_raw
WHERE  period_end_date < '2019-01-01' 
    and units = 'USD'
    and measure_tag in ('ProfitLoss', 'Revenues', 'SalesRevenueNet', 'Assets', 'GrossProfit')

;
"""

sec_data = pd.read_sql(query, connection)
sec_data.head()

Unnamed: 0,company_name,period_end_date,measure_tag,value
0,"ADAMS RESOURCES & ENERGY, INC.",2011-12-31,Assets,378840000.0
1,"ADAMS RESOURCES & ENERGY, INC.",2012-12-31,Assets,419501000.0
2,"ADAMS RESOURCES & ENERGY, INC.",2013-12-31,Assets,448082000.0
3,"ADAMS RESOURCES & ENERGY, INC.",2011-12-31,Revenues,3038737000.0
4,"ADAMS RESOURCES & ENERGY, INC.",2012-03-31,Revenues,875905000.0


In [12]:
# Reshape long -> wide: one row per (company_name, period_end_date), one column per measure tag.
# aggfunc='mean' is pivot_table's default, spelled out here so the behavior is visible.
# NOTE(review): when the same company/period/tag occurs in more than one row, the values are
# averaged, which can produce a number that no filing actually reported - confirm this is intended.
pivoted = (
    sec_data
    .pivot_table(
        values='value',
        index=['company_name', 'period_end_date'],
        columns='measure_tag',
        aggfunc='mean',
    )
    .reset_index()
)

In [13]:
# Keep only rows whose Assets value is at least 100 million; rows with missing Assets drop out.
over_100 = pivoted.loc[pivoted['Assets'] >= 100_000_000]

In [14]:
# Number of (company, period) rows that pass the 100 million Assets threshold.
len(over_100.index)

28509

In [15]:
# Per-column non-null counts: shows how often each tag is populated in the filtered rows.
over_100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28509 entries, 4 to 110987
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   company_name     28509 non-null  object        
 1   period_end_date  28509 non-null  datetime64[ns]
 2   Assets           28509 non-null  float64       
 3   GrossProfit      11284 non-null  float64       
 4   ProfitLoss       13700 non-null  float64       
 5   Revenues         13871 non-null  float64       
 6   SalesRevenueNet  8204 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 1.7+ MB


In [16]:
# Spot check: which revenue-related tags does ALPHABET INC. populate?
over_100.loc[over_100['company_name'].eq('ALPHABET INC.')]

measure_tag,company_name,period_end_date,Assets,GrossProfit,ProfitLoss,Revenues,SalesRevenueNet
4639,ALPHABET INC.,2016-12-31,167497000000.0,,,90272000000.0,
4640,ALPHABET INC.,2017-12-31,197295000000.0,,,110855000000.0,
4641,ALPHABET INC.,2018-12-31,232792000000.0,,,,
4642,ALPHABET INC.,2014-12-31,129187000000.0,,,66001000000.0,
4643,ALPHABET INC.,2015-12-31,147461000000.0,,,74989000000.0,


In [17]:
# Spot check: which revenue-related tags does AMAZON COM INC populate?
over_100.loc[over_100['company_name'].eq('AMAZON COM INC')]

measure_tag,company_name,period_end_date,Assets,GrossProfit,ProfitLoss,Revenues,SalesRevenueNet
5106,AMAZON COM INC,2016-12-31,83402000000.0,,,,89864000000.0
5113,AMAZON COM INC,2017-12-31,131310000000.0,,,,119159500000.0
5114,AMAZON COM INC,2018-12-31,162648000000.0,,,,
5115,AMAZON COM INC,2014-12-31,54505000000.0,,,,65124000000.0
5116,AMAZON COM INC,2015-12-31,64979330000.0,,,,78502400000.0
5118,AMAZON COM INC,2013-12-31,40159000000.0,,,,54906000000.0
5120,AMAZON COM INC,2012-12-31,32555000000.0,,,,47818000000.0


In [18]:
# Rows where neither revenue tag (Revenues, SalesRevenueNet) has a value.
over_100[over_100[['Revenues', 'SalesRevenueNet']].isna().all(axis=1)]

measure_tag,company_name,period_end_date,Assets,GrossProfit,ProfitLoss,Revenues,SalesRevenueNet
21,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2018-12-31,1.479230e+08,,,,
33,"1895 BANCORP OF WISCONSIN, INC.",2017-12-31,4.683610e+08,,1659000.0,,
34,"1895 BANCORP OF WISCONSIN, INC.",2018-12-31,4.810990e+08,,-19000.0,,
38,"1ST CENTURY BANCSHARES, INC.",2014-12-31,5.852180e+08,,,,
39,"1ST CENTURY BANCSHARES, INC.",2015-12-31,7.319500e+08,,,,
...,...,...,...,...,...,...,...
110930,"ZULILY, INC.",2014-12-31,4.923780e+08,212756500.0,,,
110932,"ZULILY, INC.",2013-12-31,3.560870e+08,130883000.0,,,
110945,ZYMEWORKS INC.,2017-12-31,1.319550e+08,,-10406000.0,,
110946,ZYMEWORKS INC.,2018-12-31,2.443630e+08,,-36556000.0,,


In [None]:
###################

In [19]:
# Load the full wide table (every column) for periods ending before 2019-01-01,
# in order to examine companies with many blank fields.

query = """
SELECT *
FROM sec_wide_all
WHERE  period_end_date < '2019-01-01' 
;
"""

sec_data = pd.read_sql(query, connection)
sec_data.head()

Unnamed: 0,company_name,period_end_date,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,date_filed,...,OperatingIncomeLoss,ProfitLoss,RepaymentsOfLongTermDebt,Revenues,SalesRevenueNet,StockholdersEquity,TotalAsset,Industry,div_code,Division
0,"'MKTG, INC.'",2014-03-31 00:00:00.000000,0001019056-14-000881,886475,61340408,7310,331,2013,10-K,2014-06-27 00:00:00.000000,...,2830011.0,2293272.0,,,,12190322.0,,Services-Advertising,73,Services
1,"024 PHARMA, INC.",2015-12-31 00:00:00.000000,0001683168-17-000653,1307969,201862731,3089,1231,2016,10-K,2017-03-24 00:00:00.000000,...,,,,,,137994.0,,"Plastics Products, NEC",30,Manufacturing
2,"024 PHARMA, INC.",2016-12-31 00:00:00.000000,0001683168-17-000653,1307969,201862731,3089,1231,2016,10-K,2017-03-24 00:00:00.000000,...,,,,1079541.0,,759035.0,,"Plastics Products, NEC",30,Manufacturing
3,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0001437749-14-016921,1084869,113117311,5990,630,2014,10-K,2014-09-12 00:00:00.000000,...,23706000.0,14675000.0,,,756345000.0,183199000.0,,"Retail-Retail Stores, NEC",59,Retail Trade
4,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0001437749-15-017184,1084869,113117311,5990,630,2015,10-K,2015-09-11 00:00:00.000000,...,23706000.0,14675000.0,,,756345000.0,183228000.0,,"Retail-Retail Stores, NEC",59,Retail Trade


In [22]:
# List every column of the wide table: filing metadata first, then the measure-tag columns.
sec_data.columns

Index(['company_name', 'period_end_date', 'submission_number',
       'central_index_key', 'ein', 'sic', 'fiscal_year_end', 'fiscal_year',
       'form', 'date_filed', 'Assets', 'AssetsCurrent',
       'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'GrossProfit',
       'Liabilities', 'LiabilitiesAndStockholdersEquity', 'LiabilitiesCurrent',
       'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss

In [23]:
# Per-column non-null counts and dtypes for the wide table, to gauge how much data is missing.
sec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173062 entries, 0 to 173061
Data columns (total 47 columns):
 #   Column                                                         Non-Null Count   Dtype  
---  ------                                                         --------------   -----  
 0   company_name                                                   173062 non-null  object 
 1   period_end_date                                                173062 non-null  object 
 2   submission_number                                              173062 non-null  object 
 3   central_index_key                                              173062 non-null  object 
 4   ein                                                            173062 non-null  object 
 5   sic                                                            173062 non-null  object 
 6   fiscal_year_end                                                173062 non-null  int64  
 7   fiscal_year                                    

In [24]:
# Companies with at least one row where "Assets" is blank, to determine whether
# they report total assets under another field instead.
# Note: a company is listed if ANY of its rows lacks Assets, not only if all of them do.
sec_data.loc[sec_data['Assets'].isna(), 'company_name'].unique()

array(['024 PHARMA, INC.', '1 800 FLOWERS COM INC', '12 RETECH CORP', ...,
       'ZYNEX INC', 'ZYNGA INC', 'ZZLL INFORMATION TECHNOLOGY, INC'],
      dtype=object)

In [None]:
# Dump every ZYNGA INC row, ordered by period end date, to Excel for manual review.
examine_one = (
    sec_data
    .loc[sec_data['company_name'].eq('ZYNGA INC')]
    .sort_values('period_end_date')
)
examine_one.to_excel('examine_one_co.xlsx', index=False)

# Finding: ZYNGA keeps re-submitting the annual report for a previous period end date.
# The re-submissions do not carry different data, just less data than originally submitted.

In [None]:
# Second company to examine: dump every ZZLL INFORMATION TECHNOLOGY, INC row,
# ordered by period end date, to Excel for manual review.
examine_two = (
    sec_data
    .loc[sec_data['company_name'].eq('ZZLL INFORMATION TECHNOLOGY, INC')]
    .sort_values('period_end_date')
)
examine_two.to_excel('examine_two_co.xlsx', index=False)

# Finding: ZZLL submitted updated information.

In [None]:
# Third company to examine: dump every 12 RETECH CORP row,
# ordered by period end date, to Excel for manual review.
examine_3 = (
    sec_data
    .loc[sec_data['company_name'].eq('12 RETECH CORP')]
    .sort_values('period_end_date')
)
examine_3.to_excel('examine_3_co.xlsx', index=False)

# Finding: re-submits the prior period end the following year without submitting new information.
