# Converter for SEC Financial Statement Filings

In [121]:
from zipfile import ZipFile
import pandas as pd

def _readZipTable(archive, member):
    """Read one tab-separated member of an open ZipFile into a DataFrame."""
    with archive.open(member) as handle:
        return pd.read_table(handle,delimiter="\t")


# The quarterly archive bundles four tab-separated tables; load each into its own frame.
with ZipFile('importFiles/2010q1.zip') as quarterZip:
    dfNum = _readZipTable(quarterZip, 'num.txt')
    dfPre = _readZipTable(quarterZip, 'pre.txt')
    dfSub = _readZipTable(quarterZip, 'sub.txt')
    dfTag = _readZipTable(quarterZip, 'tag.txt')

# The ticker file has no header row, so the two column names are supplied here.
dfSym = pd.read_table('importFiles/ticker.txt', delimiter="\t", header=None, names=['symbol','cik'])

# DataFrame.size is rows * columns (total cell count), not the row count.
for loadedFrame in (dfNum, dfPre, dfSub, dfTag, dfSym):
    print(loadedFrame.size)

1365228
883780
17820
100377
24168


In [122]:
from tabulate import tabulate

# Preview the first 100 rows of the num.txt table as a psql-style text grid.
numPreview = dfNum.head(100)
print(tabulate(numPreview, headers='keys', tablefmt='psql'))

+----+----------------------+---------------------------------------------+--------------+--------------------------------------------+----------+--------+-------+-------------+----------------------------+
|    | adsh                 | tag                                         | version      | coreg                                      |    ddate |   qtrs | uom   |       value | footnote                   |
|----+----------------------+---------------------------------------------+--------------+--------------------------------------------+----------+--------+-------+-------------+----------------------------|
|  0 | 0001104659-10-010697 | AccountsAndNotesReceivableNet               | us-gaap/2009 | nan                                        | 20090131 |      0 | USD   | 3.4754e+09  | nan                        |
|  1 | 0001104659-10-010697 | AccountsAndNotesReceivableNet               | us-gaap/2009 | nan                                        | 20091031 |      0 | USD   | 2.6169e+

In [123]:
# Preview the first 100 rows of the pre.txt table as a psql-style text grid.
prePreview = dfPre.head(100)
print(tabulate(prePreview, headers='keys', tablefmt='psql'))

+----+----------------------+----------+--------+--------+---------+---------+---------------------------------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------+------------+
|    | adsh                 |   report |   line | stmt   |   inpth | rfile   | tag                                               | version      | plabel                                                                                                                               |   negating |
|----+----------------------+----------+--------+--------+---------+---------+---------------------------------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------+------------|
|  0 | 0000950123-10-018432 |        1 |     16 | BS     |       0 | X       | AccountsAndNotesReceivableNet          

In [124]:
# Preview the first 100 rows of the sub.txt table as a psql-style text grid.
subPreview = dfSub.head(100)
print(tabulate(subPreview, headers='keys', tablefmt='psql'))

+----+----------------------+---------+--------------------------------------------+-------+-------------+----------+----------------------+------------+------------------------------------------+--------------------------------+--------------------+-------------+----------+--------------------+------------+------------------------------------------+--------------------------------+--------------+-----------+-----------+--------------------------------------+---------------+-------+--------+-------+--------+----------+------+------+----------+-----------------------+-----------+----------+-------------------+---------+--------------------------------------------------------------------------------------------------------------------------+
|    | adsh                 |     cik | name                                       |   sic | countryba   | stprba   | cityba               | zipba      | bas1                                     | bas2                           | baph               |

In [125]:
# Preview the first 100 rows of the tag.txt table as a psql-style text grid.
tagPreview = dfTag.head(100)
print(tabulate(tagPreview, headers='keys', tablefmt='psql'))

+----+-------------------------------------------------------------------------------------------------------+--------------+----------+------------+------------+--------+--------+----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [126]:
# Preview the first 10 symbol -> cik rows of the ticker table.
symPreview = dfSym.head(10)
print(tabulate(symPreview, headers='keys', tablefmt='psql'))

+----+----------+---------+
|    | symbol   |     cik |
|----+----------+---------|
|  0 | aapl     |  320193 |
|  1 | msft     |  789019 |
|  2 | brk-b    | 1067983 |
|  3 | unh      |  731766 |
|  4 | jnj      |  200406 |
|  5 | v        | 1403161 |
|  6 | tsm      | 1046179 |
|  7 | xom      |   34088 |
|  8 | wmt      |  104169 |
|  9 | spy      |  884394 |
+----+----------+---------+


In [127]:
# Look at a single record and at the column names of the num.txt table.
firstNumRow = dfNum.iloc[0]
numColumnNames = dfNum.columns.values
print(firstNumRow)
print(numColumnNames)

adsh                 0001104659-10-010697
tag         AccountsAndNotesReceivableNet
version                      us-gaap/2009
coreg                                 NaN
ddate                            20090131
qtrs                                    0
uom                                   USD
value                        3475400000.0
footnote                              NaN
Name: 0, dtype: object
['adsh' 'tag' 'version' 'coreg' 'ddate' 'qtrs' 'uom' 'value' 'footnote']


In [128]:
from datetime import date
from marshmallow import Schema, fields

class FinancialElementImportDto:
    """One reported financial value of a filing, ready for export.

    The class attributes below are only defaults; the export loop overwrites
    them on each instance. The trailing comments name the source table and
    column each attribute is filled from.
    """
    label = ""          # Tag.doc
    concept = ""        # Num.tag
    info = ""           # Pre.plabel
    unit = ""           # Num.uom
    value = 0.0         # Num.value (the export loop casts the column to int before assigning)

class FinancialsDataDto:
    """Financial elements of one filing, bucketed by the Pre.stmt value they belong to.

    The three lists are created per instance. As class-level attributes they
    were shared by every instance, so appending elements for one filing also
    changed the data of every other filing.
    """

    def __init__(self):
        self.bs = []  # FinancialElementImportDto[] for Pre.stmt == 'BS'
        self.cf = []  # FinancialElementImportDto[] for Pre.stmt == 'CF'
        self.ic = []  # FinancialElementImportDto[] for Pre.stmt == 'IC'


class SymbolFinancialsDto:
    """Header data of one filing plus its financial elements.

    Scalar attributes keep their class-level defaults (they are immutable and
    get overwritten per instance). ``data`` is mutable and is therefore
    created in ``__init__`` so that every instance owns its own container.
    """
    # NOTE: date.today() is evaluated once, when the class body runs.
    startDate = date.today()   #Sub.period
    endDate = date.today()     #Sub.period + Sub.fp
    year = 0                   #Sub.fy
    quarter = ""               #Sub.fp
    symbol = ""                #Sub.cik -> Sym.cik -> Sym.symbol
    name = ""                  #Sub.name
    country = ""               #Sub.countryma
    city = ""                  #Sub.cityma

    def __init__(self):
        # A fresh container per filing; never share one across instances.
        self.data = FinancialsDataDto()
    
class FinancialElementImportSchema(Schema):
    """marshmallow schema that serializes a FinancialElementImportDto.

    The field names mirror the DTO attribute names one-to-one.
    """
    label = fields.String()
    concept = fields.String()
    info = fields.String()
    unit = fields.String()
    # Declared as an integer although the DTO default is 0.0; the export loop
    # casts the value column to int before filling the DTO.
    value = fields.Int()
    
class FinancialsDataSchema(Schema):
    """marshmallow schema for FinancialsDataDto: three lists of nested financial elements."""
    bs = fields.List(fields.Nested(FinancialElementImportSchema()))
    cf = fields.List(fields.Nested(FinancialElementImportSchema()))
    ic = fields.List(fields.Nested(FinancialElementImportSchema()))
    
class SymbolFinancialsSchema(Schema):
    """marshmallow schema that serializes a SymbolFinancialsDto, including its nested data.

    ``startDate`` and ``endDate`` are declared as ``Date`` because the DTO
    stores ``datetime.date`` objects (no time component). The serialized
    text stays an ISO date such as ``2010-01-31``.
    """
    startDate = fields.Date()
    endDate = fields.Date()
    year = fields.Int()
    quarter = fields.String()
    symbol = fields.String()
    name = fields.String()
    country = fields.String()
    city = fields.String()
    data = fields.Nested(FinancialsDataSchema)

# Create one SymbolFinancialsDto per filing and export it as JSON

In [None]:
#json_string = json.dumps([ob.__dict__ for ob in list_name])
#rslt_df = dataframe[dataframe['Percentage'] > 80]
import numpy as np
from pprint import pprint
import os

def npInt_to_str(var):
    """Render a scalar or array as comma-separated text without surrounding brackets.

    The input is flattened in row-major order, so a scalar yields just its
    own text (``20100131`` -> ``"20100131"``) and an array yields
    ``"1, 2, 3"``.

    The values are converted to plain Python objects with ``tolist()`` before
    formatting. Formatting NumPy scalars directly depends on the NumPy
    version: from NumPy 2 on, their repr is ``np.int64(20100131)``, which
    would end up verbatim in the result.
    """
    flatValues = np.asarray(var).ravel().tolist()
    # str() of a list is "[a, b, c]"; strip the two bracket characters.
    return str(flatValues)[1:-1]

def formatDateNpNum(var):
    """Format a YYYYMMDD number as 'YYYY-MM-DD' text.

    The number is turned into text with npInt_to_str and then cut into its
    first 4, next 2 and next 2 characters; anything after the 8th character
    is ignored.
    """
    digits = npInt_to_str(var)
    yearPart, monthPart, dayPart = digits[:4], digits[4:6], digits[6:8]
    return "-".join((yearPart, monthPart, dayPart))

# ---------------------------------------------------------------------------
# Export one JSON document per filing (row of dfSub) to
# exportFiles/<dirname>/<adsh>.json.
#
# Everything that does not depend on the current filing is prepared once,
# before the loop.
# ---------------------------------------------------------------------------
dirname = "2010q1"
exportDir = "exportFiles/"+dirname
os.makedirs(exportDir, exist_ok=True)

# Clean the value column once instead of once per filing. Non-numeric values
# are dropped and the rest is cast to an explicit 64-bit integer (a plain
# `int` is only 32 bits wide on some platforms).
# NOTE(review): the cast truncates fractional values, matching the Int field
# of FinancialElementImportSchema -- confirm that no decimals are needed.
cleanValues = pd.to_numeric(dfNum['value'], errors='coerce')
dfNum = dfNum.assign(value=cleanValues).dropna(subset=['value'])
dfNum = dfNum.astype({'value': 'int64'})

# cik -> upper-cased symbol. If a cik has several symbols the first one wins;
# a cik without a symbol maps to "" in the loop below.
symRows = dfSym.dropna(subset=['symbol']).drop_duplicates(subset='cik')
symbolByCik = dict(zip(symRows['cik'], symRows['symbol'].astype(str).str.upper()))

# tag -> documentation text; the first row carrying a doc wins.
# NOTE(review): the lookup uses the tag name only, as before. If dfTag also
# has a version column, matching on it too would be more precise -- TODO confirm.
tagRows = dfTag.dropna(subset=['doc']).drop_duplicates(subset='tag')
docByTag = dict(zip(tagRows['tag'], tagRows['doc'].astype(str)))

exportSchema = SymbolFinancialsSchema()

for subId, (_, submitted) in enumerate(dfSub.iterrows()):
    adsh = submitted["adsh"]

    sfDto = SymbolFinancialsDto()
    sfDto.startDate = date.fromisoformat(formatDateNpNum(submitted["period"]))
    sfDto.endDate = date.today() #TODO calc end date
    # `x == np.nan` is always False; pd.isna is the reliable missing-value test.
    fiscalYear = submitted["fy"]
    sfDto.year = 0 if pd.isna(fiscalYear) else int(fiscalYear)
    sfDto.quarter = submitted["fp"]
    sfDto.symbol = symbolByCik.get(submitted["cik"], "")
    sfDto.name = submitted["name"]
    sfDto.country = submitted["countryma"]
    sfDto.city = submitted["cityma"]

    # Give every filing its own container with its own lists, so that no
    # elements leak from one exported filing into the next.
    sdDto = FinancialsDataDto()
    sdDto.bs, sdDto.cf, sdDto.ic = [], [], []
    sfDto.data = sdDto
    statementLists = {'BS': sdDto.bs, 'CF': sdDto.cf, 'IC': sdDto.ic}

    # Presentation rows of this filing, indexed by tag: the first non-missing
    # label, and the set of statements the tag is presented in.
    filteredDfPre = dfPre[dfPre['adsh'] == adsh]
    plabelByTag = {}
    stmtsByTag = {}
    for preTag, plabel, stmt in zip(filteredDfPre['tag'],
                                    filteredDfPre['plabel'],
                                    filteredDfPre['stmt'].str.strip()):
        if pd.notna(plabel):
            plabelByTag.setdefault(preTag, str(plabel))
        stmtsByTag.setdefault(preTag, set()).add(stmt)

    filteredDfNum = dfNum[dfNum['adsh'] == adsh]

    for numTag, uom, value in zip(filteredDfNum['tag'],
                                  filteredDfNum['uom'],
                                  filteredDfNum['value']):
        myDto = FinancialElementImportDto()
        myDto.label = docByTag.get(numTag, "")
        myDto.concept = numTag
        myDto.info = plabelByTag.get(numTag, "")
        myDto.unit = uom
        # The column was cleaned above, so the value is never missing here.
        myDto.value = int(value)
        # A tag can be presented on several rows or statements; add the
        # element to every BS/CF/IC statement it appears in.
        for stmt in stmtsByTag.get(numTag, ()):
            if stmt in statementLists:
                statementLists[stmt].append(myDto)

    print(len(filteredDfNum))

    # dumps() produces real JSON; str() of the dumped dict would be a Python
    # repr (single quotes) that JSON parsers reject.
    jsonPath = exportDir+"/"+adsh+".json"
    with open(jsonPath, "w", encoding="utf-8") as json_file:
        json_file.write(exportSchema.dumps(sfDto))
    print("file "+str(subId+1)+" exportFiles/"+dirname+"/"+adsh+".json stored.")

288
file 1 exportFiles/2010q1/0001193125-10-072854.json stored.
453
file 2 exportFiles/2010q1/0001193125-10-072909.json stored.
271
file 3 exportFiles/2010q1/0001193125-10-073247.json stored.
312
file 4 exportFiles/2010q1/0000060667-10-000064.json stored.
263
file 5 exportFiles/2010q1/0000950123-10-029721.json stored.
270
file 6 exportFiles/2010q1/0000950123-10-029809.json stored.
249
file 7 exportFiles/2010q1/0000950123-10-029845.json stored.
250
file 8 exportFiles/2010q1/0000950123-10-030079.json stored.
245
file 9 exportFiles/2010q1/0000950123-10-030164.json stored.
305
file 10 exportFiles/2010q1/0001104659-10-017258.json stored.
251
file 11 exportFiles/2010q1/0001193125-10-071527.json stored.
313
file 12 exportFiles/2010q1/0001193125-10-071652.json stored.
227
file 13 exportFiles/2010q1/0000104207-10-000039.json stored.
264
file 14 exportFiles/2010q1/0000950123-10-029090.json stored.
244
file 15 exportFiles/2010q1/0001104659-10-017012.json stored.
266
file 16 exportFiles/2010q1/000

361
file 127 exportFiles/2010q1/0000950103-10-000520.json stored.
225
file 128 exportFiles/2010q1/0000950123-10-017254.json stored.
261
file 129 exportFiles/2010q1/0000950123-10-017258.json stored.
355
file 130 exportFiles/2010q1/0000950123-10-017267.json stored.
373
file 131 exportFiles/2010q1/0000950123-10-017377.json stored.
285
file 132 exportFiles/2010q1/0000950123-10-017583.json stored.
316
file 133 exportFiles/2010q1/0000950123-10-017631.json stored.
267
file 134 exportFiles/2010q1/0000950123-10-017647.json stored.
264
file 135 exportFiles/2010q1/0000950123-10-017728.json stored.
573
file 136 exportFiles/2010q1/0000950123-10-017776.json stored.
280
file 137 exportFiles/2010q1/0000950123-10-017817.json stored.
436
file 138 exportFiles/2010q1/0000950123-10-017877.json stored.
318
file 139 exportFiles/2010q1/0000950123-10-017899.json stored.
345
file 140 exportFiles/2010q1/0000950123-10-017949.json stored.
325
file 141 exportFiles/2010q1/0000950123-10-018097.json stored.
295
file 1

293
file 252 exportFiles/2010q1/0001047469-10-001402.json stored.
786
file 253 exportFiles/2010q1/0001104659-10-009750.json stored.
380
file 254 exportFiles/2010q1/0001140361-10-008295.json stored.
199
file 255 exportFiles/2010q1/0001140361-10-008522.json stored.
258
file 256 exportFiles/2010q1/0001157523-10-001218.json stored.
292
file 257 exportFiles/2010q1/0001193125-10-039749.json stored.
274
file 258 exportFiles/2010q1/0001193125-10-039864.json stored.
264
file 259 exportFiles/2010q1/0001193125-10-040106.json stored.
488
file 260 exportFiles/2010q1/0001193125-10-040142.json stored.
469
file 261 exportFiles/2010q1/0001193125-10-040148.json stored.
296
file 262 exportFiles/2010q1/0001193125-10-040175.json stored.
288
file 263 exportFiles/2010q1/0001193125-10-040348.json stored.
328
file 264 exportFiles/2010q1/0001193125-10-040444.json stored.
221
file 265 exportFiles/2010q1/0001193125-10-040500.json stored.
386
file 266 exportFiles/2010q1/0001193125-10-040508.json stored.
231
file 2