### Final Project Requirements/notes: https://docs.google.com/document/d/1mwYbYJHkB7kpx4tNflKh54jN9_oOscw3p4k5fsmn3bc/edit

### Link with all Data: https://www.sec.gov/dera/data/financial-statement-and-notes-data-set.html
- using NUM file only for now (data set of all numeric XBRL facts presented on the primary financial statements)

In [1]:
import pandas as pd
q114numbers = pd.read_table('2014q1_notes/num.tsv', encoding ='latin1')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
#get details on all columns
q114numbers.describe(include='all')

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,iprx,value,footnote,footlen,dimn,coreg,durp,datp,dcml
count,8047269,8047269,8047269,8047269.0,8047269.0,8047269,8047269,7397212.0,7813542.0,32410,8047269.0,8047269.0,524781,7397212.0,7397212.0,7397212.0
unique,7161,364765,6984,,,3467,478982,,,28014,,,7561,,,
top,0001104659-14-010697,StockholdersEquity,us-gaap/2013,,,USD,0x00000000,,,Fair value based on forward NYMEX natural gas ...,,,ParentCompany,,,
freq,25851,96149,5645985,,,6919631,3318312,,,21,,,136050,,,
mean,,,,20124290.0,1.970625,,,0.0009592803,-26432710000.0,,0.8900422,0.8271273,,0.001627351,0.1064738,1737.75
std,,,,10920.64,3.300027,,,0.04284528,86587190000000.0,,23.06192,0.88178,,0.04504874,1.592975,10820.29
min,,,,19630630.0,0.0,,,0.0,-2.420141e+17,,0.0,0.0,,-0.4986305,-15.0,-32768.0
25%,,,,20121230.0,0.0,,,0.0,0.4,,0.0,0.0,,0.0,0.0,-3.0
50%,,,,20130330.0,1.0,,,0.0,1100000.0,,0.0,1.0,,0.0,0.0,-3.0
75%,,,,20131230.0,4.0,,,0.0,25276000.0,,0.0,1.0,,0.01095891,0.0,0.0


### From "Financial Statement and Notes Data Sets" Readme:
These fields comprise a unique compound key:

1) **adsh - EDGAR accession number**: a unique identifier assigned automatically to an accepted submission by the EDGAR Filer System; The first set of numbers (0001193125) is the CIK of the entity submitting the filing. The next 2 numbers (18) represent the year. The last series of numbers represent a sequential count of submitted filings from that CIK. The count is usually, but not always, reset to 0 at the start of each calendar year.
- **TODO**: separate these numbers to identify a company or a financial filing, there were 6,492 individual filings

2) **tag** - tag used by the filer 
- **TODO**: may have to separate out first word from tag to identify broader groups such as revenue

3) **version** – if a standard tag, the taxonomy of origin, otherwise equal to adsh.

4) **ddate** - period end date

5) **qtrs** - duration in number of quarters

6) **uom** - unit of measure

7) **dimh** - 16-byte dimensional qualifier

8) **iprx** - a sequential integer used to distinguish otherwise identical facts

9) **coreg** - If specified, indicates a specific co-registrant, the parent company, or other entity (e.g., guarantor).  NULL indicates the consolidated entity.  Note that this value is a function of the dimension segments.

10) **durp** - The difference between the reported fact duration and the quarter duration (qtrs), expressed as a fraction of 1.  For example, a fact with duration of 120 days rounded to a 91-day quarter has a durp value of 29/91 = +0.3187.

11) **datp** - The difference between the reported fact date and the month-end rounded date (ddate), expressed as a fraction of 1.  For example, a fact reported for 29/Dec, with ddate rounded to 31/Dec, has a datp value of minus 2/31 = -0.0645.
 
12) **dcml** - The value of the fact "decimals" attribute, with INF represented by 32767.

In [3]:
q114numbers.isnull().sum()

adsh              0
tag               0
version           0
ddate             0
qtrs              0
uom               0
dimh              0
iprx         650057
value        233727
footnote    8014859
footlen           0
dimn              0
coreg       7522488
durp         650057
datp         650057
dcml         650057
dtype: int64

#### A lot of null values for footnotes and coregistrants (majority of rows); will remove these columns for now

In [4]:
q114numbers = q114numbers.drop(columns=['footnote','coreg'])

In [5]:
q114numbers.isnull().sum() 

adsh            0
tag             0
version         0
ddate           0
qtrs            0
uom             0
dimh            0
iprx       650057
value      233727
footlen         0
dimn            0
durp       650057
datp       650057
dcml       650057
dtype: int64

In [6]:
q114numbers = q114numbers.drop(columns=['iprx','durp','datp','dcml'])

#### Didn't get rid of blank Value fields, so just get rid of those since there will still be enough data points to analyze

In [7]:
q114numbers = q114numbers.dropna()

In [8]:
q114numbers.describe(include='all') #still have 7million+ data points

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,value,footlen,dimn
count,7813542,7813542,7813542,7813542.0,7813542.0,7813542,7813542,7813542.0,7813542.0,7813542.0
unique,7161,363442,6982,,,3467,473091,,,
top,0001104659-14-010697,StockholdersEquity,us-gaap/2013,,,USD,0x00000000,,,
freq,25849,92920,5470777,,,6705184,3242279,,,
mean,,,,20124360.0,1.964117,,,-26432710000.0,0.9017542,0.8231845
std,,,,10627.55,3.290258,,,86587190000000.0,23.01161,0.8812625
min,,,,19630630.0,0.0,,,-2.420141e+17,0.0,0.0
25%,,,,20121230.0,0.0,,,0.4,0.0,0.0
50%,,,,20130330.0,1.0,,,1100000.0,0.0,1.0
75%,,,,20131230.0,4.0,,,25276000.0,0.0,1.0


In [9]:
#break out adsh to cik and filing number
# s = q114numbers['adsh'].str.split('-', n = 1, expand = True)

In [10]:
# s.head()

In [11]:
# q114numbers['entity_CIK'] = s[0]
# q114numbers['filing_number'] = s[1]
# q114numbers.head()

# System; The first set of numbers (0001193125) is the CIK of the entity submitting the filing. 
# The next 2 numbers (18) represent the year. 
# The last series of numbers represent a sequential count of submitted filings from that CIK. 
# The count is usually, but not always, reset to 0 at the start of each calendar year.

In [12]:
# q414numbers = q414numbers.drop(columns =["adsh"], inplace = True)
# q414numbers = q414numbers.to_frame()

In [13]:
#Valeant's numbers
valeant_10k = q114numbers.loc[q114numbers['adsh'] == '0000885590-14-000025']
valeant_USD = valeant_10k.loc[valeant_10k['uom']== 'USD']
valeant_USD

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,value,footlen,dimn
76663,0000885590-14-000025,ComprehensiveIncomeNetOfTax,us-gaap/2013,20111231,4,USD,0x00000000,-2.188930e+08,0,0
76664,0000885590-14-000025,ComprehensiveIncomeNetOfTax,us-gaap/2013,20121231,4,USD,0x00000000,4.419500e+07,0,0
76665,0000885590-14-000025,ComprehensiveIncomeNetOfTax,us-gaap/2013,20131231,4,USD,0x00000000,-8.795260e+08,0,0
76666,0000885590-14-000025,OtherNoncashIncomeExpense,us-gaap/2013,20111231,4,USD,0x00000000,1.841800e+07,0,0
76667,0000885590-14-000025,OtherNoncashIncomeExpense,us-gaap/2013,20121231,4,USD,0x00000000,3.369300e+07,0,0
76668,0000885590-14-000025,OtherNoncashIncomeExpense,us-gaap/2013,20131231,4,USD,0x00000000,-4.660000e+05,0,0
76669,0000885590-14-000025,Revenues,us-gaap/2013,20111231,4,USD,0x00000000,2.427450e+09,0,0
76670,0000885590-14-000025,Revenues,us-gaap/2013,20121231,4,USD,0x00000000,3.480376e+09,0,0
76671,0000885590-14-000025,Revenues,us-gaap/2013,20131231,4,USD,0x00000000,5.769605e+09,0,0
76695,0000885590-14-000025,DebtInstrumentUnamortizedDiscount,us-gaap/2013,20131231,0,USD,0x00000000,2.692340e+08,0,0


In [14]:
# all_valeant = q114numbers[q114numbers['adsh'].str.match('0000885590')]

# s.str.startswith('a', na=False)
# 0000885590

In [15]:
# len(all_valeant)

In [16]:
len(valeant_10k)

2886

In [17]:
valeant_nums = list(set(valeant_USD.value.abs()))
valeant_nums

[0.0,
 128000000.0,
 1.0,
 6144000.0,
 76800000.0,
 3200000000.0,
 24789000.0,
 9.59,
 10.54,
 11.68,
 455082000.0,
 6570000.0,
 18.97,
 19.57,
 20.76,
 19.71,
 22.85,
 20.42,
 284287000.0,
 25.07,
 19071000.0,
 25.42,
 24.0,
 30.19,
 30.47,
 2900000.0,
 33.43,
 34.11,
 28500000.0,
 38.13,
 39.74,
 39.35,
 39.11,
 43.4,
 44.4,
 44.0,
 29950000.0,
 50.56,
 51.06,
 51.86,
 110035000.0,
 7635000.0,
 59.03,
 59.15,
 8872000.0,
 5800000.0,
 364200000.0,
 69.35,
 262226000.0,
 80.47,
 84.01,
 85.0,
 14631000.0,
 219431000.0,
 88.73,
 91.12,
 93.6,
 46588000.0,
 8700000.0,
 59900000.0,
 384508000.0,
 102.22,
 104.21,
 879526000.0,
 2171000.0,
 56443000.0,
 336000.0,
 229712000.0,
 114000000.0,
 57893000.0,
 35365000.0,
 1041957000.0,
 117285000.0,
 140.55,
 136.68,
 100303000.0,
 973988000.0,
 1345700000.0,
 142500000.0,
 4911096000.0,
 171000000.0,
 205000.0,
 5325000.0,
 82338000.0,
 115319000.0,
 353100000.0,
 3270476000.0,
 20300000.0,
 547873000.0,
 15606000.0,
 35062000.0,
 56779000.0,


In [18]:
len(valeant_nums)

1890

In [35]:
def first_digit(number):
    return int(str(number)[0])    

In [36]:
benford_valeant = [first_digit(num) for num in valeant_nums] 
len(benford_valeant)

1890

In [37]:
benford_valeant

[0,
 1,
 1,
 6,
 7,
 3,
 2,
 9,
 1,
 1,
 4,
 6,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 3,
 3,
 2,
 3,
 3,
 2,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 2,
 5,
 5,
 5,
 1,
 7,
 5,
 5,
 8,
 5,
 3,
 6,
 2,
 8,
 8,
 8,
 1,
 2,
 8,
 9,
 9,
 4,
 8,
 5,
 3,
 1,
 1,
 8,
 2,
 5,
 3,
 2,
 1,
 5,
 3,
 1,
 1,
 1,
 1,
 1,
 9,
 1,
 1,
 4,
 1,
 2,
 5,
 8,
 1,
 3,
 3,
 2,
 5,
 1,
 3,
 5,
 9,
 2,
 1,
 2,
 1,
 3,
 2,
 1,
 5,
 2,
 7,
 1,
 4,
 3,
 3,
 3,
 3,
 6,
 5,
 7,
 3,
 5,
 2,
 3,
 1,
 1,
 9,
 1,
 1,
 3,
 9,
 3,
 1,
 1,
 1,
 3,
 8,
 9,
 1,
 1,
 6,
 1,
 8,
 3,
 1,
 4,
 1,
 8,
 1,
 1,
 2,
 3,
 3,
 2,
 1,
 4,
 4,
 1,
 2,
 4,
 2,
 1,
 2,
 1,
 1,
 1,
 9,
 4,
 2,
 1,
 9,
 3,
 2,
 2,
 8,
 7,
 6,
 5,
 1,
 1,
 1,
 6,
 6,
 3,
 4,
 1,
 3,
 5,
 1,
 1,
 1,
 1,
 6,
 9,
 5,
 8,
 1,
 1,
 8,
 1,
 2,
 1,
 2,
 6,
 1,
 8,
 7,
 3,
 1,
 2,
 2,
 2,
 1,
 7,
 6,
 3,
 3,
 5,
 2,
 1,
 4,
 2,
 1,
 3,
 6,
 3,
 1,
 9,
 3,
 6,
 2,
 2,
 2,
 2,
 1,
 4,
 1,
 2,
 1,
 3,
 4,
 8,
 2,
 3,
 9,
 1,
 1,
 2,
 3,
 1,
 1,
 2,
 9,
 2,
 1,
 1,
 1,
 9,


In [43]:
def frequencies(first_digits):
    counts = [0]*10
    for x in first_digits:
        if x > 0:
            counts[x] += 1 
    total = sum(counts)
    freq = [count/total for count in counts]
    print(sum(freq))
    return freq[1:] #same as going from element 1 through the end (i.e. 9)

In [44]:
frequencies(benford_valeant)

1.0000000000000002


[0.2791932059447983,
 0.18471337579617833,
 0.131104033970276,
 0.09554140127388536,
 0.08227176220806794,
 0.07537154989384288,
 0.05148619957537155,
 0.056794055201698515,
 0.0435244161358811]

In [40]:
import math
r = range(10)
benfords_law = [math.log10(1 + 1/digit) for digit in r if digit != 0]
benfords_law

[0.3010299956639812,
 0.17609125905568124,
 0.12493873660829993,
 0.09691001300805642,
 0.07918124604762482,
 0.06694678963061322,
 0.05799194697768673,
 0.05115252244738129,
 0.04575749056067514]

In [41]:
len(benford_valeant)

1890

In [42]:
import plotly

plotly.offline.init_notebook_mode(connected=True)
Benfords_Law = {'type': 'scatter', 'x': list(range(1, 10)), 'y': benfords_law}
Valeant = {'type': 'scatter', 'x': list(range(1, 10)), 'y': frequencies(benford_valeant)}


plotly.offline.iplot([Benfords_Law, Valeant])

In [27]:
import math
def kl_divergence(freq):
    kl_div = 0.0
    for d in range(1, 10):
        Q = (math.log(d+1) - math.log(d)) / math.log(10) #calculates theoretical benfords (perfect benny freqs)
        P = freq[d-1]
        kl_div += math.log(P / Q) * P
#         print((freq))

    return float(kl_div)

In [28]:
kl_divergence(frequencies(benford_valeant))

0.0024374317777151335

In [29]:
kl_divergence(benfords_law)

4.6614422691971963e-17

In [30]:
# q414numbers.join(s.apply(lambda x: x.split('-')))

In [31]:
# q414numbers= q414numbers.drop('adsh', axis=1).join(s.reset_index(drop=True, level=1).rename(['0'],['1'],['2']))


In [32]:
# q414numbers.head()

In [33]:
import numpy as np
import random
def p_value(freq):
    n = sum(freq)
    ps = [(math.log(d+1) - math.log(d)) / math.log(10) for d in range(1, 10)]

    ks_obs = freq[1:]

    def ll(ks): # log-likelihood
        z = random.sample(zip(ks, ps), 8)
        return sum([k * math.log(p) for (k, p) in z])

    N = 10000
    P = 0
    for i in range(N):
        ks = np.random.multinomial(n, ps)
        if ll(ks) > ll(ks_obs): P += 1

In [34]:
p_value(benford_valeant)

TypeError: Population must be a sequence or set.  For dicts, use list(d).

In [None]:
ks_obs = freq[1:]