### Final Project Requirements/notes: https://docs.google.com/document/d/1mwYbYJHkB7kpx4tNflKh54jN9_oOscw3p4k5fsmn3bc/edit

### Link with all Data: https://www.sec.gov/dera/data/financial-statement-and-notes-data-set.html
- using NUM file only for now (data set of all numeric XBRL facts presented on the primary financial statements)

In [1]:
import pandas as pd
q114numbers = pd.read_table('2015q1_notes/num.tsv', encoding ='latin1')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
#get details on all columns
q114numbers.describe(include='all')

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,iprx,value,footnote,footlen,dimn,coreg,durp,datp,dcml
count,7861900,7861900,7861900,7861900.0,7861900.0,7861900,7861900,7234264.0,7741446.0,30005,7861900.0,7861900.0,478036,7234264.0,7234264.0,7234264.0
unique,6996,330113,6797,,,3379,489433,,,26228,,,7294,,,
top,0001507385-15-000030,StockholdersEquity,us-gaap/2014,,,USD,0x00000000,,,Fair value based on forward NYMEX natural gas ...,,,ParentCompany,,,
freq,30871,83056,5527133,,,6753253,3205314,,,21,,,135947,,,
mean,,,,20134720.0,1.829791,,,0.0008002196,6632962000.0,,0.8345829,0.8512652,,0.001982165,0.08077286,2386.459
std,,,,9058.241,2.279711,,,0.03582441,3207053000000.0,,21.50437,0.9051028,,0.03876579,1.697204,10354.19
min,,,,19681230.0,0.0,,,0.0,-7663837000000.0,,0.0,0.0,,-0.4986305,-15.0,-32768.0
25%,,,,20131230.0,0.0,,,0.0,0.361,,0.0,0.0,,0.0,0.0,-3.0
50%,,,,20140330.0,1.0,,,0.0,1200000.0,,0.0,1.0,,0.0,0.0,-3.0
75%,,,,20141230.0,4.0,,,0.0,27546000.0,,0.0,1.0,,0.01095891,0.0,0.0


### From "Financial Statement and Notes Data Sets" Readme:
These fields comprise a unique compound key:

1) **adsh - EDGAR accession number**: a unique identifier assigned automatically to an accepted submission by the EDGAR Filer System; The first set of numbers (0001193125) is the CIK of the entity submitting the filing. The next 2 numbers (18) represent the year. The last series of numbers represent a sequential count of submitted filings from that CIK. The count is usually, but not always, reset to 0 at the start of each calendar year.
- **TODO**: separate these numbers to identify a company or a financial filing, there were 6,492 individual filings

2) **tag** - tag used by the filer 
- **TODO**: may have to separate out first word from tag to identify broader groups such as revenue

3) **version** – if a standard tag, the taxonomy of origin, otherwise equal to adsh.

4) **ddate** - period end date

5) **qtrs** - duration in number of quarters

6) **uom** - unit of measure

7) **dimh** - 16-byte dimensional qualifier

8) **iprx** - a sequential integer used to distinguish otherwise identical facts

9) **coreg** - If specified, indicates a specific co-registrant, the parent company, or other entity (e.g., guarantor).  NULL indicates the consolidated entity.  Note that this value is a function of the dimension segments.

10) **durp** - The difference between the reported fact duration and the quarter duration (qtrs), expressed as a fraction of 1.  For example, a fact with duration of 120 days rounded to a 91-day quarter has a durp value of 29/91 = +0.3187.

11) **datp** - The difference between the reported fact date and the month-end rounded date (ddate), expressed as a fraction of 1.  For example, a fact reported for 29/Dec, with ddate rounded to 31/Dec, has a datp value of minus 2/31 = -0.0645.
 
12) **dcml** - The value of the fact "decimals" attribute, with INF represented by 32767.

In [3]:
q114numbers.isnull().sum()

adsh              0
tag               0
version           0
ddate             0
qtrs              0
uom               0
dimh              0
iprx         627636
value        120454
footnote    7831895
footlen           0
dimn              0
coreg       7383864
durp         627636
datp         627636
dcml         627636
dtype: int64

#### A lot of null values for footnotes and coregistrants (majority of rows); will remove these columns for now

In [4]:
q114numbers = q114numbers.drop(columns=['footnote','coreg'])

In [5]:
q114numbers.isnull().sum() 

adsh            0
tag             0
version         0
ddate           0
qtrs            0
uom             0
dimh            0
iprx       627636
value      120454
footlen         0
dimn            0
durp       627636
datp       627636
dcml       627636
dtype: int64

In [6]:
q114numbers = q114numbers.drop(columns=['iprx','durp','datp','dcml'])

#### Didn't get rid of blank Value fields, so just get rid of those since there will still be enough data points to analyze

In [7]:
q114numbers = q114numbers.dropna()

In [8]:
q114numbers.describe(include='all') #still have 7million+ data points

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,value,footlen,dimn
count,7741446,7741446,7741446,7741446.0,7741446.0,7741446,7741446,7741446.0,7741446.0,7741446.0
unique,6996,329406,6796,,,3379,487154,,,
top,0001507385-15-000030,StockholdersEquity,us-gaap/2014,,,USD,0x00000000,,,
freq,30866,81680,5443445,,,6644106,3155339,,,
mean,,,,20134730.0,1.827412,,,6632962000.0,0.8335549,0.8521903
std,,,,9018.806,2.272527,,,3207053000000.0,21.35654,0.9061745
min,,,,19681230.0,0.0,,,-7663837000000.0,0.0,0.0
25%,,,,20131230.0,0.0,,,0.361,0.0,0.0
50%,,,,20140330.0,1.0,,,1200000.0,0.0,1.0
75%,,,,20141230.0,4.0,,,27546000.0,0.0,1.0


In [9]:
#break out adsh to cik and filing number
# s = q114numbers['adsh'].str.split('-', n = 1, expand = True)

In [10]:
# s.head()

In [11]:
# q114numbers['entity_CIK'] = s[0]
# q114numbers['filing_number'] = s[1]
# q114numbers.head()

# System; The first set of numbers (0001193125) is the CIK of the entity submitting the filing. 
# The next 2 numbers (18) represent the year. 
# The last series of numbers represent a sequential count of submitted filings from that CIK. 
# The count is usually, but not always, reset to 0 at the start of each calendar year.

In [12]:
# q414numbers = q414numbers.drop(columns =["adsh"], inplace = True)
# q414numbers = q414numbers.to_frame()

In [24]:
#Valeant's numbers
valeant_10k = q114numbers.loc[q114numbers['adsh'] == '0000885590-15-000015']
valeant_10k.head()

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,dimh,value,footlen,dimn
6051,0000885590-15-000015,AccruedLiabilitiesCurrent,us-gaap/2014,20131231,0,USD,0x00000000,1800200000.0,0,0
6052,0000885590-15-000015,AccruedLiabilitiesCurrent,us-gaap/2014,20141231,0,USD,0x00000000,2179400000.0,0,0
6053,0000885590-15-000015,CostsAndExpenses,us-gaap/2014,20121231,4,USD,0x00000000,3400700000.0,0,0
6054,0000885590-15-000015,CostsAndExpenses,us-gaap/2014,20131231,4,USD,0x00000000,6179100000.0,0,0
6055,0000885590-15-000015,CostsAndExpenses,us-gaap/2014,20141231,4,USD,0x00000000,6223800000.0,0,0


In [25]:
all_valeant = q114numbers[q114numbers['adsh'].str.match('0000885590')]

# s.str.startswith('a', na=False)
# 0000885590

In [26]:
len(all_valeant)

2632

In [27]:
len(valeant_10k)

2632

In [28]:
valeant_nums = list(set(valeant_10k.value.abs()))
valeant_nums

[0.0,
 204800000.0,
 2.0,
 0.5,
 25600000.0,
 128000000.0,
 0.25,
 1.0,
 3.0,
 4.0,
 5.0,
 6.0,
 17.75,
 19.57,
 21.78,
 22.0,
 24.0,
 30.47,
 30.19,
 2900000.0,
 79700000.0,
 2306900000.0,
 31.44,
 4022100000.0,
 1410900000.0,
 130900000.0,
 39.11,
 54100000.0,
 1436500000.0,
 44.0,
 51.34,
 507957.0,
 54.6,
 61.8,
 62.15,
 5800000.0,
 364200000.0,
 31400000.0,
 108200000.0,
 57000000.0,
 74.88,
 85.0,
 8700000.0,
 9224700000.0,
 341500000.0,
 85500000.0,
 100.0,
 102.22,
 117.82,
 14398800000.0,
 37200000.0,
 11600000.0,
 139600000.0,
 114000000.0,
 136.59,
 137.71,
 158.0,
 14500000.0,
 884900000.0,
 449700000.0,
 160.44,
 1089700000.0,
 1092600000.0,
 17400000.0,
 43000000.0,
 196600000.0,
 299000000.0,
 219.79,
 327500000.0,
 8263500000.0,
 353100000.0,
 122700000.0,
 4193100000.0,
 20300000.0,
 1581900000.0,
 3885900000.0,
 381600000.0,
 23200000.0,
 125600000.0,
 253600000.0,
 3300000000.0,
 151200000.0,
 3200000000.0,
 500000.0,
 77300000.0,
 845300000.0,
 387400000.0,
 1826000

In [29]:
len(valeant_nums)

1416

In [30]:
def first_digit(number):
    return int(str(number*100)[0])    

In [31]:
benford_valeant = [first_digit(num) for num in valeant_nums] 
len(benford_valeant)

1416

In [32]:
benford_valeant

[0,
 2,
 2,
 5,
 2,
 1,
 2,
 1,
 3,
 4,
 5,
 6,
 1,
 1,
 2,
 2,
 2,
 3,
 3,
 2,
 7,
 2,
 3,
 4,
 1,
 1,
 3,
 5,
 1,
 4,
 5,
 5,
 5,
 6,
 6,
 5,
 3,
 3,
 1,
 5,
 7,
 8,
 8,
 9,
 3,
 8,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 8,
 4,
 1,
 1,
 1,
 1,
 4,
 1,
 2,
 2,
 3,
 8,
 3,
 1,
 4,
 2,
 1,
 3,
 3,
 2,
 1,
 2,
 3,
 1,
 3,
 5,
 7,
 8,
 3,
 1,
 2,
 2,
 3,
 5,
 1,
 9,
 9,
 3,
 2,
 1,
 8,
 6,
 1,
 1,
 8,
 3,
 5,
 2,
 6,
 1,
 3,
 1,
 1,
 3,
 1,
 1,
 1,
 6,
 1,
 1,
 8,
 1,
 1,
 1,
 6,
 2,
 4,
 3,
 7,
 4,
 1,
 2,
 2,
 4,
 1,
 7,
 1,
 7,
 9,
 1,
 2,
 1,
 7,
 7,
 1,
 7,
 6,
 5,
 2,
 2,
 5,
 3,
 6,
 6,
 8,
 4,
 1,
 3,
 2,
 5,
 6,
 8,
 6,
 4,
 1,
 1,
 3,
 2,
 1,
 4,
 2,
 1,
 7,
 5,
 2,
 1,
 6,
 1,
 7,
 1,
 1,
 9,
 9,
 8,
 2,
 2,
 2,
 6,
 1,
 2,
 4,
 1,
 2,
 1,
 5,
 1,
 4,
 5,
 1,
 3,
 1,
 5,
 9,
 1,
 5,
 6,
 6,
 3,
 1,
 3,
 3,
 1,
 2,
 8,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 4,
 2,
 6,
 2,
 1,
 4,
 7,
 2,
 2,
 1,
 1,
 5,
 2,
 7,
 2,
 1,
 1,
 1,
 1,
 9,
 5,
 5,
 8,
 5,
 7,
 1,
 1,


In [33]:
def frequencies(first_digits):
    counts = [0]*10
    for x in first_digits:
        if x > 0:
            counts[x] += 1 
    total = sum(counts)
    freq = [count/total for count in counts]
#     print(sum(freq))
    return freq[1:] #same as going from element 1 through the end (i.e. 9)

In [34]:
frequencies(benford_valeant)

[0.31063829787234043,
 0.18581560283687942,
 0.12340425531914893,
 0.07588652482269503,
 0.07801418439716312,
 0.06595744680851064,
 0.05602836879432624,
 0.05390070921985816,
 0.05035460992907802]

In [35]:
import math
r = range(10)
benfords_law = [math.log10(1 + 1/digit) for digit in r if digit != 0]
benfords_law

[0.3010299956639812,
 0.17609125905568124,
 0.12493873660829993,
 0.09691001300805642,
 0.07918124604762482,
 0.06694678963061322,
 0.05799194697768673,
 0.05115252244738129,
 0.04575749056067514]

In [36]:
len(benford_valeant)

1416

In [37]:
import plotly

plotly.offline.init_notebook_mode(connected=True)
Benfords_Law = {'type': 'scatter', 'x': list(range(1, 10)), 'y': benfords_law}
Valeant = {'type': 'scatter', 'x': list(range(1, 10)), 'y': frequencies(benford_valeant)}


plotly.offline.iplot([Benfords_Law, Valeant])

In [55]:
import math
def kl_divergence(freq):
    kl_div = 0.0
    for d in range(1, 10):
        Q = (math.log(d+1) - math.log(d)) / math.log(10) #calculates theoretical benfords (perfect benny freqs)
        P = freq[d-1]
        kl_div += math.log(P / Q) * P
#         print((freq))

    return round(float(kl_div),3)

In [56]:
kl_divergence(frequencies(benford_valeant))

0.003

In [57]:
kl_divergence(benfords_law)

0.0

In [None]:
# q414numbers.join(s.apply(lambda x: x.split('-')))

In [None]:
# q414numbers= q414numbers.drop('adsh', axis=1).join(s.reset_index(drop=True, level=1).rename(['0'],['1'],['2']))


In [None]:
# q414numbers.head()

In [None]:
import numpy as np
import random
def p_value(freq):
    n = sum(freq)
    ps = [(math.log(d+1) - math.log(d)) / math.log(10) for d in range(1, 10)]

    ks_obs = freq[1:]

    def ll(ks): # log-likelihood
        z = random.sample(zip(ks, ps), 8)
        return sum([k * math.log(p) for (k, p) in z])

    N = 10000
    P = 0
    for i in range(N):
        ks = np.random.multinomial(n, ps)
        if ll(ks) > ll(ks_obs): P += 1

In [None]:
p_value(benford_valeant)

In [None]:
ks_obs = freq[1:]