# FIN-525 Financial Big Data Project

## Importation

In [7]:
# Standard library
import os
import pickle as pkl
import random
from time import time

# Working
import numpy as np
import pandas as pd
from benfordslaw import benfordslaw

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# DB
import wrds
import yfinance as yf
from sec_edgar_downloader import Downloader

# Utils
from tqdm import tqdm



## Constants

In [9]:
# Fixed seed value (presumably for reproducible random draws; not used in the cells shown here).
RANDOM_SEED = 7
# Root folder for downloaded data. It ends with a slash because the EDGAR
# cell builds its folder by string concatenation (DATA_DIR + "edgar/").
DATA_DIR = "../data/"
# Folder for intermediate results (presumably; verify against its users).
# NOTE: unlike DATA_DIR this path has no trailing slash.
INTERM_DIR = "../compiled_data"

## Data Query

### WRDS

In [None]:
# Open the WRDS session.
#
# No password is passed on purpose. The previous version hard-coded one in
# plain text under a misspelled keyword (`wrds_passeword`); that keyword does
# not appear to be recognised by wrds.Connection, so the secret was exposed in
# the notebook without ever being used. Authentication relies on ~/.pgpass or
# on the interactive prompt issued by the wrds package.
# NOTE(security): the password that used to live here must be considered
# compromised and should be rotated.
#
# The username can be overridden with the WRDS_USERNAME environment variable;
# the fallback keeps the account this notebook was written with.
db = wrds.Connection(
    wrds_username=os.environ.get("WRDS_USERNAME", "debabech"))
# Persist the credentials in a pgpass file so later sessions do not prompt.
db.create_pgpass_file()
# Keep only the permco <-> ticker mapping from the CRSP `stocknames` table.
permcos = db.get_table(library='crsp', table='stocknames')[
    ["permco", "ticker"]]

### EDGAR

In [15]:
# NOTE(review): "Apple" is a company name, not a ticker symbol (Apple Inc.
# trades as AAPL) -- confirm that the installed sec_edgar_downloader version
# resolves it before relying on the result.
ticker = "Apple"

# Per-company download folder: <DATA_DIR>edgar/<ticker>
path = os.path.join(DATA_DIR + "edgar/", ticker)
# makedirs + exist_ok: re-running the cell must not fail because the folder
# is already there, and a missing parent "edgar/" folder is created as well
# (os.mkdir raised in both situations).
os.makedirs(path, exist_ok=True)
dl = Downloader(path)

# Request the 13F-NT filings for `ticker`.
# The query can be narrowed to a date window with the `after=` / `before=`
# keyword arguments, e.g. after="2017-01-01", before="2017-03-25".
info = dl.get("13F-NT", ticker)

## Theory

In [None]:
# get_random_data

# Helper that draws 1000 random integers between 1 and 1000 (both inclusive).

def get_random_data():
    """
    Return a list of 1000 random integers drawn uniformly from [1, 1000].

    The sample follows (approximately) the uniform distribution, NOT the
    Benford distribution, so it can serve as a counter-example.

    Returns:
        list of 1000 ints, each between 1 and 1000 inclusive.
    """
    # `random` is the standard-library module imported at the top of the file.
    return [random.randint(1, 1000) for _ in range(1000)]

In [None]:
def ben_theoritical():
    """
    Return the theoretical Benford's-law probabilities of the leading digits.

    For a leading digit d in 1..9 the probability is
    log10(d + 1) - log10(d) = log10(1 + 1/d).

    Returns:
        list of 9 floats; entry k is the probability of leading digit k + 1.
        The entries sum to 1.
    """
    # Start at 1: a leading digit is never 0, and log10(0) is -inf, which
    # used to put an `inf` entry at the front of the result.
    return [np.log10(d + 1) - np.log10(d) for d in range(1, 10)]

### Plotting

In [None]:
# Leading digits 1..9 and their Benford probabilities P(d) = log10(1 + 1/d).
# They are defined here because this cell (and the case-study cells further
# down) read `digits` and `digit_probs`, and no earlier cell creates them.
digits = np.arange(1, 10)
digit_probs = np.log10(1 + 1 / digits)

# Bar chart of the theoretical distribution.
plt.rc('font', size=16)
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(digits, digit_probs)
plt.xticks(digits)
plt.xlabel('Digits')
plt.ylabel('Probability')
plt.title("Benford's Law: Probability of Leading Digits")
plt.show()


# Case Study

## Fibonacci Series

In [None]:
# Calculates and stores the first n = 1000 Fibonacci numbers
def fibonacci(n):
    """
    Return the first `n` Fibonacci numbers as a list, starting 1, 1, 2, 3, ...

    Args:
        n: how many numbers to generate. Values of 0 or below give [].

    Returns:
        list of exactly max(n, 0) ints.
    """
    # Seed with at most two 1s so that n = 0 and n = 1 are handled too.
    fibs = [1, 1][:max(n, 0)]
    # Grow until exactly n numbers are stored (the previous loop bound,
    # range(2, n + 1), produced one number too many).
    while len(fibs) < n:
        fibs.append(fibs[-1] + fibs[-2])
    return fibs

fib_nums = fibonacci(1000)

# Tally the leading digits of a collection of numbers
def leading_digit_count(numbers):
    """
    Count how often each digit 1..9 appears as the first character of the
    numbers' string representation.

    Args:
        numbers: sized collection of values; each one is converted with
            str() and its first character is parsed as an int.

    Returns:
        dict with three length-9 arrays:
            'digit': the digits 1..9,
            'prob' : count / len(numbers),
            'count': number of values starting with each digit.
        Values whose first character is 0 match no digit; they are not
        counted but still contribute to len(numbers).
    """
    digit_values = np.arange(1, 10)
    tallies = np.zeros(9)

    for value in numbers:
        lead = int(str(value)[:1])
        # Boolean mask selects the single slot of the matching digit
        # (or nothing at all when lead is 0).
        tallies[digit_values == lead] += 1

    return {
        'digit': digit_values,
        'prob': tallies / len(numbers),
        'count': tallies,
    }

# Empirical leading-digit distribution of the Fibonacci sample.
leading_digit_prob = leading_digit_count(fib_nums)

# Sum of squared differences between the empirical probabilities and
# `digit_probs`.
sse0 = np.sum(np.square(leading_digit_prob['prob'] - digit_probs))

print('Sum of squared errors is ', sse0)


In [None]:
fig, axs = plt.subplots(1, 4, figsize=(20,5))

# One panel per sample size: n = 10, 100, 1000, 10000.
for i, ax in enumerate(axs):
    n = 10 ** (i + 1)
    fib_nums = fibonacci(n)
    leading_digit_prob = leading_digit_count(fib_nums)
    sse0 = np.sum(np.square(leading_digit_prob['prob'] - digit_probs))

    # Two bar series per digit: the Fibonacci frequencies at the digit
    # itself and the reference probabilities shifted right by one bar width.
    ax.bar(leading_digit_prob['digit'], leading_digit_prob['prob'], width=0.25)
    ax.bar(digits + 0.25, digit_probs, width=0.25)

    ax.set_xticks(leading_digit_prob['digit'])
    ax.set(
        xlabel='Digits',
        ylabel='Probability',
        title=f'n = {n}, SSE = {sse0:.2e}',
    )

    ax.legend(labels=['Fibonacci', "Benford's Law"])

fig.suptitle('Probability of Leading Digits', fontsize=16)
plt.show()


## US elections

In [None]:
# ================== First digit =================

# Benford analyser with alpha=0.05 (default digit position).
bl = benfordslaw(alpha=0.05)

# Load the 'USA' example dataset through the benfordslaw object.
df = bl.import_example(data='USA')

# Vote counts of the rows whose candidate is Donald Trump.
X = df.loc[df['candidate'] == 'Donald Trump', 'votes'].values

print(X)
# e.g. array([ 5387, 23618,  1710, ...,    16,    21,     0], dtype=int64)

# Fit on the vote counts and plot the result.
results = bl.fit(X)
bl.plot(title='Donald Trump')

# ================== Other digit positions =================

# Repeat the analysis on the same votes for
#    2 -> second digit
#   -1 -> last digit
#   -2 -> second-to-last digit
for pos in (2, -1, -2):
    bl = benfordslaw(pos=pos)
    # USA example (reloaded for each position, as in the original cells)
    df = bl.import_example(data='USA')
    results = bl.fit(X)
    bl.plot(title='Donald Trump', barcolor=[0.5, 0.5, 0.5], fontsize=12, barwidth=0.4)