# Build a dataset out of all data collected

In [2]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from pprint import pprint
import logging
# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# create a file handler
handler = logging.FileHandler('company_overview_dataset.log')
handler.setLevel(logging.INFO)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(handler)

Convert jsons into an easy to use dict

In [13]:
overviews = {}
# iterate over all json files in the prices folder
for file in tqdm(os.listdir('overview')):
    # open the json file
    with open(os.path.join('overview', file)) as f:
        overview = json.load(f)
        overviews[overview[0]['Symbol']] = overview
# pprint(overviews)

100%|██████████| 76/76 [00:00<00:00, 873.56it/s]


### Create the overview dataframe

Only take values that could be converted to float

In [14]:
# create a row for each stock and date
keys = ['Symbol']
rows = []
for stock, datas in tqdm(overviews.items()):
    cols = [stock]
    for key, val in datas[0].items():
        try:
            cols.append(float(val))
            if key not in keys:
                keys.append(key)
        # create an exception when the value is not a number
        except ValueError:
            pass
    rows.append(cols)
# create a dataframe from the rows
price_df = pd.DataFrame(rows, columns=keys)
price_df.to_csv('overview.csv', index=False)
price_df
# convert all columns to numeric except for the date and stock
# price_df[price_df.columns[2:]] = price_df[price_df.columns[2:]].apply(pd.to_numeric, errors='coerce')

100%|██████████| 76/76 [00:00<00:00, 38007.29it/s]


Unnamed: 0,Symbol,CIK,MarketCapitalization,EBITDA,PERatio,PEGRatio,BookValue,DividendPerShare,DividendYield,EPS,...,PriceToSalesRatioTTM,PriceToBookRatio,EVToRevenue,EVToEBITDA,Beta,52WeekHigh,52WeekLow,50DayMovingAverage,200DayMovingAverage,SharesOutstanding
0,AAPL,320193.0,2.400657e+12,1.252880e+11,25.760,2.750,3.581,0.910,0.0060,5.89,...,5.510,44.630,5.920,17.530,1.278,178.80,124.17,138.92,1.477200e+02,1.582190e+10
1,ABBV,1551152.0,2.567833e+11,2.980100e+10,19.360,1.279,9.040,5.640,0.0408,7.50,...,4.989,18.230,6.040,13.800,0.583,169.46,131.49,157.82,1.492400e+02,1.768480e+09
2,ABNB,1559720.0,7.493650e+10,1.683518e+09,47.150,8.750,0.000,0.000,2.5100,12.63,...,10.740,6.500,28.410,1.013,191.730,81.91,96.45,109.23,3.982470e+08,
3,AMBA,1280263.0,3.560209e+09,-4.960200e+07,4.965,15.190,0.000,0.000,-0.7160,9.09,...,8.150,-64.060,1.552,156.100,49.020,82.38,74.05,38756900.00,,
4,AMD,2488.0,1.388081e+11,5.547000e+09,104.990,0.902,33.840,0.000,0.0000,0.82,...,4.699,2.038,4.750,18.860,1.981,132.96,54.57,70.90,7.934000e+01,1.612360e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,VZ,732712.0,1.743345e+11,4.756600e+10,8.200,5.510,21.700,2.585,0.0629,5.06,...,1.169,1.798,2.427,7.190,0.352,53.08,32.79,39.43,4.332000e+01,4.199820e+09
72,WBD,1437107.0,3.739736e+10,4.382000e+09,8.510,1.389,19.980,0.000,0.0000,1.81,...,0.934,0.550,2.851,6.450,1.514,31.12,8.82,11.77,1.366000e+01,2.428400e+09
73,WMT,104169.0,3.793858e+11,3.489400e+10,43.290,3.834,26.750,2.230,0.0158,3.25,...,0.653,5.470,0.749,17.500,0.525,159.88,116.32,145.45,1.369200e+02,2.696800e+09
74,XOM,34088.0,4.560819e+11,9.112800e+10,8.430,1.689,47.780,3.550,0.0325,13.26,...,1.115,2.268,1.203,5.320,1.097,117.78,71.98,109.88,9.881000e+01,4.082000e+09


# TODO repeat this process for all other data sheets in the jsons