In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import simfin as sf
#Used for column names
from simfin.names import *

# SimFin configuration
# Directory where SimFin stores the CSV files
sf.set_data_dir('./simfin_data/')
# SimFin needs an API key: it is read from ~/simfin_api_key.txt, with 'free' passed
# as the default key (author's note: the free key only gives stock info from before 2018)
sf.load_api_key(path='~/simfin_api_key.txt', default_key='free')
# Offset the fundamental data by 50 days (author's note: the SimFin documentation
# also recommends doing this)
dateOffset = pd.DateOffset(days=50)
# Refresh settings handed to sf.StockHub in the next cell.
# NOTE(review): presumably the age in days after which cached datasets are
# downloaded again -- confirm against the SimFin StockHub documentation.
refresh_days = 25
refresh_days_shareprices = 14

In [2]:
# Collect the data from SimFin: one hub for the US market, configured with the
# offset and refresh settings defined in the configuration cell.
hub = sf.StockHub(
    market='us',
    offset=dateOffset,
    refresh_days=refresh_days,
    refresh_days_shareprices=refresh_days_shareprices,
)

# Daily-variant signal DataFrames (growth, valuation and financial signals).
growthSignalsDf = hub.growth_signals(variant='daily')
valueSignalsDf = hub.val_signals(variant='daily')
financialSignalsDf = hub.fin_signals(variant='daily')

# Join the three frames column-wise into a single signals DataFrame.
dfs = [financialSignalsDf, growthSignalsDf, valueSignalsDf]
signalsDf = pd.concat(dfs, axis='columns')

Dataset "us-income-ttm" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (1 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'growth_signals-d4ce5d48.pickle' on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'val_signals-65cf454d.pickle' on disk (1 days old).
- Loading from disk ... Done!
Cache-file 'fin_signals-d4ce5d48.pickle' on disk (1 days old).
- Loading from disk ... Done!


In [3]:
# Rows where every signal is missing, computed once and reused below.
# (The original recomputed this filter three times, and one of those results was
# a bare `.head()` in the middle of the cell that was never displayed or stored.)
nonEmptySignalsDf = signalsDf.dropna(how='all')

# Same rows with the index discarded. Not referenced by the other cells visible
# in this notebook; kept so the cell still defines the same names as before.
df = nonEmptySignalsDf.reset_index(drop=True)

# Columns must have at least 80% non-NULL values (measured against the number of
# non-empty rows); any that don't are dropped.
# (Scikit cannot work well with lots of missing data)
thresh = 0.80 * len(nonEmptySignalsDf)
signalsDf = signalsDf.dropna(axis='columns', thresh=thresh)

# Preview the remaining columns, skipping rows that are entirely missing.
signalsDf.dropna(how='all').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,Earnings Yield,FCF Yield,Market-Cap,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Price to Book Value
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2010-09-20,0.171492,0.553297,0.622857,1.966061,-0.0,0.542205,5.636364,9.701999,0.08282,1.065135,...,0.038314,0.041254,10883770000.0,4.697355,26.100174,24.240028,-19.610401,-3.584315,2.161623,3.873229
A,2010-09-21,0.171492,0.553297,0.622857,1.966061,-0.0,0.542205,5.636364,9.701999,0.08282,1.065135,...,0.038166,0.041095,10925860000.0,4.715521,26.201109,24.333769,-19.686239,-3.598176,2.169983,3.888207
A,2010-09-22,0.171492,0.553297,0.622857,1.966061,-0.0,0.542205,5.636364,9.701999,0.08282,1.065135,...,0.03865,0.041616,10789070000.0,4.656483,25.87307,24.029109,-19.439766,-3.553127,2.142814,3.839527
A,2010-09-23,0.171492,0.553297,0.622857,1.966061,-0.0,0.542205,5.636364,9.701999,0.08282,1.065135,...,0.038903,0.041889,10718920000.0,4.626206,25.704844,23.872873,-19.313369,-3.530025,2.128882,3.814562
A,2010-09-24,0.171492,0.553297,0.622857,1.966061,-0.0,0.542205,5.636364,9.701999,0.08282,1.065135,...,0.037048,0.039891,11255570000.0,4.857819,26.991769,25.068079,-20.280302,-3.706757,2.235465,4.00554


In [4]:
# Label of the new returns column -- the target the model will try to predict.
TOTAL_RETURN_1_3Y = 'Total Log Return 1-3 Years'

# Mean log-returns over all 1-3 year periods, requested from the hub as
# future, annualized returns.
df_returns_1_3y = hub.mean_log_returns(
    name=TOTAL_RETURN_1_3Y,
    future=True,
    annualized=True,
    min_years=1,
    max_years=3,
)

# Attach the returns column to the signals column-wise.
dfs = [signalsDf, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis='columns')

Cache-file 'mean_log_change-cb2ba940.pickle' on disk (0 days old).
- Loading from disk ... Done!


In [5]:
# Clean the combined frame in one pass:
#   1. winsorize to remove outliers (signal columns and the total-return column alike),
#   2. drop every row that has any missing value,
#   3. keep only tickers with more than 150 remaining rows
#      (i.e. stocks with 150 or fewer data-rows are removed).
df_sig_rets = (
    sf.winsorize(df_sig_rets)
    .dropna(how='any')
    .groupby(TICKER)
    .filter(lambda tickerRows: len(tickerRows) > 150)
)

In [6]:
# Preview the first rows of the cleaned signals-plus-returns DataFrame
df_sig_rets.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Log Revenue,Net Profit Margin,Quick Ratio,...,FCF Yield,Market-Cap,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales,Price to Book Value,Total Log Return 1-3 Years
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2011-09-19,0.18595,0.738375,0.603113,3.470432,-0.0,0.53257,13.927536,9.810434,0.157048,2.669767,...,0.077832,12437020000.0,4.010648,12.253222,12.848161,18.562716,-31.818776,1.924342,2.961195,0.164458
A,2011-09-20,0.18595,0.738375,0.603113,3.470432,-0.0,0.53257,13.927536,9.810434,0.157048,2.669767,...,0.077632,12469040000.0,4.020973,12.284766,12.881237,18.610504,-31.818776,1.929296,2.968818,0.16345
A,2011-09-21,0.18595,0.738375,0.603113,3.470432,-0.0,0.53257,13.927536,9.810434,0.157048,2.669767,...,0.084372,11472940000.0,3.699754,11.303387,11.852208,17.123787,-31.818776,1.775172,2.731652,0.209664
A,2011-09-22,0.18595,0.738375,0.603113,3.470432,-0.0,0.53257,13.927536,9.810434,0.157048,2.669767,...,0.086961,11131420000.0,3.589622,10.966914,11.499398,16.614056,-31.818776,1.72233,2.650338,0.226803
A,2011-09-23,0.18595,0.738375,0.603113,3.470432,-0.0,0.53257,13.927536,9.810434,0.157048,2.669767,...,0.085891,11270160000.0,3.634363,11.103606,11.642727,16.821134,-31.818776,1.743797,2.683371,0.220432


In [7]:
# Show the column labels of the cleaned DataFrame
df_sig_rets.columns

Index(['(Dividends + Share Buyback) / FCF', 'Asset Turnover',
       'CapEx / (Depr + Amor)', 'Current Ratio', 'Dividends / FCF',
       'Gross Profit Margin', 'Interest Coverage', 'Log Revenue',
       'Net Profit Margin', 'Quick Ratio', 'Return on Assets',
       'Return on Equity', 'Share Buyback / FCF', 'Assets Growth',
       'Assets Growth QOQ', 'Assets Growth YOY', 'Earnings Growth',
       'Earnings Growth QOQ', 'Earnings Growth YOY', 'FCF Growth',
       'FCF Growth QOQ', 'FCF Growth YOY', 'Sales Growth', 'Sales Growth QOQ',
       'Sales Growth YOY', 'Earnings Yield', 'FCF Yield', 'Market-Cap',
       'P/Cash', 'P/E', 'P/FCF', 'P/NCAV', 'P/NetNet', 'P/Sales',
       'Price to Book Value', 'Total Log Return 1-3 Years'],
      dtype='object')

In [8]:
#Originally this saved the data to a CSV, but the DataFrame could not be kept in its original form that way
#df_sig_rets.to_csv (r'/mnt/c/Users/danie/Documents/319 A2/simfin-tutorials/stockdata.csv', header=True)

In [9]:
# Serialize the DataFrame exactly as it is to ./stockdata.pkl
# (the path is relative to the current working directory).
# NOTE: pickle files should only ever be loaded back from trusted sources.
df_sig_rets.to_pickle("./stockdata.pkl")

In [13]:
# Shell command: convert every notebook matching *.ipynb in the current directory to a script
!jupyter nbconvert --to script *.ipynb

[NbConvertApp] Converting notebook 10_Neural_Networks.ipynb to script
[NbConvertApp] Writing 31883 bytes to 10_Neural_Networks.py
[NbConvertApp] Converting notebook Collection and Cleansing.ipynb to script
[NbConvertApp] Writing 2992 bytes to Collection and Cleansing.py
[NbConvertApp] Converting notebook LoadAndTrainData.ipynb to script
[NbConvertApp] Writing 6490 bytes to LoadAndTrainData.py


In [12]:
# Number of columns in the cleaned DataFrame (signals plus the return column)
print(df_sig_rets.shape[1])

36
