In [None]:
%matplotlib inline
import pandas as pd
import os
# Import the main functionality from the SimFin Python API.
import simfin as sf

# Import names used for easy access to SimFin's data-columns.
from simfin.names import *

import yfinance as yf
import yahoo_fin.stock_info as si
# import pandas_datareader.data as web

from dateutil.relativedelta import relativedelta
from datetime import datetime

In [None]:
# Date window for the training sample and the ticker universe returned by
# yahoo_fin's tickers_sp500().
start_date, end_date = '2016-01-01', '2021-12-31'
tickers_sp500 = [*si.tickers_sp500()]

In [None]:
# Show the size of the universe and a short preview instead of dumping every
# ticker into the cell output.
len(tickers_sp500), tickers_sp500[:10]

In [None]:
# Record and display the kernel's current working directory; the relative
# 'data/...' paths used by the CSV reads/writes below resolve against it.
cwd = os.getcwd()
cwd

In [None]:
# Directory where SimFin stores its downloaded datasets.
# Set the SIMFIN_DATA_DIR environment variable to use a different location;
# the original hard-coded path is kept as the default for backward compatibility.
sf.set_data_dir(os.environ.get(
    'SIMFIN_DATA_DIR',
    '/Users/feiyiyang/Documents/Bootcamp/Team9-Project2/Bootcamp-Project2/simfin_data/',
))

In [None]:
# Read the SimFin API key from a text file rather than embedding it in the notebook.
# Set the SIMFIN_API_KEY_PATH environment variable to point at another key file;
# the original hard-coded path is kept as the default for backward compatibility.
sf.load_api_key(path=os.environ.get(
    'SIMFIN_API_KEY_PATH',
    '/Users/feiyiyang/Documents/Bootcamp/Team9-Project2/Bootcamp-Project2/simfin_data/simfin_api_key.txt',
))

### Part 1: Grab quarterly fundamentals from the SimFin API

In [None]:
# Load the quarterly US income, balance-sheet and cash-flow datasets from SimFin,
# in that order.
df_income, df_balance, df_cashflow = (
    sf.load(dataset=dataset_name, variant='quarterly', market='us')
    for dataset_name in ('income', 'balance', 'cashflow')
)

In [None]:
# Inspect which columns the cash-flow dataset provides.
df_cashflow.columns

In [None]:
# Left-join the selected balance-sheet columns onto the selected income-statement
# columns, matching rows on ticker, fiscal year and fiscal period.
df_fundamentals = df_income[[
    'Ticker', 'Fiscal Year', 'Fiscal Period', 'Report Date', 'Publish Date',
    'Revenue', 'Gross Profit', 'Operating Income (Loss)', 'Net Income',
]].merge(
    df_balance[[
        'Ticker', 'Fiscal Year', 'Fiscal Period', 'Shares (Basic)',
        'Cash, Cash Equivalents & Short Term Investments',
        'Total Assets', 'Total Liabilities', 'Total Equity',
    ]],
    how='left',
    on=['Ticker', 'Fiscal Year', 'Fiscal Period'],
)

In [None]:
# Left-join the selected cash-flow columns onto the fundamentals table, again
# matching rows on ticker, fiscal year and fiscal period.
df_fundamentals = df_fundamentals.merge(
    df_cashflow[[
        'Ticker', 'Fiscal Year', 'Fiscal Period',
        'Net Cash from Operating Activities',
        'Net Change in Long Term Investment',
        'Net Cash from Investing Activities',
        'Dividends Paid',
        'Net Cash from Financing Activities',
        'Net Change in Cash',
    ]],
    how='left',
    on=['Ticker', 'Fiscal Year', 'Fiscal Period'],
)

In [None]:
# Spot-check the last rows of the merged fundamentals table.
df_fundamentals.tail()

In [None]:
# Add a column with the next earnings publish date of the same ticker; it is
# meant to bound the window used when appending daily prices.
# NOTE(review): shift(-1) takes the following row within each ticker group, so
# this assumes rows are ordered by publish date within a ticker -- confirm.
df_fundamentals['Next_publish_date'] = (
    df_fundamentals.groupby('Ticker')['Publish Date'].shift(-1)
)

# The last row of each ticker has no following row: fall back to its own
# publish date plus three months.
fallback_date = (
    pd.to_datetime(df_fundamentals['Publish Date']) + pd.DateOffset(months=3)
).dt.date

# Assign the filled column back. Calling fillna(inplace=True) on the attribute
# access is a chained in-place operation: it warns on recent pandas versions and
# does not update the frame at all when Copy-on-Write is enabled.
df_fundamentals['Next_publish_date'] = (
    df_fundamentals['Next_publish_date'].fillna(fallback_date)
)

In [None]:
# Create the output folder if needed so the write does not fail on a fresh
# checkout, then save the fundamentals table without the index column.
os.makedirs('data', exist_ok=True)
df_fundamentals.to_csv('data/us_fundamental.csv', index=False)

### Part 2: Grab earnings data from the Yahoo Finance API (yahoo_fin.stock_info)

In [None]:
# Reload the fundamentals table from the CSV written at the end of Part 1.
# No parse_dates argument is passed, so the date columns are read back as plain
# strings rather than datetimes.
df_fundamentals=pd.read_csv('data/us_fundamental.csv')

In [None]:
# dftest=pd.DataFrame.from_dict(si.get_earnings_history("A"))[['ticker','startdatetime','epsestimate','epsactual','epssurprisepct']]
# dftest.startdatetime=pd.to_datetime(dftest.startdatetime).dt.date
# dftest=dftest[dftest['startdatetime']>pd.to_datetime(start_date).date()]
# dftest['earning_y']= pd.DatetimeIndex(dftest['startdatetime']).year
# dftest['earning_m']= pd.DatetimeIndex(dftest['startdatetime']).month
# dftest.sort_values(by=['startdatetime'],inplace=True)
# dftest

In [None]:
# For every S&P 500 ticker, fetch the Yahoo earnings history and pair each
# fundamentals publish date with the earnings announcement made in the same or
# the immediately preceding calendar month.
earningdata_1 = []  # one matched frame per ticker
notmatch = []       # tickers whose earnings data could not be fetched/prepared
for ticker in tickers_sp500:
    try:
        dftest = pd.DataFrame.from_dict(si.get_earnings_history(ticker))[
            ['ticker', 'startdatetime', 'epsestimate', 'epsactual', 'epssurprisepct']
        ]
        dftest['startdatetime'] = pd.to_datetime(dftest['startdatetime']).dt.date
        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        dftest = dftest[dftest['startdatetime'] > pd.to_datetime(start_date).date()].copy()
        dftest['earning_y'] = pd.DatetimeIndex(dftest['startdatetime']).year
        dftest['earning_m'] = pd.DatetimeIndex(dftest['startdatetime']).month
        dftest = dftest.sort_values(by=['startdatetime'])

        df1 = df_fundamentals.loc[
            df_fundamentals['Ticker'] == ticker, ['Ticker', 'Publish Date']
        ].copy()
        df1['Publish_y'] = pd.DatetimeIndex(df1['Publish Date']).year
        df1['Publish_m'] = pd.DatetimeIndex(df1['Publish Date']).month
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # loop, and report the reason instead of hiding it.
        print(f"No data available for {ticker}: {exc!r}")
        notmatch.append(ticker)
    else:
        # Compare on a running month index (year * 12 + month) instead of
        # "same year and month / month - 1". The old rule could never match a
        # January publish date with a December announcement, because the years
        # differ and Publish_m - 1 evaluates to 0.
        df1['Publish_ym'] = df1['Publish_y'] * 12 + df1['Publish_m']
        dftest['earning_ym'] = dftest['earning_y'] * 12 + dftest['earning_m']

        merged = pd.merge(df1, dftest, how='left', left_on='Ticker', right_on='ticker')
        months_before_publish = merged['Publish_ym'] - merged['earning_ym']
        # Rows without any earnings match carry NaN here and are filtered out,
        # exactly as in the previous month comparison.
        merged = merged[months_before_publish.isin([0, 1])]
        # Drop the temporary helpers so the output columns stay unchanged.
        merged = merged.drop(columns=['Publish_ym', 'earning_ym'])
        earningdata_1.append(merged)


df_sp500_earning = pd.concat(earningdata_1)

In [None]:
# Tidy the matched earnings table: fresh 0..n-1 index, helper columns removed,
# and 'startdatetime' renamed to 'Earning Date'.
# The steps are chained and rebound (no inplace=True), and drop() ignores
# columns that are already gone, so re-running this cell no longer raises
# KeyError on the second pass.
df_sp500_earning = (
    df_sp500_earning
    .reset_index(drop=True)
    .drop(
        columns=['earning_y', 'earning_m', 'Publish_y', 'Publish_m', 'ticker'],
        errors='ignore',
    )
    .rename(columns={'startdatetime': 'Earning Date'})
)
df_sp500_earning

In [None]:
# Write the matched publish-date / earnings-date table to CSV (index column omitted).
df_sp500_earning.to_csv('data/sp500_earningdate.csv',index=False)

### Part 3: Grab daily stock prices from the Yahoo Finance API (yahoo_fin.stock_info)

In [None]:
# Reload the fundamentals table and the earnings-date table from the CSV files
# written in Parts 1 and 2.
df_fundamentals=pd.read_csv('data/us_fundamental.csv')
df_sp500_earning=pd.read_csv('data/sp500_earningdate.csv')

In [None]:
# Download the daily price history of every S&P 500 ticker between start_date
# and end_date, stack the per-ticker frames and save them to CSV.
# An earlier draft (previously left here as commented-out code) merged each
# ticker's prices with its fundamentals and kept only rows with
# Publish Date < date <= Next_publish_date; that step is not performed here.
price = []
for ticker in tickers_sp500:
    try:
        ticker_prices = si.get_data(
            ticker, start_date=start_date, end_date=end_date, index_as_date=False
        )
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # download, and report the reason instead of hiding it.
        print(f"No data available for {ticker}: {exc!r}")
    else:
        price.append(ticker_prices)

sp500_price = pd.concat(price)
sp500_price.to_csv('data/sp500_dailyprice.csv', index=False)

In [None]:
# Display the combined daily-price table.
sp500_price

### Part 4: Append additional data sources

#### 1. Industry/Sector

In [None]:
# Load the combined fundamentals + daily-price table.
# NOTE(review): no cell visible in this part of the notebook writes
# 'data/sp500_fundamental_dailyprice.csv' (Part 3 saves 'data/sp500_dailyprice.csv');
# confirm where this file is produced, otherwise a fresh Run All stops here.
df_sp500=pd.read_csv('data/sp500_fundamental_dailyprice.csv')

In [None]:
# One-column frame holding each distinct ticker found in df_sp500.
df_industry_sector = pd.DataFrame({'Ticker': list(df_sp500['Ticker'].unique())})

In [None]:
# Small sample (first five tickers) for trying things out.
test = df_industry_sector.head(5)

In [None]:
# Add one extra ticker row to the sample.
# NOTE(review): 'KK' is presumably a made-up symbol used to exercise the
# "no data" path of the lookup below -- confirm.
df2 = {'Ticker': 'KK'}
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
test = pd.concat([test, pd.DataFrame([df2])], ignore_index=True)
test

In [None]:
# Look up sector and industry for every ticker and collect them in one table.
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and with the previous bare
# `except:` that failure would have been reported as "no data" for every ticker.
sector_records = []

for ticker in df_industry_sector.Ticker.unique():
    try:
        # Fetch the company profile once per ticker and read both fields from
        # it, instead of issuing a separate request for each field.
        company_info = si.get_company_info(ticker)
        sector_records.append({
            'Ticker': ticker,
            'Sector': company_info.loc['sector'].Value,
            'Industry': company_info.loc['industry'].Value,
        })
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # loop, and report the reason instead of hiding it.
        print(f"No industry data available for {ticker}: {exc!r}")

# Passing columns= keeps the three expected columns even if no lookup succeeded.
sector_df = pd.DataFrame(sector_records, columns=['Ticker', 'Sector', 'Industry'])


In [None]:
# Spot-check the last rows of the sector/industry table.
sector_df.tail()

In [None]:
# Write the sector/industry lookup to CSV (index column omitted).
sector_df.to_csv('data/industry_sector.csv',index=False)

In [None]:
# Attach sector and industry to every row of df_sp500 via a left join on Ticker.
df_sp500 = df_sp500.merge(sector_df, on='Ticker', how='left')

In [None]:
# Number of distinct tickers (missing values, if any, count as one entry).
df_industry_sector['Ticker'].nunique(dropna=False)

#### 2. Earnings Surprise

In [None]:
# dftest=pd.DataFrame.from_dict(si.get_earnings_history("TSLA"))[['ticker','startdatetime','epsestimate','epsactual','epssurprisepct']]
# dftest.startdatetime=pd.to_datetime(dftest.startdatetime).dt.date
# dftest=dftest[dftest['startdatetime']>pd.to_datetime("2016-1-1").date()]
# dftest['earningestimate_year']= pd.DatetimeIndex(dftest['startdatetime']).year
# dftest['earningestimate_month']= pd.DatetimeIndex(dftest['startdatetime']).month
# dftest.sort_values(by=['startdatetime'],inplace=True)