In [11]:
import numpy as np
import pandas as pd
from yahoofinancials import YahooFinancials
import os

PATH = os.getcwd()

In [21]:
#Grab sentiments
#Big-cap
big_sent_all = pd.read_csv(os.path.join(PATH, 'dataSent12_21Good', 'USbig_Sent12_21.csv'))
big_sent_all['date'] = pd.to_datetime(big_sent_all['date']).dt.date
#Mid-cap
mid_sent_all = pd.read_csv(os.path.join(PATH, 'dataSent12_21Good', 'USmed_Sent12_21.csv'))
mid_sent_all['date'] = pd.to_datetime(mid_sent_all['date']).dt.date
#Small-caps
small_sent_all = pd.read_csv(os.path.join(PATH, 'dataSent12_21Good', 'USsmall_Sent12_21.csv'))
small_sent_all['date'] = pd.to_datetime(small_sent_all['date']).dt.date

## Sentiment Indicators

In [27]:
## The sentiment dataset in the 'dataSent12_21Good' folder is in long format, with each column being a different sentiment indicator.
# We create a table for each sentiment indicator, and pivot them to into familiar wide format:
# Each table will align all stocks by date and those which has no data for given date will be fill with NaN

#Make a table for each sentiment indicator
big_sent_tables = {}
big_sent_nan_tables = {}
for i in big_sent_all.columns[2:]:   
    big_sent_pivot = big_sent_all.pivot(index="date", columns="stock", values= i)
    big_sent_pivot.index = pd.to_datetime(big_sent_pivot.index)
    big_sent_tables['big_'+i] = big_sent_pivot
    big_sent_pivot.to_csv(os.path.join(PATH,'Tables','big_{}.csv'.format(i))) #Store in csv format in the 'Tables' folder

print('List of tables created: ',big_sent_tables.keys())
print('RCV Table:')
big_sent_tables['big_RCV'].head()

List of tables created:  dict_keys(['big_RCV', 'big_RVT', 'big_positivePartscr', 'big_negativePartscr', 'big_splogscr', 'big_linscr'])
RCV Table:


stock,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,,,,,,,,,,,...,0.0,,,,,,,,,0.0
2012-01-03,,0.0,,0.0,0.0,,,,,,...,2.632,,,0.0,,0.0,0.0,0.0,,33.333
2012-01-04,0.0,41.667,,44.444,-14.286,0.0,0.0,0.0,0.0,0.0,...,47.692,0.0,0.0,30.0,0.0,22.222,0.0,37.5,0.0,43.478
2012-01-05,25.0,45.455,,0.0,33.333,38.889,13.333,-7.692,4.167,-9.091,...,38.571,0.0,25.0,57.333,38.462,49.383,51.852,48.148,-20.0,38.889
2012-01-06,46.666,45.395,,-33.333,-73.333,57.384,-60.0,-43.75,41.935,-35.714,...,26.25,33.335,-57.142,27.941,-28.571,14.706,36.765,54.412,,20.0


In [34]:
#Same procedure for mid and small-cap

#Mid companies
mid_sent_tables = {}
mid_sent_nan_tables = {}
for i in mid_sent_all.columns[2:]:   
    mid_sent_pivot = mid_sent_all.pivot(index="date", columns="stock", values= i)
    mid_sent_pivot.index = pd.to_datetime(mid_sent_pivot.index)
    mid_sent_tables['mid_'+i] = mid_sent_pivot
    mid_sent_pivot.to_csv(os.path.join(PATH,'Tables','mid_{}.csv'.format(i))) #Store in csv format in the 'Tables' folder
     
#Small companies
small_sent_tables = {}
small_sent_nan_tables = {}
for i in small_sent_all.columns[2:]:   
    small_sent_pivot = small_sent_all.pivot(index="date", columns="stock", values= i)
    small_sent_pivot.index = pd.to_datetime(small_sent_pivot.index)
    small_sent_tables['small_'+i] = small_sent_pivot
    small_sent_pivot.to_csv(os.path.join(PATH,'Tables','small_{}.csv'.format(i))) #Store in csv format in the 'Tables' folder

## Stock Price Data

In [40]:
big_tickers = big_sent_tables['big_RCV'].columns
mid_tickers = mid_sent_tables['mid_RCV'].columns
small_tickers = small_sent_tables['small_RCV'].columns

In [49]:
big_sent_tables['big_RCV'].index[-1].strftime('%Y-%m-%d')

'2021-12-01'

In [51]:
all_tickers = big_sent_tables['big_RCV'].columns
st_date = big_sent_tables['big_RCV'].index[0].strftime('%Y-%m-%d')
en_date = big_sent_tables['big_RCV'].index[-1].strftime('%Y-%m-%d')

In [None]:
# extracting stock data (historical close price) for the stocks identified
#def extract_stock_adj(beg_date, end_date, all_tickers):
close_prices = pd.DataFrame()
end_date = st_date
beg_date = en_date
cp_tickers = all_tickers
attempt = 0
drop = []
while len(cp_tickers) != 0 and attempt <=5:
    print("-----------------")
    print("attempt number ",attempt)
    print("-----------------")
    cp_tickers = [j for j in cp_tickers if j not in drop]
    for i in range(len(cp_tickers)):
        try:
            yahoo_financials = YahooFinancials(cp_tickers[i]) #Initiate an object YahooFinance(<Company ticker>)
            json_obj = yahoo_financials.get_historical_price_data(beg_date,end_date,"daily") #Returns a json object (dictionary) with all the data
            ohlv = json_obj[cp_tickers[i]]['prices']  #We just want the prices of all that data. Still, each element in the list ohlv is a dictionary with the values: value, open, low, high,...
            temp = pd.DataFrame(ohlv)[["formatted_date","adjclose"]] #with pandas you can directly restructure a list of dictionaries into a dataframe made of the elements you want
            temp.set_index("formatted_date",inplace=True) #set formatted_date to be the indexes of the dataframe
            temp = temp[~temp.index.duplicated(keep='first')] #For some reason, some of the elements of ohlv are the dividends/payouts prices. These are dictionaries of 5 elements and one of them are Formatted_date. We therefore have some duplicated indexes that have no prices associated. We need to get rid of the second duplicates. This would return a list of booleans where those second duplicates have a true, but since we put the ~ we get the opposite . 
            #Update: In the last version of YahooFinancials I think the problem explained in the line above has been fixed
            close_prices[cp_tickers[i]] = temp["adjclose"]
            drop.append(cp_tickers[i])       
        except:
            print(cp_tickers[i]," :failed to fetch data...retrying")
            continue
    attempt+=1

-----------------
attempt number  0
-----------------
AAL  :failed to fetch data...retrying
AAPL  :failed to fetch data...retrying
ABBV  :failed to fetch data...retrying
ABC  :failed to fetch data...retrying
ABT  :failed to fetch data...retrying
ADP  :failed to fetch data...retrying
AIG  :failed to fetch data...retrying
AMD  :failed to fetch data...retrying
AMZN  :failed to fetch data...retrying
AXP  :failed to fetch data...retrying
BAC  :failed to fetch data...retrying
