In [3]:
!pip install pandas-datareader

In [11]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import os

# Root folder for the data paths built in this notebook: the process's working directory at the time this cell runs.
PATH = os.getcwd()

In [5]:
# Load the raw sentiment files (one CSV per market-cap bucket) from the 'dataSent12_21Good' folder.
# In each frame the 'date' column is parsed and reduced to plain datetime.date values.

# Big-cap
big_sent_all = pd.read_csv(
    os.path.join(PATH, 'dataSent12_21Good', 'USbig_Sent12_21.csv')
).assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)

# Mid-cap
mid_sent_all = pd.read_csv(
    os.path.join(PATH, 'dataSent12_21Good', 'USmed_Sent12_21.csv')
).assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)

# Small-cap
small_sent_all = pd.read_csv(
    os.path.join(PATH, 'dataSent12_21Good', 'USsmall_Sent12_21.csv')
).assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)

## Sentiment Indicators

In [6]:
## The sentiment dataset in the 'dataSent12_21Good' folder is in long format, with each column being a different sentiment indicator.
# We create one table per sentiment indicator and pivot it into the familiar wide format:
# each table aligns all stocks by date, and a stock with no data for a given date is filled with NaN.

# to_csv does not create missing folders, so make sure the 'Tables' output folder exists first.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)

#Make a table for each sentiment indicator
big_sent_tables = {}
big_sent_nan_tables = {}  # not populated in this cell
# The first two columns are skipped; presumably they are the 'date' and 'stock' keys — TODO confirm column order.
for indicator in big_sent_all.columns[2:]:
    # Long -> wide: one row per date, one column per stock, values taken from this indicator.
    big_sent_pivot = big_sent_all.pivot(index="date", columns="stock", values=indicator)
    big_sent_pivot.index = pd.to_datetime(big_sent_pivot.index)  # convert the index to a DatetimeIndex
    big_sent_tables['big_' + indicator] = big_sent_pivot
    big_sent_pivot.to_csv(os.path.join(PATH, 'Tables', 'big_{}.csv'.format(indicator)))  #Store in csv format in the 'Tables' folder

print('List of tables created: ',big_sent_tables.keys())
print('RCV Table:')
big_sent_tables['big_RCV'].head()

List of tables created:  dict_keys(['big_RCV', 'big_RVT', 'big_positivePartscr', 'big_negativePartscr', 'big_splogscr', 'big_linscr'])
RCV Table:


stock,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,,,,,,,,,,,...,0.0,,,,,,,,,0.0
2012-01-03,,0.0,,0.0,0.0,,,,,,...,2.632,,,0.0,,0.0,0.0,0.0,,33.333
2012-01-04,0.0,41.667,,44.444,-14.286,0.0,0.0,0.0,0.0,0.0,...,47.692,0.0,0.0,30.0,0.0,22.222,0.0,37.5,0.0,43.478
2012-01-05,25.0,45.455,,0.0,33.333,38.889,13.333,-7.692,4.167,-9.091,...,38.571,0.0,25.0,57.333,38.462,49.383,51.852,48.148,-20.0,38.889
2012-01-06,46.666,45.395,,-33.333,-73.333,57.384,-60.0,-43.75,41.935,-35.714,...,26.25,33.335,-57.142,27.941,-28.571,14.706,36.765,54.412,,20.0


In [30]:
#Same procedure for mid and small-cap: one wide (date x stock) table per sentiment indicator,
#kept in a dict and also written to the 'Tables' folder as csv.

# to_csv does not create missing folders, so make sure the 'Tables' output folder exists first.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)

#Mid companies
mid_sent_tables = {}
mid_sent_nan_tables = {}  # not populated in this cell
# The first two columns are skipped; presumably they are the 'date' and 'stock' keys — TODO confirm column order.
for indicator in mid_sent_all.columns[2:]:
    mid_sent_pivot = mid_sent_all.pivot(index="date", columns="stock", values=indicator)
    mid_sent_pivot.index = pd.to_datetime(mid_sent_pivot.index)  # convert the index to a DatetimeIndex
    mid_sent_tables['mid_' + indicator] = mid_sent_pivot
    mid_sent_pivot.to_csv(os.path.join(PATH, 'Tables', 'mid_{}.csv'.format(indicator)))  #Store in csv format in the 'Tables' folder

#Small companies
small_sent_tables = {}
small_sent_nan_tables = {}  # not populated in this cell
for indicator in small_sent_all.columns[2:]:
    small_sent_pivot = small_sent_all.pivot(index="date", columns="stock", values=indicator)
    small_sent_pivot.index = pd.to_datetime(small_sent_pivot.index)  # convert the index to a DatetimeIndex
    small_sent_tables['small_' + indicator] = small_sent_pivot
    small_sent_pivot.to_csv(os.path.join(PATH, 'Tables', 'small_{}.csv'.format(indicator)))  #Store in csv format in the 'Tables' folder

## Stock Price Data

In [40]:
# Ticker universe of each size bucket = the column labels of its pivoted RCV table.
big_tickers, mid_tickers, small_tickers = (
    big_sent_tables['big_RCV'].columns,
    mid_sent_tables['mid_RCV'].columns,
    small_sent_tables['small_RCV'].columns,
)

In [34]:
# Instruments to download: every big-cap ticker that has a column in the sentiment tables.
big_tickers = big_sent_tables['big_RCV'].columns

# Request prices over the span of the sentiment data: first to last index entry of the RCV table.
start_date = big_sent_tables['big_RCV'].index[0].strftime('%Y-%m-%d')
end_date = big_sent_tables['big_RCV'].index[-1].strftime('%Y-%m-%d')

# Use pandas_datareader's DataReader with the 'yahoo' source and keep only the 'Adj Close' field.
big_panel_data = web.DataReader(big_tickers, 'yahoo', start_date, end_date)['Adj Close']
big_panel_data = pd.DataFrame(big_panel_data)

# to_csv does not create missing folders, so make sure the 'Tables' output folder exists first.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)
big_panel_data.to_csv(os.path.join(PATH, 'Tables', 'big_prices.csv'))  #Store in csv format in the 'Tables' folder
big_panel_data.head()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,4.826835,12.575917,,32.344158,22.006762,37.695107,19.836969,5.48,179.029999,41.646698,...,18.9,43.937672,54.859112,20.828667,23.966467,24.743982,21.174944,47.71207,12.974882,56.468849
2012-01-04,4.741989,12.643497,,32.454609,21.921402,37.653576,19.71335,5.46,177.509995,41.672611,...,18.52,44.577663,54.622379,20.821117,23.538284,24.420115,21.27177,47.221729,12.832223,56.481983
2012-01-05,5.156796,12.783868,,32.700993,21.870958,37.930435,19.705111,5.46,177.610001,42.156384,...,18.389999,44.876331,54.089779,21.130756,23.715145,24.251963,21.614376,46.99239,12.757505,56.311283
2012-01-06,5.279352,12.917508,,32.78595,21.67309,37.985794,19.400179,5.43,182.610001,41.698524,...,18.209999,45.038471,54.356079,20.957052,23.435892,24.18256,21.554794,46.660221,12.920539,55.891048
2012-01-09,5.392481,12.897017,,32.768948,21.669199,37.840454,19.77928,5.59,178.559998,41.802185,...,17.93,44.987259,54.333881,21.327106,23.2474,24.207788,21.822927,46.802574,12.764293,56.140568


In [31]:
#Same procedure for mid and small: download 'Adj Close' prices for every ticker in the bucket
#over the span of that bucket's sentiment data, and store them as csv in the 'Tables' folder.

# to_csv does not create missing folders, so make sure the 'Tables' output folder exists first.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)

#Mid-caps
mid_tickers = mid_sent_tables['mid_RCV'].columns
# start_date / end_date are reassigned here: first and last index entry of the mid-cap RCV table.
start_date = mid_sent_tables['mid_RCV'].index[0].strftime('%Y-%m-%d')
end_date = mid_sent_tables['mid_RCV'].index[-1].strftime('%Y-%m-%d')
mid_panel_data = web.DataReader(mid_tickers, 'yahoo', start_date, end_date)['Adj Close']
mid_panel_data = pd.DataFrame(mid_panel_data)
mid_panel_data.to_csv(os.path.join(PATH, 'Tables', 'mid_prices.csv'))  #Store in csv format in the 'Tables' folder

#Small-caps
small_tickers = small_sent_tables['small_RCV'].columns
# start_date / end_date are reassigned again, so after this cell they hold the small-cap range.
start_date = small_sent_tables['small_RCV'].index[0].strftime('%Y-%m-%d')
end_date = small_sent_tables['small_RCV'].index[-1].strftime('%Y-%m-%d')
small_panel_data = web.DataReader(small_tickers, 'yahoo', start_date, end_date)['Adj Close']
small_panel_data = pd.DataFrame(small_panel_data)
small_panel_data.to_csv(os.path.join(PATH, 'Tables', 'small_prices.csv'))  #Store in csv format in the 'Tables' folder



In [35]:
# Display the column labels (stock tickers) of the mid-cap RCV table.
mid_sent_tables['mid_RCV'].columns

Index(['AB', 'ACC', 'ADS', 'AI', 'AR', 'BMI', 'CC', 'CMC', 'CNS', 'CWT', 'EPC',
       'FL', 'FORM', 'GPS', 'LEG', 'M', 'MAC', 'MD', 'NOV', 'PCI', 'PNW',
       'RMBS', 'SEB', 'SF', 'SIX', 'SM', 'TCO', 'TRN', 'VAC', 'VC', 'WIRE'],
      dtype='object', name='stock')