In [3]:
!pip install pandas-datareader

In [11]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import os

# Base directory for all relative input/output paths: the directory the notebook is run from.
PATH = os.getcwd()

# Later cells write their CSV results into <PATH>/Tables and DataFrame.to_csv does not
# create missing folders, so make sure the output folder exists before anything is saved.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)

In [5]:
# Load the raw sentiment files, one per market-cap bucket.
def _read_sentiment(csv_name):
    """Read one sentiment CSV from the 'dataSent12_21Good' folder.

    The 'date' column is parsed and reduced to plain `datetime.date` values
    (any time-of-day component is dropped). Returns the resulting DataFrame.
    """
    raw = pd.read_csv(os.path.join(PATH, 'dataSent12_21Good', csv_name))
    return raw.assign(date=pd.to_datetime(raw['date']).dt.date)


big_sent_all = _read_sentiment('USbig_Sent12_21.csv')      # big-cap
mid_sent_all = _read_sentiment('USmed_Sent12_21.csv')      # mid-cap
small_sent_all = _read_sentiment('USsmall_Sent12_21.csv')  # small-cap

## Sentiment Indicators

In [6]:
# The sentiment data in the 'dataSent12_21Good' folder is in long format, with one column
# per sentiment indicator. For every indicator we build a wide table (rows = dates,
# columns = stocks); a stock with no observation on a given date gets NaN in that cell.
# NOTE(review): columns[2:] assumes the first two columns are the 'date'/'stock' keys - confirm.

big_sent_nan_tables = {}  # created here but not filled in this cell
big_sent_tables = {}

# Step 1: pivot each indicator column into its own date x stock table.
for indicator in big_sent_all.columns[2:]:
    wide = big_sent_all.pivot(index="date", columns="stock", values=indicator)
    wide.index = pd.to_datetime(wide.index)  # plain dates -> DatetimeIndex
    big_sent_tables[f'big_{indicator}'] = wide

# Step 2: persist every table as <table name>.csv inside the 'Tables' folder.
for table_name, wide in big_sent_tables.items():
    wide.to_csv(os.path.join(PATH, 'Tables', f'{table_name}.csv'))

print('List of tables created: ',big_sent_tables.keys())
print('RCV Table:')
big_sent_tables['big_RCV'].head()

List of tables created:  dict_keys(['big_RCV', 'big_RVT', 'big_positivePartscr', 'big_negativePartscr', 'big_splogscr', 'big_linscr'])
RCV Table:


stock,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,,,,,,,,,,,...,0.0,,,,,,,,,0.0
2012-01-03,,0.0,,0.0,0.0,,,,,,...,2.632,,,0.0,,0.0,0.0,0.0,,33.333
2012-01-04,0.0,41.667,,44.444,-14.286,0.0,0.0,0.0,0.0,0.0,...,47.692,0.0,0.0,30.0,0.0,22.222,0.0,37.5,0.0,43.478
2012-01-05,25.0,45.455,,0.0,33.333,38.889,13.333,-7.692,4.167,-9.091,...,38.571,0.0,25.0,57.333,38.462,49.383,51.852,48.148,-20.0,38.889
2012-01-06,46.666,45.395,,-33.333,-73.333,57.384,-60.0,-43.75,41.935,-35.714,...,26.25,33.335,-57.142,27.941,-28.571,14.706,36.765,54.412,,20.0


In [30]:
# Build the per-indicator wide tables for the mid-cap and small-cap data as well.
def _pivot_and_store(sent_all, prefix):
    """Pivot every indicator column of a long sentiment frame into a date x stock table.

    Each table is stored in the returned dict under `prefix + indicator` and written
    to '<that name>.csv' in the 'Tables' folder.
    NOTE(review): columns[2:] assumes the first two columns are the 'date'/'stock' keys - confirm.
    """
    tables = {}
    for indicator in sent_all.columns[2:]:
        wide = sent_all.pivot(index="date", columns="stock", values=indicator)
        wide.index = pd.to_datetime(wide.index)  # plain dates -> DatetimeIndex
        table_name = prefix + indicator
        tables[table_name] = wide
        wide.to_csv(os.path.join(PATH, 'Tables', '{}.csv'.format(table_name)))
    return tables


# Mid-cap companies
mid_sent_nan_tables = {}  # created here but not filled in this cell
mid_sent_tables = _pivot_and_store(mid_sent_all, 'mid_')

# Small-cap companies
small_sent_nan_tables = {}  # created here but not filled in this cell
small_sent_tables = _pivot_and_store(small_sent_all, 'small_')

## Stock Price Data

In [40]:
# The column labels of each wide RCV table are the stock symbols of that cap bucket.
big_tickers, mid_tickers, small_tickers = (
    big_sent_tables['big_RCV'].columns,
    mid_sent_tables['mid_RCV'].columns,
    small_sent_tables['small_RCV'].columns,
)

In [54]:
# Instruments to download: every big-cap stock that has a column in the RCV sentiment table.
big_tickers = big_sent_tables['big_RCV'].columns

# Download window: first and last date of the RCV table's index, as 'YYYY-MM-DD' strings.
start_date, end_date = (
    stamp.strftime('%Y-%m-%d') for stamp in big_sent_tables['big_RCV'].index[[0, -1]]
)

# Fetch the data with pandas_datareader and keep only the adjusted close prices.
# NOTE(review): needs network access and a working 'yahoo' reader - confirm with the installed version.
big_panel_data = pd.DataFrame(
    web.DataReader(big_tickers, 'yahoo', start_date, end_date)['Adj Close']
)
big_panel_data.to_csv(os.path.join(PATH, 'Tables', 'big_prices.csv'))  # saved in the 'Tables' folder
big_panel_data.head()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,4.826837,12.575914,,32.344151,22.00676,37.695099,19.836967,5.48,179.029999,41.64669,...,18.9,43.874683,54.859089,20.998516,23.917355,25.045921,21.17494,47.646908,12.974881,56.468861
2012-01-04,4.74199,12.643499,,32.45459,21.921396,37.65358,19.713348,5.46,177.509995,41.672607,...,18.52,44.513767,54.622387,20.990902,23.490055,24.718109,21.27177,47.157242,12.832222,56.482006
2012-01-05,5.156796,12.783869,,32.701008,21.870964,37.930439,19.705109,5.46,177.610001,42.156361,...,18.389999,44.811993,54.089775,21.303066,23.666553,24.547903,21.61438,46.928204,12.757502,56.311279
2012-01-06,5.279351,12.917507,,32.785946,21.673082,37.985809,19.400179,5.43,182.610001,41.698524,...,18.209999,44.973885,54.356098,21.127947,23.387878,24.477661,21.554792,46.596504,12.920537,55.891048
2012-01-09,5.392481,12.897018,,32.768963,21.669209,37.840454,19.77928,5.59,178.559998,41.802189,...,17.93,44.92276,54.333897,21.50102,23.199772,24.503195,21.822926,46.738667,12.764295,56.140572


In [31]:
# Download and store adjusted close prices for the mid-cap and small-cap stocks.
def _fetch_adj_close(tickers, start, end, csv_name):
    """Download 'Adj Close' prices for `tickers` between `start` and `end` from the 'yahoo' source.

    The prices are written to `csv_name` inside the 'Tables' folder and returned as a DataFrame.
    NOTE(review): needs network access and a working 'yahoo' reader - confirm with the installed version.
    """
    prices = pd.DataFrame(web.DataReader(tickers, 'yahoo', start, end)['Adj Close'])
    prices.to_csv(os.path.join(PATH, 'Tables', csv_name))
    return prices


# Mid-caps: symbols and date window come from the mid-cap RCV sentiment table.
mid_tickers = mid_sent_tables['mid_RCV'].columns
start_date, end_date = (
    stamp.strftime('%Y-%m-%d') for stamp in mid_sent_tables['mid_RCV'].index[[0, -1]]
)
mid_panel_data = _fetch_adj_close(mid_tickers, start_date, end_date, 'mid_prices.csv')

# Small-caps: symbols and date window come from the small-cap RCV sentiment table.
small_tickers = small_sent_tables['small_RCV'].columns
start_date, end_date = (
    stamp.strftime('%Y-%m-%d') for stamp in small_sent_tables['small_RCV'].index[[0, -1]]
)
small_panel_data = _fetch_adj_close(small_tickers, start_date, end_date, 'small_prices.csv')



In [36]:
# Per-ticker summary statistics (count, mean, std, min, quartiles, max) of the big-cap adjusted close prices.
big_panel_data.describe()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
count,2496.0,2496.0,2246.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,...,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0,2496.0
mean,29.864295,46.16011,61.997998,76.416241,55.659758,100.344836,43.323222,23.645052,1228.320244,85.911409,...,53.956466,172.294392,98.166874,38.834891,106.073677,40.715786,39.70327,82.84187,24.365201,59.779399
std,13.140682,37.722773,23.836315,22.53355,29.300164,48.342182,9.656086,31.23251,1038.245263,30.266216,...,20.828467,108.272148,35.709151,9.478461,63.411553,8.97746,8.88929,29.076663,4.94491,8.571622
min,4.741989,12.119164,22.828451,30.649113,21.190903,35.505131,17.696321,1.62,175.929993,41.646698,...,17.48,42.964878,53.17873,20.821117,23.063564,23.475946,20.667673,45.666851,12.451621,27.525698
25%,17.046335,21.057366,42.971786,64.175076,33.998127,59.067038,37.948011,3.64,332.405006,65.417988,...,36.995,72.265486,78.47537,32.564281,51.106833,34.01168,32.579025,61.912704,21.324682,57.67201
50%,31.918802,29.300717,56.089691,79.780987,41.15303,86.877579,45.971624,9.76,810.26001,79.102673,...,53.110001,147.887978,90.934677,36.255188,79.539494,39.075941,42.306732,68.747372,24.10357,61.477324
75%,40.464745,51.530746,78.932829,88.812967,75.509634,138.59026,50.565258,29.547501,1820.107483,100.625977,...,69.524998,244.878498,104.297586,46.184116,159.994114,50.101741,46.282361,101.492723,27.162333,65.239042
max,56.988731,165.089676,117.112343,127.405067,129.912155,235.718933,60.390545,161.910004,3731.409912,186.083435,...,96.699997,461.474823,215.432541,61.726746,249.741699,57.657818,58.148548,150.26149,38.122612,73.198761


## Log Returns

In [55]:
# Daily log returns: r_t = ln(P_t) - ln(P_{t-1}), i.e. the first difference of the log prices.
# The first row is NaN because it has no previous price.
big_log_ret = np.log(big_panel_data).diff()
big_log_ret.to_csv(os.path.join(PATH, 'Tables', 'big_log_ret.csv'))  # saved in the 'Tables' folder

# Simple percentage returns (big_panel_data.pct_change()) are an alternative; not computed here.

big_log_ret.head()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,,,,,,,,,,,...,,,,,,,,,,
2012-01-04,-0.017735,0.00536,,0.003409,-0.003887,-0.001102,-0.006251,-0.003656,-0.008526,0.000622,...,-0.020311,0.014461,-0.004324,-0.000363,-0.018027,-0.013175,0.004562,-0.01033,-0.011056,0.000233
2012-01-05,0.083859,0.011041,,0.007564,-0.002303,0.007326,-0.000418,0.0,0.000563,0.011542,...,-0.007044,0.006677,-0.009799,0.014762,0.007486,-0.00691,0.015978,-0.004869,-0.00584,-0.003027
2012-01-06,0.023488,0.010399,,0.002594,-0.009089,0.001459,-0.015596,-0.00551,0.027763,-0.01092,...,-0.009836,0.003606,0.004912,-0.008254,-0.011845,-0.002866,-0.002761,-0.007093,0.012699,-0.007491
2012-01-09,0.021202,-0.001587,,-0.000518,-0.000179,-0.003834,0.019353,0.02904,-0.022428,0.002483,...,-0.015496,-0.001137,-0.000409,0.017504,-0.008075,0.001043,0.012363,0.003046,-0.012166,0.004455


In [56]:
# Daily log returns for the mid-cap and small-cap prices: first difference of the log prices.
mid_log_ret = np.log(mid_panel_data).diff()
small_log_ret = np.log(small_panel_data).diff()

# Save both tables in csv format in the 'Tables' folder.
mid_log_ret.to_csv(os.path.join(PATH, 'Tables', 'mid_log_ret.csv'))
small_log_ret.to_csv(os.path.join(PATH, 'Tables', 'small_log_ret.csv'))

# Simple percentage returns (<prices>.pct_change()) are an alternative; not computed here.