In [3]:
!pip install pandas-datareader

In [80]:
import numpy as np
import pandas as pd
import pandas_datareader
import datetime as dt
import os

# Project root: input and output folders are resolved relative to the directory the notebook runs in.
PATH = os.getcwd()

# Cells below write their CSVs into PATH/Tables; create the folder up front so that
# DataFrame.to_csv does not fail on a fresh checkout where it does not exist yet.
os.makedirs(os.path.join(PATH, 'Tables'), exist_ok=True)

In [5]:
# Grab sentiments: one CSV per market-cap bucket, all stored in the 'dataSent12_21Good' folder.
def _read_sentiment(file_name):
    """Read one sentiment CSV and convert its 'date' column to plain datetime.date objects."""
    raw = pd.read_csv(os.path.join(PATH, 'dataSent12_21Good', file_name))
    return raw.assign(date=pd.to_datetime(raw['date']).dt.date)


big_sent_all = _read_sentiment('USbig_Sent12_21.csv')      # Big-cap
mid_sent_all = _read_sentiment('USmed_Sent12_21.csv')      # Mid-cap
small_sent_all = _read_sentiment('USsmall_Sent12_21.csv')  # Small-cap

## Sentiment Indicators

In [6]:
# The sentiment dataset in the 'dataSent12_21Good' folder is in long format, with one column per sentiment indicator.
# Build one table per indicator and pivot it into wide format (rows = dates, columns = stocks).
# All stocks are aligned by date; a stock with no data for a given date gets NaN in that row.

big_sent_tables = {}
big_sent_nan_tables = {}
tables_dir = os.path.join(PATH, 'Tables')

# The indicator columns are everything after the first two columns of the long table.
for indicator in big_sent_all.columns[2:]:
    big_sent_pivot = big_sent_all.pivot(index="date", columns="stock", values=indicator)
    big_sent_pivot.index = pd.to_datetime(big_sent_pivot.index)
    big_sent_tables[f'big_{indicator}'] = big_sent_pivot
    # Keep a csv copy of every wide table in the 'Tables' folder
    big_sent_pivot.to_csv(os.path.join(tables_dir, f'big_{indicator}.csv'))

print('List of tables created: ',big_sent_tables.keys())
print('RCV Table:')
big_sent_tables['big_RCV'].head()

List of tables created:  dict_keys(['big_RCV', 'big_RVT', 'big_positivePartscr', 'big_negativePartscr', 'big_splogscr', 'big_linscr'])
RCV Table:


stock,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,,,,,,,,,,,...,0.0,,,,,,,,,0.0
2012-01-03,,0.0,,0.0,0.0,,,,,,...,2.632,,,0.0,,0.0,0.0,0.0,,33.333
2012-01-04,0.0,41.667,,44.444,-14.286,0.0,0.0,0.0,0.0,0.0,...,47.692,0.0,0.0,30.0,0.0,22.222,0.0,37.5,0.0,43.478
2012-01-05,25.0,45.455,,0.0,33.333,38.889,13.333,-7.692,4.167,-9.091,...,38.571,0.0,25.0,57.333,38.462,49.383,51.852,48.148,-20.0,38.889
2012-01-06,46.666,45.395,,-33.333,-73.333,57.384,-60.0,-43.75,41.935,-35.714,...,26.25,33.335,-57.142,27.941,-28.571,14.706,36.765,54.412,,20.0


In [30]:
# Same procedure for mid and small-cap: one wide (date x stock) table per sentiment indicator,
# kept in a dict and also written as csv to the 'Tables' folder.
tables_dir = os.path.join(PATH, 'Tables')

# Mid companies
mid_sent_tables = {}
mid_sent_nan_tables = {}
for indicator in mid_sent_all.columns[2:]:
    mid_sent_pivot = mid_sent_all.pivot(index="date", columns="stock", values=indicator)
    mid_sent_pivot.index = pd.to_datetime(mid_sent_pivot.index)
    mid_sent_tables[f'mid_{indicator}'] = mid_sent_pivot
    mid_sent_pivot.to_csv(os.path.join(tables_dir, f'mid_{indicator}.csv'))

# Small companies
small_sent_tables = {}
small_sent_nan_tables = {}
for indicator in small_sent_all.columns[2:]:
    small_sent_pivot = small_sent_all.pivot(index="date", columns="stock", values=indicator)
    small_sent_pivot.index = pd.to_datetime(small_sent_pivot.index)
    small_sent_tables[f'small_{indicator}'] = small_sent_pivot
    small_sent_pivot.to_csv(os.path.join(tables_dir, f'small_{indicator}.csv'))

## Stock Price Data

In [57]:
# The ticker symbols of each bucket are the column labels of its wide RCV sentiment table.
big_tickers, mid_tickers, small_tickers = (
    big_sent_tables['big_RCV'].columns,
    mid_sent_tables['mid_RCV'].columns,
    small_sent_tables['small_RCV'].columns,
)

In [81]:
# Instruments to download: every big-cap ticker that has a column in the RCV sentiment table.
big_tickers = big_sent_tables['big_RCV'].columns

# Request prices for the span covered by the sentiment data (first to last date of the RCV table).
big_rcv_dates = big_sent_tables['big_RCV'].index
start_date = big_rcv_dates[0].strftime('%Y-%m-%d')
end_date = big_rcv_dates[-1].strftime('%Y-%m-%d')

# Use pandas-datareader's Yahoo reader with weekly interval and keep only the adjusted close.
big_reader = pandas_datareader.yahoo.daily.YahooDailyReader(
    big_tickers, start=start_date, end=end_date, interval='w'
)
big_panel_data = pd.DataFrame(big_reader.read()['Adj Close'])
big_panel_data.to_csv(os.path.join(PATH, 'Tables', 'big_prices.csv'))  # Store in csv format in the 'Tables' folder
big_panel_data.head()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,5.279352,12.917507,,32.785938,21.673092,37.985821,19.400173,5.43,182.610001,41.543419,...,18.209999,44.973885,54.356098,21.127947,23.387878,24.163364,21.554792,46.596504,12.920537,55.891048
2012-01-09,5.628167,12.838301,,32.114765,21.506245,38.034264,20.562208,5.66,178.419998,42.985676,...,18.43,44.905727,54.859089,22.102499,23.394844,24.854429,22.053812,47.022987,13.735711,55.733452
2012-01-16,6.005263,12.853289,,34.060337,21.820961,39.307827,21.139103,6.42,190.929993,43.227558,...,19.33,44.539322,55.791195,21.881701,23.357691,24.886356,22.746492,48.183941,14.027818,57.447227
2012-01-23,7.711626,13.678369,,33.117287,21.531368,38.304199,20.80945,6.82,195.369995,43.063435,...,23.09,43.47419,56.249825,21.211702,23.466835,23.762419,22.04637,47.947014,13.905539,56.357254
2012-01-30,9.088028,14.057571,,33.329689,21.523539,38.421871,22.391796,7.08,187.679993,45.136692,...,24.969999,43.721306,56.738041,22.231934,24.855572,24.164743,22.813524,48.98951,14.469367,55.759727


In [87]:
# Same procedure for mid and small
# Mid-caps: tickers and date span are taken from the mid-cap RCV sentiment table.
mid_tickers = mid_sent_tables['mid_RCV'].columns
mid_rcv_dates = mid_sent_tables['mid_RCV'].index
start_date = mid_rcv_dates[0].strftime('%Y-%m-%d')
end_date = mid_rcv_dates[-1].strftime('%Y-%m-%d')

# Weekly Yahoo data via pandas-datareader, reduced to the adjusted close.
mid_reader = pandas_datareader.yahoo.daily.YahooDailyReader(
    mid_tickers, start=start_date, end=end_date, interval='w'
)
mid_panel_data = pd.DataFrame(mid_reader.read()['Adj Close'])
mid_panel_data.to_csv(os.path.join(PATH, 'Tables', 'mid_prices.csv'))  # Store in csv format in the 'Tables' folder



In [None]:
# Small-caps: tickers and date span are taken from the small-cap RCV sentiment table.
small_tickers = small_sent_tables['small_RCV'].columns
small_rcv_dates = small_sent_tables['small_RCV'].index
start_date = small_rcv_dates[0].strftime('%Y-%m-%d')
end_date = small_rcv_dates[-1].strftime('%Y-%m-%d')

# Weekly Yahoo data via pandas-datareader, reduced to the adjusted close.
small_reader = pandas_datareader.yahoo.daily.YahooDailyReader(
    small_tickers, start=start_date, end=end_date, interval='w'
)
small_panel_data = pd.DataFrame(small_reader.read()['Adj Close'])
small_panel_data.to_csv(os.path.join(PATH, 'Tables', 'small_prices.csv'))  # Store in csv format in the 'Tables' folder

In [85]:
# Per-ticker summary statistics (count, mean, std, min, quartiles, max) of the downloaded big-cap prices.
big_panel_data.describe()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
count,518.0,518.0,466.0,518.0,518.0,518.0,518.0,518.0,518.0,518.0,...,518.0,518.0,518.0,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,29.853003,46.20852,62.110322,76.403275,55.778054,100.567579,43.311935,23.769633,1228.443396,86.017446,...,54.004903,173.151839,98.232928,39.183136,106.18131,41.218844,39.71623,82.971628,24.377418,59.714999
std,13.165709,37.902158,23.957483,22.562541,29.474035,48.648669,9.6428,31.566222,1038.071789,30.405968,...,20.889458,109.244225,35.902708,9.563197,63.551685,9.103498,8.890773,29.380165,4.951926,8.584845
min,5.279352,12.119164,23.085785,30.683222,21.506245,35.505142,18.120352,1.67,178.419998,41.543419,...,17.780001,43.47419,52.966904,21.127947,23.357691,23.762419,20.970751,46.430664,12.920537,28.654726
25%,17.079329,21.081757,43.10595,64.160915,33.962798,59.090292,38.240261,3.65,332.880005,65.31468,...,37.3525,72.442148,78.43961,32.884825,51.022538,34.456874,32.530493,61.636763,21.400327,57.597894
50%,31.701315,29.332283,56.279352,79.77747,41.31975,87.409889,45.947001,9.84,809.265015,78.896275,...,53.095001,148.433441,90.945656,36.564413,79.447758,39.436762,42.324791,68.627022,24.132571,61.392267
75%,40.459655,51.459458,79.117142,88.960619,75.740541,138.991325,50.42219,29.515001,1816.377502,100.60191,...,69.429998,246.884869,104.375631,46.582205,159.927586,50.656565,46.344866,102.603951,27.132939,65.256176
max,56.988731,164.560349,116.908707,126.690041,127.927086,235.679321,59.332619,155.410004,3719.340088,186.083435,...,96.699997,458.973022,212.720993,62.230099,247.840775,58.181309,58.148552,150.842651,37.384274,72.813065


## Log Returns

In [92]:
# Log return: ln(P_t) - ln(P_{t-1}), computed row over row for every ticker.
# The first row is NaN because it has no preceding price.
big_log_ret = np.log(big_panel_data).diff()
big_log_ret.to_csv(os.path.join(PATH, 'Tables', 'big_log_ret.csv'))  # Store in csv format in the 'Tables' folder

# Alternative (disabled): simple percentage change instead of log return.
#big_pct_change = big_panel_data.pct_change()
#big_pct_change.to_csv(os.path.join(PATH,'Tables','big_pct_change.csv'))

big_log_ret.head()

Symbols,AAL,AAPL,ABBV,ABC,ABT,ADP,AIG,AMD,AMZN,AXP,...,UAL,UNH,UPS,USB,V,VZ,WFC,WMT,WY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,,,,,,,,,,,...,,,,,,,,,,
2012-01-09,0.06398,-0.006151,,-0.020684,-0.007728,0.001274,0.058173,0.041485,-0.023212,0.034128,...,0.012009,-0.001517,0.009211,0.045094,0.000298,0.028198,0.022887,0.009111,0.061181,-0.002824
2012-01-16,0.064852,0.001167,,0.058818,0.014528,0.032936,0.02767,0.125994,0.067767,0.005611,...,0.047679,-0.008193,0.016848,-0.01004,-0.001589,0.001284,0.030925,0.024389,0.021043,0.030286
2012-01-23,0.250093,0.062216,,-0.028078,-0.01336,-0.025864,-0.015717,0.060441,0.022988,-0.003804,...,0.177741,-0.024205,0.008187,-0.031098,0.004662,-0.046214,-0.031263,-0.004929,-0.008755,-0.019156
2012-01-30,0.164229,0.027345,,0.006393,-0.000364,0.003067,0.073287,0.037414,-0.040157,0.047021,...,0.078275,0.005668,0.008642,0.046977,0.057494,0.016789,0.034206,0.02151,0.039747,-0.010659


In [90]:
# Same for mid and small-caps
# Log return: ln(P_t) - ln(P_{t-1}), row over row; the first row of each table is NaN.
mid_log_ret = np.log(mid_panel_data).diff()
mid_log_ret.to_csv(os.path.join(PATH, 'Tables', 'mid_log_ret.csv'))  # Store in csv format in the 'Tables' folder

small_log_ret = np.log(small_panel_data).diff()
small_log_ret.to_csv(os.path.join(PATH, 'Tables', 'small_log_ret.csv'))  # Store in csv format in the 'Tables' folder

# Alternative (disabled): simple percentage change instead of log return.
#mid_pct_change = mid_panel_data.pct_change()
#mid_pct_change.to_csv(os.path.join(PATH,'Tables','mid_pct_change.csv'))
#small_pct_change = small_panel_data.pct_change()
#small_pct_change.to_csv(os.path.join(PATH,'Tables','small_pct_change.csv'))