In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl

from pandas_datareader import data as pdr

import datetime as dt
import yfinance as yf

from textblob import TextBlob

import spacy

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

import warnings

import csv

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\fokta\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
np.set_printoptions(precision = 3)

pd.set_option("display.float_format", lambda x: "%.3f" % x)

plt.style.use("ggplot")

mpl.rcParams["axes.grid"] = True
mpl.rcParams["grid.color"] = "grey"
mpl.rcParams["grid.alpha"] = 0.25

mpl.rcParams["axes.facecolor"] = "white"

mpl.rcParams["legend.fontsize"] = 14

In [3]:
list_of_companies = pd.read_csv("list_of_companies.csv")
list_of_companies = list_of_companies.drop(["Unnamed: 0"], axis = 1)
list_of_companies["month"] = pd.to_datetime(list_of_companies["dates"]).dt.month

In [4]:
spx_500 = pd.read_csv("SPX_500.csv")
spx_500_companies = spx_500.drop(["Unnamed: 0"], axis = 1)
spx_500_companies["month"] = pd.to_datetime(spx_500_companies["dates"]).dt.month

In [5]:
companies_scores_tf = pd.read_csv("firm_scores_TF.csv")
companies_scores_tf = companies_scores_tf.drop(["time"], axis = 1)
companies_scores_tf["year"] = list_of_companies["year"]
companies_scores_tf["month"] = list_of_companies["month"]

In [6]:
companies_scores_tf.insert(0, "tickers", list_of_companies["tickers"])

In [7]:
spx_companies_scores = companies_scores_tf[companies_scores_tf["firm_id"].isin(spx_500_companies["company_names"])]

In [8]:
spx_yearly_scores = (spx_companies_scores.groupby(["tickers", "year"]).mean()).drop(["document_length", "month"], axis = 1).sum(axis = 1)

In [9]:
ticker_dataframe = pd.DataFrame()

def yearly_data(data):
    for i in data.columns:
        ticker_dataframe["top_{}".format(i)] = data.sort_values(by = i, ascending = False)[i].dropna().iloc[ :30].index.to_list()
        ticker_dataframe["bot_{}".format(i)] = data.sort_values(by = i, ascending = False)[i].dropna().iloc[-30: ].index.to_list()
    return ticker_dataframe

In [10]:
list_of_tickers = yearly_data(spx_yearly_scores.unstack())

In [11]:
market_sentiment = pd.read_csv("pca_pls.csv")
market_sentiment["date"] = pd.to_datetime(market_sentiment["Unnamed: 0"], format = "%Y%m")
market_sentiment = market_sentiment.drop(["Unnamed: 0"], axis = 1)

In [12]:
positive_news = pd.read_csv(r"Pos_News.csv", sep = "|")
positive_news.dropna(inplace = True)

In [13]:
spx_sentiment = positive_news[positive_news["ticker"].isin(spx_500_companies["tickers"])]
spx_sentiment["year"] = pd.to_datetime(spx_sentiment['date']).dt.year
spx_sentiment["month"] = pd.to_datetime(spx_sentiment['date']).dt.month
spx_sentiment_higher_than_70 = spx_sentiment[spx_sentiment["sent_lex"] > 0.7]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spx_sentiment["year"] = pd.to_datetime(spx_sentiment['date']).dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spx_sentiment["month"] = pd.to_datetime(spx_sentiment['date']).dt.month


In [14]:
new_date = []

for i in spx_sentiment_higher_than_70["date"]:
    new_date.append(i[ :-3])

spx_sentiment_higher_than_70["dummy_date"] = pd.to_datetime(new_date, format = "%Y-%m")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spx_sentiment_higher_than_70["dummy_date"] = pd.to_datetime(new_date, format = "%Y-%m")


In [15]:
merged_data = pd.merge(spx_sentiment_higher_than_70, 
                  market_sentiment,
                 left_on = 'dummy_date',   
                 right_on = 'date')
merged_data = merged_data.drop(["date_y","dummy_date"], axis = 1)
merged_data.rename(columns = {"date_x" : "date"}, inplace = True)
merged_data = merged_data[merged_data["pls"] > 0]

In [16]:
dictionary = {}

for tickers in list_of_tickers.columns[ :-2]:
    ticker_filtering = merged_data[merged_data["ticker"].isin(list_of_tickers[tickers])]
    year_filter = ticker_filtering[ticker_filtering["year"] < (int(tickers[-4:]) + 2)]
    year_filter_2 = year_filter[year_filter["year"] > int(tickers[-4:])]
    dictionary[tickers] = pd.DataFrame(year_filter_2)

# Determining holding period

In [17]:
pct_change_in_event_return = pd.DataFrame(spx_sentiment_higher_than_70.iloc[ : , 6:9].T.diff().mean(axis =1))*100
pct_change_in_event_return.columns = ["pct_change_in_event_return"]

# Top 30 firms

In [18]:
def getting_data(tickers, start, end): 
    df = yf.download(tickers.to_list(),
                  start = start,
                   end = end)
    return df

In [19]:
dictionary_2 = {}

for tickers in list_of_tickers.columns[ :-2]:
    data = getting_data(list_of_tickers[tickers],dt.datetime(2015,1,1),dt.datetime(2019,12,29))["Close"]
    dictionary_2[tickers] = pd.DataFrame(data)

[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed
[*********************100%***********************]  30 of 30 completed


In [20]:
dictionary_3 = {}

for name, data in dictionary_2.items():
    dictionary_2[name].index = dictionary_2[name].index.strftime("%Y-%m-%d")
    dataframe = dictionary_2[name].T[dictionary_2[name].T.index.isin(dictionary_2[name].T.index[dictionary_2[name].T.index.isin(dictionary[name]["ticker"])])]
    dictionary_3[name] = pd.DataFrame(dataframe)

In [21]:
def event_return(dataframe, shift_value, main_dataframe,groupby_dataframe):
    returns = np.log(dataframe / dataframe.shift(1))
    returns_next_day = returns.shift(shift_value)
    first_filtering = returns_next_day[returns_next_day.index.isin(main_dataframe)]
    second_filtering = first_filtering.unstack()[first_filtering.unstack().index.isin(groupby_dataframe)]
    making_list = (second_filtering.sort_index().values.tolist())
    return making_list

In [22]:
shift_value =  [-1,-4]
empty_list = []

for i in shift_value:
    for name, data in dictionary_2.items():
        returns = event_return(dictionary_3[name].T, i, dictionary[name]["date"], dictionary[name].groupby(["ticker","date"]).sum().index)
        empty_list.append(returns)

In [23]:
top_30_returns = pd.concat([pd.Series(empty_list[0]) * 1 + pd.Series(empty_list[8]) * -1,\
                            pd.Series(empty_list[2]) * 1 + pd.Series(empty_list[10]) * -1,\
                            pd.Series(empty_list[4]) * 1 + pd.Series(empty_list[12]) * -1,\
                            pd.Series(empty_list[6]) * 1 + pd.Series(empty_list[14]) * -1], axis = 1)
top_30_returns.columns = ["2016","2017","2018","2019"]

In [24]:
def sharpe(data):
    sharpe_ratio = (data.mean()/data.std()) *252**0.5
    return sharpe_ratio

In [25]:
cumulative_returns = []
sharpe_ratios = []

for i in top_30_returns.columns:
    cumulative_return = top_30_returns.cumsum().apply(np.exp)[i].dropna().iloc[-1] - 1
    sharpe_ratio = sharpe(top_30_returns[i])
    cumulative_returns.append(cumulative_return)
    sharpe_ratios.append(sharpe_ratio)

In [26]:
bottom_30_returns = pd.concat([pd.Series(empty_list[1]) * 1 + pd.Series(empty_list[9]) * -1,\
                            pd.Series(empty_list[3]) * 1 + pd.Series(empty_list[11]) * -1,\
                            pd.Series(empty_list[5]) * 1 + pd.Series(empty_list[13]) * -1,\
                            pd.Series(empty_list[7]) * 1 + pd.Series(empty_list[15]) * -1], axis = 1)
bottom_30_returns.columns = ["2016","2017","2018","2019"]

In [27]:
cumulative_returns_bottom = []
sharpe_ratios_bottom = []

for i in bottom_30_returns.columns:
    cumulative_return = bottom_30_returns.cumsum().apply(np.exp)[i].dropna().iloc[-1] - 1
    sharpe_ratio = sharpe(bottom_30_returns[i])
    cumulative_returns_bottom.append(cumulative_return)
    sharpe_ratios_bottom.append(sharpe_ratio)

In [28]:
final_table = pd.concat([pd.Series(cumulative_returns), pd.Series(sharpe_ratios), pd.Series(cumulative_returns_bottom), pd.Series(sharpe_ratios_bottom)], axis =1).T
final_table.index = ["top_30_cumulative_return", "top_30_sharpe_ratio", "bottom_30_cumulative_return", "bottom_30_sharpe_ratio" ]
final_table.columns = ["2016", "2017", "2018", "2019"]
final_table

Unnamed: 0,2016,2017,2018,2019
top_30_cumulative_return,-0.003,0.198,-0.018,0.127
top_30_sharpe_ratio,-0.033,1.479,-0.117,0.682
bottom_30_cumulative_return,-0.128,-0.121,-0.228,-0.011
bottom_30_sharpe_ratio,-2.493,-2.484,-8.221,-0.13
