## Labeling Sentiment of Annual Filings - Annual Holding Period Return Method

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
import time
import os
import re
import os
import nltk
from nltk.corpus import stopwords
import string

## Read Datasets to Pandas Dataframe

In [2]:
# load the filing master table (one row per annual filing) from disk
master_filing_csv = 'master_filing_df.csv'
master_filing_df = pd.read_csv(master_filing_csv)
# preview only the first rows of the table
display(master_filing_df.head())

Unnamed: 0,CIK,Ticker_Symbol,Company_Name,Data_Date,Filing_Date,Filing_Type,MDA_Placement,File_Name
0,1750,AIR,AAR CORP,2012-05-31,2012-07-19,10-K,10-K,AIR_2012-07-19.txt
1,1750,AIR,AAR CORP,2013-05-31,2013-07-26,10-K,10-K,AIR_2013-07-26.txt
2,1750,AIR,AAR CORP,2014-05-31,2014-07-17,10-K,10-K,AIR_2014-07-17.txt
3,1750,AIR,AAR CORP,2016-05-31,2016-07-13,10-K,10-K,AIR_2016-07-13.txt
4,1750,AIR,AAR CORP,2019-05-31,2019-07-18,10-K,10-K,AIR_2019-07-18.txt


In [3]:
# load the per-filing stock return table from disk
stock_price_csv = 'stock_price_df_copy.csv'
stock_price_df = pd.read_csv(stock_price_csv)
# show the (truncated) table
display(stock_price_df)

Unnamed: 0,Ticker_Symbol,Company_Name,Data_Date,BM_Quintile,F_SCORE,Purchase_Date,Sell_Date,Stock_Purchase_Price,Stock_Sell_Price,Long_Holding_Period_Return_(%),...,Long_Annualized_Holding_Period_Return_(%),Short_Annualized_Holding_Period_Return_(%),SPDR_Purchase_Price,SPDR_Sell_Price,SPDR_Long_Holding_Period_Return_(%),SPDR_Short_Holding_Period_Return_(%),SPDR_Long_Annualized_Holding_Period_Return_(%),SPDR_Short_Annualized_Holding_Period_Return_(%),Long_Excess_Returns_(%),Short_Excess_Returns_(%)
0,AIR,AAR CORP,2012-05-31,Very High,5,2012-07-23,2013-07-23,13.63,25.22,0.850330,...,0.850330,-0.459556,135.0900,169.14,0.252054,-0.201313,0.252054,-0.201313,0.598276,-0.258243
1,AIR,AAR CORP,2013-05-31,Very High,7,2013-08-01,2014-08-01,24.67,26.85,0.088366,...,0.088366,-0.081192,170.6600,192.50,0.127974,-0.113455,0.127974,-0.113455,-0.039607,0.032263
2,AIR,AAR CORP,2014-05-31,Very High,7,2014-07-21,2015-07-21,27.99,29.32,0.047517,...,0.047517,-0.045362,197.3400,211.75,0.073021,-0.068052,0.073021,-0.068052,-0.025504,0.022690
3,AIR,AAR CORP,2016-05-31,Very High,7,2016-07-18,2017-07-18,24.26,36.31,0.496702,...,0.496702,-0.331865,216.4092,245.66,0.135164,-0.119070,0.135164,-0.119070,0.361538,-0.212794
4,AIR,AAR CORP,2019-05-31,Very High,5,2019-07-18,2019-09-03,41.87,43.13,0.030093,...,0.258921,-0.205669,298.8300,290.74,-0.027072,0.027826,-0.191957,0.237558,0.450878,-0.443227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,XRX,XEROX HOLDINGS CORP,2011-12-31,Very High,6,2012-02-27,2013-02-26,8.37,8.02,-0.041816,...,-0.041816,0.043641,137.1600,150.02,0.093759,-0.085722,0.093759,-0.085722,-0.135575,0.129363
1455,XRX,XEROX HOLDINGS CORP,2012-12-31,Very High,6,2013-02-25,2014-02-25,7.89,10.67,0.352345,...,0.352345,-0.260544,149.0000,184.84,0.240537,-0.193897,0.240537,-0.193897,0.111808,-0.066646
1456,XRX,XEROX HOLDINGS CORP,2013-12-31,Very High,6,2014-02-24,2015-02-24,10.78,13.98,0.296846,...,0.296846,-0.228898,184.9100,211.81,0.145476,-0.127001,0.145476,-0.127001,0.151370,-0.101898
1457,XRX,XEROX HOLDINGS CORP,2015-12-31,Very High,4,2016-02-22,2017-02-21,9.59,7.44,-0.224192,...,-0.224192,0.288978,194.7800,236.49,0.214139,-0.176371,0.214139,-0.176371,-0.438331,0.465350


## Define a Function for Textual Preprocessing

In [4]:
# note: in order to conduct lexicon-based sentiment analysis with the loughran and mcdonald dictionary
#       the following preprocessing steps are required
#       - tokenize given text into words
#       - removal of stop words
#       - removal of punctuation
#       - convert tokens into lower-case words
#       - remove empty strings that are classed as tokens
#       - remove ' that precedes any single character or ' or ``

# define preprocessing function
def text_preprocessing(text):
    """Tokenize `text` and strip tokens that carry no lexical content.

    Steps, in order:
      1. tokenize with `nltk.word_tokenize`
      2. lower-case every token
      3. drop English stop words
      4. drop punctuation tokens and empty strings
      5. drop tokens that are only a quote artefact (' plus one letter, '' or ``)

    Parameters
    ----------
    text : str
        Raw text to preprocess.

    Returns
    -------
    list of str
        The remaining lower-case tokens, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    # lower-case BEFORE the stop-word filter: the stop-word list is lower case,
    # so filtering first would let capitalised stop words ("The", "We") through
    tokenized_text = [word.lower() for word in nltk.word_tokenize(text)]
    # remove stop words
    tokenized_text = [word for word in tokenized_text if word not in stop_words]
    # remove punctuation
    # note: `in string.punctuation` is a substring test, so this also drops ''
    #       and any token that is a contiguous slice of string.punctuation
    tokenized_text = [word for word in tokenized_text if word not in string.punctuation]
    # filter empty strings
    tokenized_text = [word for word in tokenized_text if word]
    # drop tokens that become empty once a leading ' + single letter, '' or ``
    # is stripped (e.g. "'s", "''", "``"); longer tokens are kept unchanged
    tokenized_text = [word for word in tokenized_text if re.sub(r'^(\'([a-z]|\')|``)', '', word)]
    return tokenized_text

## Retrieve Length of Text Files and Store in master_filing_df

In [5]:
# note: any file with 250 or fewer preprocessed tokens will be excluded in this study

# note: change directory path to 'Filing Text Files Folder'
# only a missing directory is treated as "already inside it"; any other error
# (e.g. a permission problem) is allowed to surface instead of being hidden
try:
    os.chdir('Filing Text Files')
    print('directory changed')
except FileNotFoundError:
    print('already in directory!')

# count the preprocessed tokens of every filing, in row order
token_counts = []
for file_name in master_filing_df['File_Name']:
    # the context manager closes the file even if reading fails
    with open(file_name, 'r') as filing_file:
        file_text = filing_file.read()
    # length of the preprocessed text = number of remaining tokens
    token_counts.append(len(text_preprocessing(file_text)))

# align the counts with the dataframe index so they can be used as a row mask
token_counts = pd.Series(token_counts, index=master_filing_df.index, dtype='int64')

# keep only the filings with more than 250 preprocessed tokens
master_filing_df = master_filing_df.loc[token_counts > 250]

directory changed


## Label Text Files Through Annual Holding Period Returns

In [6]:
# note: the text files with a annual holding period return greater than 0 will be labeled positive
#       and the text files corresponding to annual holding period returns less than or equal to 0 will be labeled negative

# a filing is matched to its returns on fiscal data date and ticker symbol
merge_keys = ['Data_Date', 'Ticker_Symbol']
# columns copied from stock_price_df, in the order they are appended
return_columns = ['F_SCORE',
                  'Long_Holding_Period_Return_(%)',
                  'Short_Holding_Period_Return_(%)',
                  'Long_Annualized_Holding_Period_Return_(%)',
                  'Short_Annualized_Holding_Period_Return_(%)',
                  'Long_Excess_Returns_(%)',
                  'Short_Excess_Returns_(%)']

# one row per key; if a key is duplicated the last occurrence is used
returns_lookup_df = (stock_price_df[merge_keys + return_columns]
                     .drop_duplicates(subset=merge_keys, keep='last'))

# create a new dataframe to store sentiment and returns
# note: drop nan values from master_filing_df
# note: a left merge keeps every filing in its original order, so the
#       positional index used by later cells is unchanged
annual_holding_period_returns_df = (master_filing_df
                                    .dropna()
                                    .reset_index(drop=True)
                                    .merge(returns_lookup_df, on=merge_keys, how='left', indicator='_match'))

# filings with no matching row in stock_price_df get 0 in every return column
# NOTE(review): these filings are then labeled 'Negative' with a 0 return;
#               confirm that this is intended rather than dropping them
unmatched = annual_holding_period_returns_df['_match'] == 'left_only'
annual_holding_period_returns_df.loc[unmatched, return_columns] = 0
annual_holding_period_returns_df = annual_holding_period_returns_df.drop(columns=['_match'])

# drop rows containing nan values
annual_holding_period_returns_df = annual_holding_period_returns_df.dropna().reset_index(drop=True)

# the merge turns F_SCORE into floats when some filings are unmatched;
# restore the integer dtype when the source column is integer
if pd.api.types.is_integer_dtype(stock_price_df['F_SCORE']):
    annual_holding_period_returns_df['F_SCORE'] = (
        annual_holding_period_returns_df['F_SCORE'].astype(stock_price_df['F_SCORE'].dtype))

# label sentiment of text based on 'Long_Annualized_Holding_Period_Return_(%)' column in annual_holding_period_returns_df
is_positive = annual_holding_period_returns_df['Long_Annualized_Holding_Period_Return_(%)'] > 0
annual_holding_period_returns_df['Sentiment_Class'] = is_positive.map({True: 'Positive', False: 'Negative'})

## Portfolio Analysis and Feasibility Check

In [7]:
# note: various portfolio formations will be analyzed to identify the best combination of stocks
# note: a function will be defined to calculate various statistics of the various portfolios

def portfolio_analysis_statistics(portfolio_df):
    """Print summary statistics of the return columns of a portfolio.

    The number of rows is printed first, followed by one section each for the
    mean, the median and the 10th, 25th, 75th and 90th percentiles. Every
    section lists the six return columns, rounded to three decimals.

    Parameters
    ----------
    portfolio_df : pandas.DataFrame
        Must contain the six return columns listed in `labelled_columns`.

    Returns
    -------
    None
    """
    # printed label -> dataframe column, in print order
    labelled_columns = [
        ('long holding period return:', 'Long_Holding_Period_Return_(%)'),
        ('short holding period return:', 'Short_Holding_Period_Return_(%)'),
        ('long annualized holding period return:', 'Long_Annualized_Holding_Period_Return_(%)'),
        ('short annualized holding period return:', 'Short_Annualized_Holding_Period_Return_(%)'),
        ('long excess return:', 'Long_Excess_Returns_(%)'),
        ('short excess return:', 'Short_Excess_Returns_(%)'),
    ]
    # section title -> statistic applied to each column, in print order
    summaries = [
        ('mean returns', lambda returns: returns.mean()),
        ('median returns', lambda returns: returns.median()),
        ('10th percentile returns', lambda returns: returns.quantile(0.1)),
        ('25th percentile returns', lambda returns: returns.quantile(0.25)),
        ('75th percentile returns', lambda returns: returns.quantile(0.75)),
        ('90th percentile returns', lambda returns: returns.quantile(0.9)),
    ]
    separator = '--------------------------------------------------------------------------------'

    # compute every statistic before printing anything, so that a missing
    # column raises before any output is produced
    sections = []
    for title, statistic in summaries:
        values = [(label, statistic(portfolio_df[column])) for label, column in labelled_columns]
        sections.append((title, values))

    print('number of observations:', portfolio_df.shape[0])
    print('\n')
    for title, values in sections:
        print(separator)
        print(title)
        print(separator)
        for label, value in values:
            print(label, round(value, 3))
        print('\n')
    return None

In [8]:
# statistics for the filings labeled with positive sentiment
print('POSITIVE SENTIMENT PORTFOLIO')
print('\n')
positive_sentiment_mask = annual_holding_period_returns_df['Sentiment_Class'] == "Positive"
portfolio_analysis_statistics(annual_holding_period_returns_df.loc[positive_sentiment_mask])

POSITIVE SENTIMENT PORTFOLIO


number of observations: 427


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: 0.681
short holding period return: -0.252
long annualized holding period return: 3449.602
short annualized holding period return: -0.274
long excess return: 3449.468
short excess return: -0.16


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: 0.265
short holding period return: -0.21
long annualized holding period return: 0.303
short annualized holding period return: -0.232
long excess return: 0.169
short excess return: -0.12


--------------------------------------------------------------------------------
10th percentile returns
-------------------------------------------

In [9]:
# statistics for the filings labeled with negative sentiment
print('NEGATIVE SENTIMENT PORTFOLIO')
print('\n')
negative_sentiment_mask = annual_holding_period_returns_df['Sentiment_Class'] == "Negative"
portfolio_analysis_statistics(annual_holding_period_returns_df.loc[negative_sentiment_mask])

NEGATIVE SENTIMENT PORTFOLIO


number of observations: 428


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: -0.306
short holding period return: 1.014
long annualized holding period return: -0.361
short annualized holding period return: 2.058
long excess return: -0.439
short excess return: 2.126


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: -0.267
short holding period return: 0.365
long annualized holding period return: -0.313
short annualized holding period return: 0.456
long excess return: -0.399
short excess return: 0.541


--------------------------------------------------------------------------------
10th percentile returns
----------------------------------------------

In [10]:
# statistics for every labeled filing, regardless of sentiment class
print('COMPLETE PORTFOLIO')
print('\n')
complete_portfolio_df = annual_holding_period_returns_df
portfolio_analysis_statistics(complete_portfolio_df)

COMPLETE PORTFOLIO


number of observations: 855


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: 0.187
short holding period return: 0.382
long annualized holding period return: 1722.603
short annualized holding period return: 0.893
long excess return: 1722.497
short excess return: 0.984


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: 0.0
short holding period return: 0.0
long annualized holding period return: 0.0
short annualized holding period return: 0.0
long excess return: -0.106
short excess return: 0.09


--------------------------------------------------------------------------------
10th percentile returns
----------------------------------------------------------------

In [11]:
# statistics for every labeled filing except the row with index label 78,
# which is treated as a major outlier
print('COMPLETE PORTFOLIO (EXCLUDING MAJOR OUTLIER)')
print('\n')
portfolio_without_outlier_df = annual_holding_period_returns_df.drop(index=78)
portfolio_analysis_statistics(portfolio_without_outlier_df)

COMPLETE PORTFOLIO (EXCLUDING MAJOR OUTLIER)


number of observations: 854


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: 0.187
short holding period return: 0.383
long annualized holding period return: 0.883
short annualized holding period return: 0.896
long excess return: 0.778
short excess return: 0.986


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: 0.0
short holding period return: 0.0
long annualized holding period return: 0.0
short annualized holding period return: 0.0
long excess return: -0.106
short excess return: 0.091


--------------------------------------------------------------------------------
10th percentile returns
-------------------------------------------

In [28]:
# statistics for positive-sentiment filings whose F-SCORE is 8 or 9
print('POSITIVE SENTIMENT HIGH F-SCORE PORTFOLIO')
print('\n')
is_positive_sentiment = annual_holding_period_returns_df['Sentiment_Class'] == "Positive"
has_high_f_score = annual_holding_period_returns_df['F_SCORE'].isin([8, 9])
portfolio_analysis_statistics(annual_holding_period_returns_df.loc[is_positive_sentiment & has_high_f_score])

POSITIVE SENTIMENT HIGH F-SCORE PORTFOLIO


number of observations: 20


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: 0.289
short holding period return: -0.208
long annualized holding period return: 0.303
short annualized holding period return: -0.218
long excess return: 0.179
short excess return: -0.109


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: 0.252
short holding period return: -0.202
long annualized holding period return: 0.281
short annualized holding period return: -0.219
long excess return: 0.145
short excess return: -0.089


--------------------------------------------------------------------------------
10th percentile returns
----------------------------------

In [29]:
# statistics for negative-sentiment filings whose F-SCORE is 0 or 1
print('NEGATIVE SENTIMENT LOW F-SCORE PORTFOLIO')
print('\n')
is_negative_sentiment = annual_holding_period_returns_df['Sentiment_Class'] == "Negative"
has_low_f_score = annual_holding_period_returns_df['F_SCORE'].isin([0, 1])
portfolio_analysis_statistics(annual_holding_period_returns_df.loc[is_negative_sentiment & has_low_f_score])

NEGATIVE SENTIMENT LOW F-SCORE PORTFOLIO


number of observations: 10


--------------------------------------------------------------------------------
mean returns
--------------------------------------------------------------------------------
long holding period return: -0.203
short holding period return: 0.603
long annualized holding period return: -0.272
short annualized holding period return: 0.842
long excess return: -0.312
short excess return: 0.878


--------------------------------------------------------------------------------
median returns
--------------------------------------------------------------------------------
long holding period return: -0.078
short holding period return: 0.086
long annualized holding period return: -0.214
short annualized holding period return: 0.287
long excess return: -0.269
short excess return: 0.337


--------------------------------------------------------------------------------
10th percentile returns
-----------------------------------

In [12]:
# analysis of going long on text files with positive sentiment and going short on text files with negative sentiment

# index label of the row treated as a major outlier in the second run
OUTLIER_INDEX = 78


def long_short_average_returns(portfolio_df):
    """Average the returns of a long-positive / short-negative strategy.

    Rows whose 'Sentiment_Class' is 'Negative' contribute their short-side
    returns; every other row contributes its long-side returns.

    Parameters
    ----------
    portfolio_df : pandas.DataFrame
        Must contain 'Sentiment_Class' and the long/short holding period,
        annualized holding period and excess return columns.

    Returns
    -------
    tuple
        (average holding period return, average annualized holding period
        return, average excess return). A nan in any selected return makes
        the corresponding average nan; an empty dataframe yields nan.
    """
    goes_short = portfolio_df['Sentiment_Class'] == 'Negative'

    def average_of(short_column, long_column):
        # take the short-side value where the filing is shorted, else the long side
        realised_returns = portfolio_df[short_column].where(goes_short, portfolio_df[long_column])
        return realised_returns.mean(skipna=False)

    return (average_of('Short_Holding_Period_Return_(%)', 'Long_Holding_Period_Return_(%)'),
            average_of('Short_Annualized_Holding_Period_Return_(%)', 'Long_Annualized_Holding_Period_Return_(%)'),
            average_of('Short_Excess_Returns_(%)', 'Long_Excess_Returns_(%)'))


for exclude_outlier in (False, True):
    # the printed text of both runs is kept exactly as it was, including the
    # slightly different excess-return label of the second run
    if exclude_outlier:
        # drop outlier
        positive_negative_portfolio_df = annual_holding_period_returns_df.drop(OUTLIER_INDEX)
        title = 'LONG POSITIVE SENTIMENT TEXT AND SHORT NEGATIVE SENTIMENT TEXT (EXCLUDING MAJOR OUTLIER)'
        excess_return_label = 'average excess return: '
    else:
        positive_negative_portfolio_df = annual_holding_period_returns_df
        title = 'LONG POSITIVE SENTIMENT TEXT AND SHORT NEGATIVE SENTIMENT TEXT'
        excess_return_label = 'average excess return:'

    # calculate averages of returns
    average_hpr, average_annualized_hpr, average_excess_return = long_short_average_returns(
        positive_negative_portfolio_df)

    print(title)
    print('\n')
    print('---------------------------------------------------------------------------------------')
    print('average holding period return:', round(average_hpr, 3))
    print('average annaulized holding period return:', round(average_annualized_hpr, 3))
    print(excess_return_label, round(average_excess_return, 3))
    if not exclude_outlier:
        # blank lines separate the first report from the second one
        print('\n')

LONG POSITIVE SENTIMENT TEXT AND SHORT NEGATIVE SENTIMENT TEXT


---------------------------------------------------------------------------------------
average holding period return: 0.848
average annaulized holding period return: 1723.814
average excess return: 1723.781


LONG POSITIVE SENTIMENT TEXT AND SHORT NEGATIVE SENTIMENT TEXT (EXCLUDING MAJOR OUTLIER)


---------------------------------------------------------------------------------------
average holding period return: 0.848
average annaulized holding period return: 2.095
average excess return:  2.063


## Save annual_holding_period_returns_df to CSV File

In [13]:
# step back out of the filing text folder into the parent directory
os.chdir('..')

# write the labeled returns table to a csv file without the index column
output_csv_name = 'annual_holding_period_returns_df.csv'
annual_holding_period_returns_df.to_csv(output_csv_name, index=False)