In [43]:
import pandas as pd
from pathlib import Path
import yahoo_fin.stock_info as si
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Read in News Dataset

In [44]:
data_path = Path('/Users/hasnainraza/FinTech/data_for_project2/us_equities_news_dataset.csv')

df = pd.read_csv(data_path)
df.set_index(df.release_date, inplace=True)

In [45]:
df.drop(columns = ['release_date'], axis=1, inplace=True)

In [46]:
df.sort_index(inplace=True)
df.head()

Unnamed: 0_level_0,id,ticker,title,category,content,provider,url,article_id
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-10-02,227750,KYOCY,Nikkei down 1 4 pct on economy fears autos drop,news,Nikkei down 1 4 pct hits lowest point in ove...,Reuters,https://www.investing.com/news/forex-news/nikk...,669
2008-10-03,327113,TGT,FOREX Dollar poised for biggest weekly gain in...,news,Money market squeeze ECB shift fuel dollar ...,Reuters,https://www.investing.com/news/forex-news/fore...,671
2008-10-03,435744,C,GLOBAL MARKETS Stocks rally on Wells Wachovia ...,news,U S stocks rally dollar gains on European w...,Reuters,https://www.investing.com/news/forex-news/glob...,682
2008-10-03,441802,WFC,GLOBAL MARKETS Stocks hold up after Wells Wach...,news,Europe stocks U S stock futures up on Well...,Reuters,https://www.investing.com/news/forex-news/glob...,673
2008-10-03,441803,WFC,GLOBAL MARKETS U S bailout plan passes inves...,news,U S House of Representatives approves bailou...,Reuters,https://www.investing.com/news/forex-news/glob...,686


In [47]:
# Create Copy of dataframe to manipulate
df2 = df.copy()
df2 = df2.loc['2016-01-01':]

In [48]:
df_ticker_list = df2['ticker'].unique()
sp500_tickers = si.tickers_sp500()

In [49]:
# Gets list of S&P 500 tickers found in news dataset
df_sp500_list = [x for x in sp500_tickers if x in df_ticker_list]

In [51]:
# Gets list of S&P 500 tickers NOT found in news dataset
missing_tickers = [x for x in sp500_tickers if x not in df_sp500_list]

In [53]:
# Add Tesla to list of tickers
df_sp500_list.append('TSLA')

In [55]:
# Set Dataframe such that only rows containing S&P 500 tickers and Tesla are found
df2 = df2.loc[df2['ticker'].isin(df_sp500_list)]

In [65]:
# Create a new dataframe with just the headlines and ticker
news_df = df2[['ticker', 'title']]

In [66]:
def vader_sentiment(df, column): 
    # Create vader Sentiment analyzer
    vader = SentimentIntensityAnalyzer()
    # Lambda function to run vader sentiment on title column
    f = lambda title: vader.polarity_scores(title)['compound']
    # Apply sentiment to title column and put values in a new column named title_sentiment
    df['compound_sentiment'] = df[column].apply(f)
    return df

In [67]:
news_df = vader_sentiment(news_df, 'title')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['compound_sentiment'] = df[column].apply(f)


In [68]:
# Sort dataframe rows by ticker and release date
news_df.sort_values(by=['ticker', 'release_date'], ascending=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df.sort_values(by=['ticker', 'release_date'], ascending=True, inplace=True)


In [69]:
news_df.head()

Unnamed: 0_level_0,ticker,title,compound_sentiment
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-02,A,Can Fiserv FISV Pull Off An Earnings Surpris...,0.2732
2016-05-03,A,US Cellular USM Q1 Earnings Can The Stock T...,0.2732
2016-05-03,A,What Awaits Telephone Data Systems TDS Q1 ...,0.0772
2016-05-03,A,Liberty Interactive QVCA Poised To Beat On Q...,0.6597
2016-05-04,A,AppFolio APPF Q1 Earnings What To Expect Th...,0.0


In [70]:
news_df.describe()

Unnamed: 0,compound_sentiment
count,144657.0
mean,0.066251
std,0.310186
min,-0.9422
25%,0.0
50%,0.0
75%,0.2732
max,0.9337


In [71]:
# Create csv out of news_df table
filepath = Path('article_data/news_sentiment.csv')
news_df.to_csv(filepath)

In [75]:
news_df.drop(columns = ['title'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df.drop(columns = ['title'], axis=1, inplace=True)


In [80]:
news_df = news_df.groupby(['release_date', 'ticker'])[['compound_sentiment']].mean()

In [81]:
filepath2 = Path('article_data/mean_sentiment.csv')
news_df.to_csv(filepath2)

In [108]:
main_path = Path().resolve().parents[0]
data_folder_path = main_path / 'data'
mean_sentiment_csv_path = data_folder_path / 'mean_news_sentiment.csv'

In [109]:
news_df.to_csv(mean_sentiment_csv_path)