In [3]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
from wordcloud import WordCloud
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient
load_dotenv()
import alpaca_trade_api as tradeapi

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bfode\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Read your api key environment variable
# YOUR CODE HERE!
api_key = os.getenv("news_api")

In [5]:
# Create a newsapi client
# YOUR CODE HERE!
newsapi = NewsApiClient(api_key=api_key)

In [6]:
# Fetch the Bitcoin news articles
# YOUR CODE HERE!
WallStreet_Bets = newsapi.get_everything(q="Goldman Sachs Group Inc and tweets")
WallStreet_Bets["totalResults"]


# Print total articles
print(f"Total WallStreet Bets news articles: {WallStreet_Bets['totalResults']}")

WallStreet_Bets["articles"][4]

Total WallStreet Bets news articles: 15


{'source': {'id': None, 'name': 'Yahoo Entertainment'},
 'author': 'Justina Lee',
 'title': 'Ex-Wall Street Quants Net 78% Return in Crypto Options Boom',
 'description': '(Bloomberg) -- Forget Elon Musk tweets, regulatory missives and Bitcoin’s energy consumption.To hedge fund manager Shiliang Tang, the biggest crypto story...',
 'url': 'https://finance.yahoo.com/news/130-million-crypto-quant-nets-115954100.html',
 'urlToImage': 'https://s.yimg.com/uu/api/res/1.2/AsahwH_sro1eeyYI4MDnmw--~B/aD03NjQ7dz0xMjk2O2FwcGlkPXl0YWNoeW9u/https://media.zenfs.com/en/bloomberg_markets_842/a9a1d743fbe30d42acce56a9582f74e3',
 'publishedAt': '2021-06-21T21:03:20Z',
 'content': '(Bloomberg) --\r\nForget Elon Musk tweets, regulatory missives and Bitcoins energy consumption.\r\nTo hedge fund manager Shiliang Tang, the biggest crypto story this year is taking place in the shadows: … [+4663 chars]'}

In [7]:
# Create the  sentiment scores DataFrame
# YOUR CODE HERE!
WallStreet_Bets_sentiments = []

for article in WallStreet_Bets["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        WallStreet_Bets_sentiments.append({
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu,
            "text": text,
            
        })
        
    except AttributeError:
        pass
    

WallStreet_Bets_df = pd.DataFrame(WallStreet_Bets_sentiments)
WallStreet_Bets_df.head()

Unnamed: 0,compound,positive,negative,neutral,text
0,0.4588,0.109,0.057,0.833,People were already worried about equities six...
1,0.0,0.0,0.0,1.0,"HONG KONG (Reuters) -Some global banks, funds ..."
2,-0.3818,0.044,0.086,0.871,Gold will surge to fresh highs in the next yea...
3,0.1027,0.066,0.057,0.877,AMC\r\nThe \r\nfamous George Soros quote is Wh...
4,0.0516,0.062,0.056,0.882,"(Bloomberg) --\r\nForget Elon Musk tweets, reg..."


In [8]:
# Describe the Bitcoin Sentiment
# YOUR CODE HERE!
WallStreet_Bets_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,15.0,15.0,15.0,15.0
mean,-0.058373,0.054533,0.063067,0.8824
std,0.29497,0.030303,0.022645,0.040203
min,-0.3818,0.0,0.0,0.833
25%,-0.3818,0.044,0.056,0.869
50%,0.0516,0.06,0.057,0.877
75%,0.07715,0.064,0.086,0.882
max,0.4588,0.109,0.088,1.0


In [9]:
### Natural Language Processing

In [10]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [11]:
# Instantiate the lemmatizer
# YOUR CODE HERE!
lemmatizer = WordNetLemmatizer()

# Create a list of stopwords
# YOUR CODE HERE!
stopWords = set(stopwords.words('english'))
print(len(stopWords))
print(stopWords)

# Expand the default stopwords list if necessary
# YOUR CODE HERE!

179
{'then', 'where', 'yourself', 'as', 'into', 'mightn', 'shan', 'once', 'he', 'have', 'that', 'off', 's', "shouldn't", 'doing', 'hers', 'needn', "needn't", 'which', 'them', 'what', 'now', 'd', 'hadn', "she's", 'through', 'how', "shan't", 'she', 'with', 'itself', 'should', 'having', 'an', 'll', 'couldn', 'theirs', 'hasn', 'when', 'nor', "don't", 'so', "you've", 'own', 'ourselves', "didn't", 'if', 'we', 'in', 'yourselves', 'be', 'being', 't', 'up', 'over', 'there', 're', "aren't", 'on', "wasn't", 'aren', 'down', 've', 'than', 'between', 'me', 'm', 'doesn', 'these', 'were', 'further', 'am', 'during', 'did', 'yours', 'of', 'more', 'your', 'very', 'myself', "that'll", 'both', 'him', 'this', "it's", 'because', 'his', 'o', 'each', 'who', 'themselves', 'does', 'some', 'i', 'why', 'until', 'ain', 'won', 'the', 'can', 'herself', 'all', 'been', "you'll", "haven't", 'above', 'again', 'wouldn', 'such', 'my', "isn't", 'under', 'and', 'its', 'most', "doesn't", 'no', 'don', 'their', 'any', 'is', 'on

In [12]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    
    # Remove the punctuation from text
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
   
    # Create a tokenized list of the words
    words = word_tokenize(re_clean)
    
    # Lemmatize words into root words
    
    lem = [lemmatizer.lemmatize(word) for word in words]
   
    # Convert the words to lowercase
    # Remove the stop words
    output = [word.lower() for word in lem if word.lower() not in stopWords]
    
    return output
    

In [13]:
# Create a new tokens column for WallStreet_Bets
# YOUR CODE HERE!
WallStreet_Bets_df['tokens'] = WallStreet_Bets_df.text.apply(tokenizer)
WallStreet_Bets_df.head()

Unnamed: 0,compound,positive,negative,neutral,text,tokens
0,0.4588,0.109,0.057,0.833,People were already worried about equities six...,"[people, already, worried, equity, six, month,..."
1,0.0,0.0,0.0,1.0,"HONG KONG (Reuters) -Some global banks, funds ...","[hong, kong, reuters, global, bank, fund, fina..."
2,-0.3818,0.044,0.086,0.871,Gold will surge to fresh highs in the next yea...,"[gold, surge, fresh, high, next, year, investo..."
3,0.1027,0.066,0.057,0.877,AMC\r\nThe \r\nfamous George Soros quote is Wh...,"[amcthe, famous, george, soros, quote, see, bu..."
4,0.0516,0.062,0.056,0.882,"(Bloomberg) --\r\nForget Elon Musk tweets, reg...","[bloomberg, forget, elon, musk, tweet, regulat..."


In [14]:
# Set Alpaca API key and secret
# YOUR CODE HERE!
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Create the Alpaca API object
# YOUR CODE HERE!
alpaca = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2")

In [15]:
# Format current date as ISO format
# YOUR CODE HERE!

start_date = pd.Timestamp('2015-08-07', tz='America/New_York').isoformat()
end_date = pd.Timestamp('2020-08-07', tz='America/New_York').isoformat()
#today = pd.Timestamp("2021-04-12", tz="America/New_York").isoformat()

# Set the tickers
tickers = ["WFC", "GS","MS"]

# Set timeframe to '1D' for Alpaca API
timeframe = "1D"

# Get current closing prices for SPY and AGG
# YOUR CODE HERE!
df_portfolio = alpaca.get_barset(
    tickers,
    timeframe,
    start = start_date,
    end = end_date
).df

# Preview DataFrame
# YOUR CODE HERE!
df_portfolio

Unnamed: 0_level_0,GS,GS,GS,GS,GS,MS,MS,MS,MS,MS,WFC,WFC,WFC,WFC,WFC
Unnamed: 0_level_1,open,high,low,close,volume,open,high,low,close,volume,open,high,low,close,volume
2020-03-18 00:00:00-04:00,150.00,155.9700,135.410,140.18,7229403,31.34,32.8400,27.20,30.75,21733841,27.70,28.89,26.9000,28.14,39482878
2020-03-19 00:00:00-04:00,137.38,152.4900,130.850,149.48,6033664,30.35,32.9000,28.74,30.93,20873167,27.66,29.11,26.2875,28.29,49865465
2020-03-20 00:00:00-04:00,148.99,149.1900,138.290,138.41,7490015,31.25,31.4500,29.19,29.66,25426071,28.68,28.75,26.1800,26.47,57765815
2020-03-23 00:00:00-04:00,136.03,141.9350,133.260,134.97,4574057,29.15,29.7899,27.76,27.82,23517468,26.26,26.87,25.1050,25.23,40784032
2020-03-24 00:00:00-04:00,144.32,153.9000,143.630,153.39,5692174,30.09,33.3750,29.40,33.21,23506795,26.96,29.17,26.3300,28.91,41672870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-03 00:00:00-04:00,198.49,199.8800,196.530,199.39,4222375,49.22,49.9050,48.72,49.47,11369869,24.25,24.50,23.9000,24.30,52779706
2020-08-04 00:00:00-04:00,199.35,201.7200,198.475,201.65,3688146,49.50,49.7700,49.08,49.26,11431220,24.20,24.29,24.0300,24.22,53890753
2020-08-05 00:00:00-04:00,203.17,204.8075,203.010,204.48,4088479,49.59,49.9600,49.44,49.78,11894233,24.40,24.75,24.3400,24.39,60841700
2020-08-06 00:00:00-04:00,205.00,206.2900,203.830,204.25,3316362,49.83,49.9800,49.24,49.46,10118777,24.20,24.50,24.1200,24.25,75782256


In [16]:
# Format current date as ISO format
# YOUR CODE HERE!

start_date = pd.Timestamp('2015-08-07', tz='America/New_York').isoformat()
end_date = pd.Timestamp('2020-08-07', tz='America/New_York').isoformat()

In [45]:
# Get 5 years' worth of historical data for SPY and AGG
#Wells Fargo Co. (WFC), Goldman Sachs Group Inc. (GS), and Morgan Stanley (MS).

tickers = ["WFC", "GS","MS"]

df_stock_data = alpaca.get_barset(
    tickers,
    timeframe,
    start=start_date,
    end=end_date,
    limit=1000,
).df

# Display sample data
df_stock_data.head()

Unnamed: 0_level_0,GS,GS,GS,GS,GS,MS,MS,MS,MS,MS,WFC,WFC,WFC,WFC,WFC
Unnamed: 0_level_1,open,high,low,close,volume,open,high,low,close,volume,open,high,low,close,volume
2016-08-18 00:00:00-04:00,165.34,166.56,164.63,166.0401,1343523,30.27,30.74,30.195,30.54,11530304,48.5,48.66,48.33,48.52,9905086
2016-08-19 00:00:00-04:00,165.36,166.9,164.5,166.18,1370721,30.4,30.715,30.26,30.55,8158949,48.44,48.73,48.2,48.64,11208944
2016-08-22 00:00:00-04:00,165.87,166.91,164.89,166.26,1074287,30.44,30.7,30.39,30.6,5601966,48.64,48.74,48.5,48.65,7631001
2016-08-23 00:00:00-04:00,167.0,167.85,165.98,166.01,1578192,30.75,30.97,30.67,30.72,7331515,48.64,48.75,48.39,48.41,13706691
2016-08-24 00:00:00-04:00,165.78,166.7,164.85,165.34,1575529,30.81,31.16,30.72,30.92,11429566,48.5,48.77,48.35,48.64,14356901


In [47]:
df_stock_data

Unnamed: 0_level_0,GS,GS,GS,GS,GS,MS,MS,MS,MS,MS,WFC,WFC,WFC,WFC,WFC
Unnamed: 0_level_1,open,high,low,close,volume,open,high,low,close,volume,open,high,low,close,volume
2016-08-18 00:00:00-04:00,165.34,166.5600,164.630,166.0401,1343523,30.27,30.740,30.195,30.54,11530304,48.50,48.66,48.33,48.52,9905086
2016-08-19 00:00:00-04:00,165.36,166.9000,164.500,166.1800,1370721,30.40,30.715,30.260,30.55,8158949,48.44,48.73,48.20,48.64,11208944
2016-08-22 00:00:00-04:00,165.87,166.9100,164.890,166.2600,1074287,30.44,30.700,30.390,30.60,5601966,48.64,48.74,48.50,48.65,7631001
2016-08-23 00:00:00-04:00,167.00,167.8500,165.980,166.0100,1578192,30.75,30.970,30.670,30.72,7331515,48.64,48.75,48.39,48.41,13706691
2016-08-24 00:00:00-04:00,165.78,166.7000,164.850,165.3400,1575529,30.81,31.160,30.720,30.92,11429566,48.50,48.77,48.35,48.64,14356901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-03 00:00:00-04:00,198.49,199.8800,196.530,199.3900,3200674,49.22,49.905,48.720,49.47,8618979,24.25,24.50,23.90,24.30,42432609
2020-08-04 00:00:00-04:00,199.35,201.7200,198.475,201.6500,3688146,49.50,49.770,49.080,49.26,11431220,24.20,24.29,24.03,24.22,53890753
2020-08-05 00:00:00-04:00,203.17,204.8075,203.010,204.4800,4081429,49.59,49.960,49.440,49.78,11876099,24.40,24.75,24.34,24.39,60804684
2020-08-06 00:00:00-04:00,205.00,206.2900,203.830,204.2500,3310894,49.83,49.980,49.240,49.46,10097493,24.20,24.50,24.12,24.25,75647696


In [18]:
closing_price_gs = pd.DataFrame(df_stock_data["GS"]["close"])
closing_price_ms = pd.DataFrame(df_stock_data["MS"]["close"])
closing_price_wfc = pd.DataFrame(df_stock_data["WFC"]["close"])
all_closing_price = pd.concat([closing_price_gs, closing_price_ms, closing_price_wfc], axis="columns", join="inner")
all_closing_price.index = all_closing_price.index.date
all_closing_price.columns = ["GS","MS","WFC"]
all_closing_price

Unnamed: 0,GS,MS,WFC
2016-08-18,166.0401,30.54,48.52
2016-08-19,166.1800,30.55,48.64
2016-08-22,166.2600,30.60,48.65
2016-08-23,166.0100,30.72,48.41
2016-08-24,165.3400,30.92,48.64
...,...,...,...
2020-08-03,199.3900,49.47,24.30
2020-08-04,201.6500,49.26,24.22
2020-08-05,204.4800,49.78,24.39
2020-08-06,204.2500,49.46,24.25


In [20]:
all_closing_price_returns = all_closing_price.pct_change().dropna()
all_closing_price_returns.head(10)

Unnamed: 0,GS,MS,WFC
2016-08-19,0.000843,0.000327,0.002473
2016-08-22,0.000481,0.001637,0.000206
2016-08-23,-0.001504,0.003922,-0.004933
2016-08-24,-0.004036,0.00651,0.004751
2016-08-25,0.00381,0.001617,-0.004934
2016-08-26,0.000241,0.006458,0.002479
2016-08-29,0.005301,0.0077,0.021641
2016-08-30,0.015399,0.024709,0.021384
2016-08-31,-0.000295,-0.004229,0.00316
2016-09-01,-0.005549,-0.004368,-0.007482


In [29]:
all_closing_price = all_closing_price.pct_change().dropna()
all_closing_price

Unnamed: 0,GS,MS,WFC
2016-08-23,8.619854,-0.650838,26.261231
2016-08-24,-1.408400,-0.527135,-0.921461
2016-08-25,-2.154434,-2.138544,0.038439
2016-08-26,-0.518160,-4.982802,-0.262964
2016-08-29,-23.412301,-0.935762,-6.143738
...,...,...,...
2020-08-03,3.100804,12.022185,0.758035
2020-08-04,-1.325724,-0.892338,1.720202
2020-08-05,-0.612444,1.567893,0.045153
2020-08-06,-5.535039,-0.538551,-0.419611


In [50]:
closing_price_gs = pd.DataFrame(df_stock_data["GS"])
closing_price_gs.columns

Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')

In [32]:
closing_price_gs = pd.DataFrame(df_stock_data["GS"]["close"])
closing_price_gs.columns=["GS"]
closing_price_gs

Unnamed: 0,GS
2016-08-18 00:00:00-04:00,166.0401
2016-08-19 00:00:00-04:00,166.1800
2016-08-22 00:00:00-04:00,166.2600
2016-08-23 00:00:00-04:00,166.0100
2016-08-24 00:00:00-04:00,165.3400
...,...
2020-08-03 00:00:00-04:00,199.3900
2020-08-04 00:00:00-04:00,201.6500
2020-08-05 00:00:00-04:00,204.4800
2020-08-06 00:00:00-04:00,204.2500


In [40]:
closing_price_gs["Returns"] = closing_price_gs.pct_change()* 100
closing_price_gs['Lagged_Return'] = closing_price_gs['Returns'].shift()
closing_price_gs = closing_price_gs.dropna()
closing_price_gs.tail()

Unnamed: 0,GS,Returns,Lagged_Return
2020-08-03 00:00:00-04:00,199.39,0.70202,-0.791662
2020-08-04 00:00:00-04:00,201.65,1.133457,0.70202
2020-08-05 00:00:00-04:00,204.48,1.403422,1.133457
2020-08-06 00:00:00-04:00,204.25,-0.11248,1.403422
2020-08-07 00:00:00-04:00,208.27,1.968176,-0.11248


In [334]:
# Create a series using "Close" price percentage returns, drop any NaNs, and check the results:
# (Make sure to multiply the pct_change() results by *100)
df_stock_data['Return'] = df_stock_data.Close.pct_change() * 100
df_stock_data['Lagged_Return'] = df_stock_data['Return'].shift()
df_stock_data = df_stock_data.dropna()
df_stock_data.tail()

AttributeError: 'DataFrame' object has no attribute 'Close'