In [418]:
# Initial imports
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import nltk as nltk
from wordcloud import WordCloud
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient
load_dotenv()
import alpaca_trade_api as tradeapi
from datetime import datetime, timedelta
import math

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bfode\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [349]:
# Read your api key environment variable
# YOUR CODE HERE!
api_key = os.getenv("news_api")

In [350]:
# Create a newsapi client
# YOUR CODE HERE!
newsapi = NewsApiClient(api_key=api_key)

In [351]:
# Load .env enviroment variables
load_dotenv()

# Set News API Key
newsapi = NewsApiClient(api_key=os.environ["news_api"])

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

In [381]:
# Get last 30days' worth of historical data for GS 
#Wells Fargo Co. (WFC), Goldman Sachs Group Inc. (GS), and Morgan Stanley (MS).

# Set the ticker
ticker = "GS"

# Set timeframe to '1D'
timeframe = "1D"

# Set current date and the date from one month ago using the ISO format
current_date = pd.Timestamp(datetime.now(), tz="America/New_York").isoformat()
past_date = pd.Timestamp(datetime.now()- timedelta(30), tz="America/New_York").isoformat()

# Get 4 weeks worth of historical data for AAPL
df = api.get_barset(
    ticker,
    timeframe,
    limit=None,
    start=past_date,
    end=current_date,
    after=None,
    until=None,
).df

# Display data
df.tail()

Unnamed: 0_level_0,GS,GS,GS,GS,GS
Unnamed: 0_level_1,open,high,low,close,volume
2021-06-28 00:00:00-04:00,368.03,368.87,363.86,368.595,1491640
2021-06-29 00:00:00-04:00,374.75,378.09,370.7495,372.62,2151889
2021-06-30 00:00:00-04:00,370.9,380.11,370.9,379.445,1823472
2021-07-01 00:00:00-04:00,380.55,381.64,374.145,374.96,2290040
2021-07-02 00:00:00-04:00,376.31,376.6,372.42,374.16,1385037


In [453]:
df.to_csv('../data/gs_stock.csv')

In [382]:
# Drop Outer Table Level
df = df.droplevel(axis=1, level=0)

# Use the drop function to drop extra columns
#df = df.drop(columns=["open", "high", "low", "volume"])

# Since this is daily data, we can keep only the date (remove the time) component of the data
df.index = df.index.date

# Display sample data
df.head()

Unnamed: 0,open,high,low,close,volume
2021-06-07,392.89,393.2603,387.55,388.09,1815955
2021-06-08,385.79,386.4799,382.22,384.8,1946972
2021-06-09,383.44,384.27,378.88,382.78,1817307
2021-06-10,389.08,389.64,372.345,373.75,3302197
2021-06-11,375.47,378.75,375.11,378.23,1692723


In [384]:
df.columns = ['High','Low','Open','Close','Volume']

In [385]:

df['stock_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

In [388]:
scaler = StandardScaler()
df['stock_change_scaled'] = scaler.fit_transform(df[['stock_change']])
df.head()

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled
2021-06-07,392.89,393.2603,387.55,388.09,1815955,0.139337,-1.456861
2021-06-08,385.79,386.4799,382.22,384.8,1946972,0.675004,-0.544761
2021-06-09,383.44,384.27,378.88,382.78,1817307,1.02935,0.058596
2021-06-10,389.08,389.64,372.345,373.75,3302197,0.377338,-1.051608
2021-06-11,375.47,378.75,375.11,378.23,1692723,0.831756,-0.277854


In [449]:
df_returns = df.pct_change().dropna()
df_returns.head()

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled
2021-06-08,-0.018071,-0.017242,-0.013753,-0.008477,0.072148,3.844403,-0.626072
2021-06-09,-0.006091,-0.005718,-0.008738,-0.005249,-0.066598,0.524954,-1.107563
2021-06-10,0.014709,0.013975,-0.017248,-0.023591,0.817083,-0.633421,-18.946674
2021-06-11,-0.03498,-0.027949,0.007426,0.011987,-0.487395,1.204272,-0.735782
2021-06-14,0.00522,0.0,-0.011757,-0.013642,0.1955,-0.231348,1.179215


In [356]:
# Use newsapi client to get most relevant 20 headlines per day in the past month
def get_headlines(keyword):
    all_headlines = []
    all_dates = []    
    date = datetime.strptime(current_date[:10], "%Y-%m-%d")
    end_date = datetime.strptime(past_date[:10], "%Y-%m-%d")
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)
    while date > end_date:
        print(f"retrieving news from: {date}")
        articles = newsapi.get_everything(
            q=keyword,
            from_param=str(date)[:10],
            to=str(date)[:10],
            language="en",
            sort_by="relevancy",
            page=1,
        )
        headlines = []
        for i in range(0, len(articles["articles"])):
            headlines.append(articles["articles"][i]["title"])
        all_headlines.append(headlines)
        all_dates.append(date)
        date = date - timedelta(days=1)
    return all_headlines, all_dates

In [357]:
goldman_headlines, dates = get_headlines("Goldman Sachs Group Inc.")

Fetching news about 'Goldman Sachs Group Inc.'
******************************
retrieving news from: 2021-07-05 00:00:00
retrieving news from: 2021-07-04 00:00:00
retrieving news from: 2021-07-03 00:00:00
retrieving news from: 2021-07-02 00:00:00
retrieving news from: 2021-07-01 00:00:00
retrieving news from: 2021-06-30 00:00:00
retrieving news from: 2021-06-29 00:00:00
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00
retrieving news from: 2021-06-24 00:00:00
retrieving news from: 2021-06-23 00:00:00
retrieving news from: 2021-06-22 00:00:00
retrieving news from: 2021-06-21 00:00:00
retrieving news from: 2021-06-20 00:00:00
retrieving news from: 2021-06-19 00:00:00
retrieving news from: 2021-06-18 00:00:00
retrieving news from: 2021-06-17 00:00:00
retrieving news from: 2021-06-16 00:00:00
retrieving news from: 2021-06-15 00:00:00
retrieving news from: 2021-06-14 00:00:0

In [358]:
# Instantiate SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [359]:
# Create function that computes average compound sentiment of headlines for each day
def headline_sentiment_summarizer_avg(headlines):
    sentiment = []
    for day in headlines:
        day_score = []
        for h in day:
            if h == None:
                continue
            else:
                day_score.append(sid.polarity_scores(h)["compound"])
        sentiment.append(sum(day_score) / len(day_score))
    return sentiment

In [360]:
# Get averages of each topics sentiment
goldman_avg = headline_sentiment_summarizer_avg(goldman_headlines)


In [361]:
# Combine Sentiment Averages into DataFrame
topic_sentiments = pd.DataFrame(
    {
        "goldman_avg": goldman_avg
    }
)

In [362]:
# Set the index value of the sentiment averages DataFrame to be the series of dates.
topic_sentiments.index = pd.to_datetime(dates)


In [392]:
# Merge with goldman returns
topic_sentiments = df.join(topic_sentiments).dropna(how="any")

# Display data
display(topic_sentiments)

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled,close,goldman_avg
2021-06-08,385.79,386.4799,382.22,384.8,1946972,0.675004,-0.544761,-0.008477,0.130225
2021-06-09,383.44,384.27,378.88,382.78,1817307,1.02935,0.058596,-0.005249,0.03605
2021-06-10,389.08,389.64,372.345,373.75,3302197,0.377338,-1.051608,-0.023591,0.135205
2021-06-11,375.47,378.75,375.11,378.23,1692723,0.831756,-0.277854,0.011987,0.08183
2021-06-14,377.43,378.75,370.7,373.07,2023651,0.639331,-0.605503,-0.013642,0.014035
2021-06-15,373.5,374.84,367.16,371.6,2028770,1.209282,0.364974,-0.00394,0.15956
2021-06-16,370.99,374.0788,365.25,371.07,2673244,1.593429,1.019075,-0.001426,0.084235
2021-06-17,373.52,373.52,356.55,361.5,3677802,1.388305,0.669802,-0.02579,0.10216
2021-06-18,356.72,358.38,348.125,348.79,4364156,0.191023,-1.368853,-0.035159,0.16972
2021-06-21,352.59,357.97,351.04,357.68,2295332,1.891522,1.526649,0.025488,0.34239


In [365]:
## Build and Train the LSTM RNN

In [393]:
# This function accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y
def window_data(df, window, feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        features = df.iloc[i:(i + window), feature_col_number]
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [394]:
# Predict Closing Prices using a 10 day window of previous fng values
# Then, experiment with window sizes anywhere from 1 to 10 and see how the model performance changes
window_size = 10

# Column index 1 is the 'Goldman Average' column
# Column index 0 is the `Close` column
feature_column = 1
target_column = 0
X, y = window_data(topic_sentiments, window_size, feature_column, target_column)

In [444]:
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [445]:
scaler = StandardScaler()
X_train_std= scaler.fit_transform(X_train)

In [446]:

X_train_std

array([[ 0.67553441,  0.58533156,  1.87518123,  1.20413613,  0.97515972,
         0.9037415 ,  1.25759978,  1.92316125, -0.74458349, -1.17544653],
       [ 0.16109449,  1.62613634, -0.08303258,  1.20413613,  0.41741533,
         0.80660726,  1.18214142, -0.70612007, -0.83086931, -0.83015127],
       [ 1.41117022, -0.48454599, -0.08303258, -0.50034004,  0.30883348,
         0.73530061, -0.86231031, -0.77732253, -0.42890366, -0.36011584],
       [-1.12389956, -0.48454599, -0.78611945, -0.83216799,  0.22912311,
        -1.1966653 , -0.91767525, -0.44562324,  0.11827471,  1.15846017],
       [-1.12389956, -1.24237592, -0.92299662, -1.07576422, -1.93053165,
        -1.24898406, -0.65975565,  0.00590459,  1.88608175,  1.20725347]])

In [447]:
X_test


array([[374.84  , 374.0788, 373.52  , 358.38  , 357.97  , 359.88  ,
        362.48  , 370.88  , 371.1499, 368.87  ],
       [374.0788, 373.52  , 358.38  , 357.97  , 359.88  , 362.48  ,
        370.88  , 371.1499, 368.87  , 378.09  ],
       [373.52  , 358.38  , 357.97  , 359.88  , 362.48  , 370.88  ,
        371.1499, 368.87  , 378.09  , 380.11  ]])

In [448]:

from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=5, bootstrap=0, n_estimators=1000)

In [439]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=5, bootstrap=0, n_estimators=1000)

model=randomforest.fit(X_train_std, y_train)
y_test_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

  model=randomforest.fit(X_train_std, y_train)


ValueError: Unknown label type: 'continuous'

In [None]:
forecast_col = 'stock__change'
forecast_out = int(math.ceil(0.013 * len(topic_sentiments)))
topic_sentiments['stock_change_pred'] = topic_sentiments[forecast_col].shift(-forecast_out)


In [None]:
topic_sentiments['buy_sell'] = topic_sentiments['stock_change_pred'].apply(lambda x: 1 if x >=0 else -1)