In [15]:
# Initial imports
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import nltk as nltk
from wordcloud import WordCloud
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient
load_dotenv()
import alpaca_trade_api as tradeapi
from datetime import datetime, timedelta
import math
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import StandardScaler

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bfode\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
# Read your api key environment variable
# YOUR CODE HERE!
api_key = os.getenv("news_api")

In [17]:
# Create a newsapi client
# YOUR CODE HERE!
newsapi = NewsApiClient(api_key=api_key)

In [18]:
# Load .env enviroment variables
load_dotenv()

# Set News API Key
newsapi = NewsApiClient(api_key=os.environ["news_api"])

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

In [19]:
# Get last 30days' worth of historical data for GS 
#Wells Fargo Co. (WFC), Goldman Sachs Group Inc. (GS), and Morgan Stanley (MS).

# Set the ticker
ticker = "GS"

# Set timeframe to '1D'
timeframe = "1D"

# Set current date and the date from one month ago using the ISO format
current_date = pd.Timestamp(datetime.now(), tz="America/New_York").isoformat()
past_date = pd.Timestamp(datetime.now()- timedelta(30), tz="America/New_York").isoformat()

# Get 4 weeks worth of historical data for AAPL
df = api.get_barset(
    ticker,
    timeframe,
    limit=None,
    start=past_date,
    end=current_date,
    after=None,
    until=None,
).df

# Display data
df.tail()

Unnamed: 0_level_0,GS,GS,GS,GS,GS
Unnamed: 0_level_1,open,high,low,close,volume
2021-06-29 00:00:00-04:00,374.75,378.09,370.7495,372.62,2151889
2021-06-30 00:00:00-04:00,370.9,380.11,370.9,379.445,1823472
2021-07-01 00:00:00-04:00,380.55,381.64,374.145,374.96,2290040
2021-07-02 00:00:00-04:00,376.31,376.6,372.42,374.16,1385037
2021-07-06 00:00:00-04:00,373.23,373.6313,365.405,369.86,1858520


In [86]:
#Saving Raw data to Raw Data Folder
df.to_csv('../../data/Rawdata/gs_stock_1day.csv')

In [21]:
# Drop Outer Table Level
df = df.droplevel(axis=1, level=0)

# Use the drop function to drop extra columns
#df = df.drop(columns=["open", "high", "low", "volume"])

# Since this is daily data, we can keep only the date (remove the time) component of the data
df.index = df.index.date

# Display sample data
df.head()

Unnamed: 0,open,high,low,close,volume
2021-06-07,392.89,393.2603,387.55,388.09,1815955
2021-06-08,385.79,386.4799,382.22,384.8,1946972
2021-06-09,383.44,384.27,378.88,382.78,1817307
2021-06-10,389.08,389.64,372.345,373.75,3302197
2021-06-11,375.47,378.75,375.11,378.23,1692723


In [22]:
df.columns = ['High','Low','Open','Close','Volume']

In [23]:

df['stock_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

In [24]:
scaler = StandardScaler()
df['stock_change_scaled'] = scaler.fit_transform(df[['stock_change']])
df.head()

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled
2021-06-07,392.89,393.2603,387.55,388.09,1815955,0.139337,-1.506251
2021-06-08,385.79,386.4799,382.22,384.8,1946972,0.675004,-0.574854
2021-06-09,383.44,384.27,378.88,382.78,1817307,1.02935,0.041268
2021-06-10,389.08,389.64,372.345,373.75,3302197,0.377338,-1.092424
2021-06-11,375.47,378.75,375.11,378.23,1692723,0.831756,-0.3023


In [25]:
df_returns = df.pct_change().dropna()
df_returns.head()

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled
2021-06-08,-0.018071,-0.017242,-0.013753,-0.008477,0.072148,3.844403,-0.618354
2021-06-09,-0.006091,-0.005718,-0.008738,-0.005249,-0.066598,0.524954,-1.071788
2021-06-10,0.014709,0.013975,-0.017248,-0.023591,0.817083,-0.633421,-27.471619
2021-06-11,-0.03498,-0.027949,0.007426,0.011987,-0.487395,1.204272,-0.723276
2021-06-14,0.00522,0.0,-0.011757,-0.013642,0.1955,-0.231348,1.106784


In [26]:
# Use newsapi client to get most relevant 20 headlines per day in the past month
def get_headlines(keyword):
    all_headlines = []
    all_dates = []    
    date = datetime.strptime(current_date[:10], "%Y-%m-%d")
    end_date = datetime.strptime(past_date[:10], "%Y-%m-%d")
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)
    while date > end_date:
        print(f"retrieving news from: {date}")
        articles = newsapi.get_everything(
            q=keyword,
            from_param=str(date)[:10],
            to=str(date)[:10],
            language="en",
            sort_by="relevancy",
            page=1,
        )
        headlines = []
        for i in range(0, len(articles["articles"])):
            headlines.append(articles["articles"][i]["title"])
        all_headlines.append(headlines)
        all_dates.append(date)
        date = date - timedelta(days=1)
    return all_headlines, all_dates

In [30]:
goldman_headlines, dates = get_headlines("Goldman Sachs Group Inc.")

Fetching news about 'Goldman Sachs Group Inc.'
******************************
retrieving news from: 2021-07-07 00:00:00
retrieving news from: 2021-07-06 00:00:00
retrieving news from: 2021-07-05 00:00:00
retrieving news from: 2021-07-04 00:00:00
retrieving news from: 2021-07-03 00:00:00
retrieving news from: 2021-07-02 00:00:00
retrieving news from: 2021-07-01 00:00:00
retrieving news from: 2021-06-30 00:00:00
retrieving news from: 2021-06-29 00:00:00
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00
retrieving news from: 2021-06-24 00:00:00
retrieving news from: 2021-06-23 00:00:00
retrieving news from: 2021-06-22 00:00:00
retrieving news from: 2021-06-21 00:00:00
retrieving news from: 2021-06-20 00:00:00
retrieving news from: 2021-06-19 00:00:00
retrieving news from: 2021-06-18 00:00:00
retrieving news from: 2021-06-17 00:00:00
retrieving news from: 2021-06-16 00:00:0

In [31]:
# Instantiate SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [32]:
# Create function that computes average compound sentiment of headlines for each day
def headline_sentiment_summarizer_avg(headlines):
    sentiment = []
    for day in headlines:
        day_score = []
        for h in day:
            if h == None:
                continue
            else:
                day_score.append(sid.polarity_scores(h)["compound"])
        sentiment.append(sum(day_score) / len(day_score))
    return sentiment

In [33]:
# Get averages of each topics sentiment
goldman_avg = headline_sentiment_summarizer_avg(goldman_headlines)


In [34]:
# Combine Sentiment Averages into DataFrame
topic_sentiments = pd.DataFrame(
    {
        "goldman_avg": goldman_avg
    }
)

In [35]:
# Set the index value of the sentiment averages DataFrame to be the series of dates.
topic_sentiments.index = pd.to_datetime(dates)


In [36]:
# Merge with goldman returns
topic_sentiments = df.join(topic_sentiments).dropna(how="any")

# Display data
display(topic_sentiments)

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled,goldman_avg
2021-06-08,385.79,386.4799,382.22,384.8,1946972,0.675004,-0.574854,0.130225
2021-06-09,383.44,384.27,378.88,382.78,1817307,1.02935,0.041268,0.03605
2021-06-10,389.08,389.64,372.345,373.75,3302197,0.377338,-1.092424,0.135205
2021-06-11,375.47,378.75,375.11,378.23,1692723,0.831756,-0.3023,0.08183
2021-06-14,377.43,378.75,370.7,373.07,2023651,0.639331,-0.636881,0.014035
2021-06-15,373.5,374.84,367.16,371.6,2028770,1.209282,0.354127,0.15956
2021-06-16,370.99,374.0788,365.25,371.07,2673244,1.593429,1.022067,0.084235
2021-06-17,373.52,373.52,356.55,361.5,3677802,1.388305,0.665404,0.10216
2021-06-18,356.72,358.38,348.125,348.79,4364156,0.191023,-1.416381,0.16972
2021-06-21,352.59,357.97,351.04,357.68,2295332,1.891522,1.540379,0.34239


# Build and Train Random Forest Regressor with a window specified

In [38]:
# This function accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y
def window_data(df, window, feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        features = df.iloc[i:(i + window), feature_col_number]
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)



In [39]:
# Predict Closing Prices using a 10 day window of previous fng values
# Then, experiment with window sizes anywhere from 1 to 10 and see how the model performance changes
window_size = 10

# Column index 1 is the 'Goldman Average' column
# Column index 0 is the `Close` column
feature_column = 1
target_column = 0
X, y = window_data(topic_sentiments, window_size, feature_column, target_column)

In [40]:
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [65]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [66]:

X_train

array([[-1.27050997, -1.86746805, -1.80566387, -1.27997363, -0.6219017 ,
         0.66015175,  0.70291303,  0.56038819,  1.56394643,  1.60862927],
       [ 1.03853833,  1.32605984,  0.4238157 ,  0.98159516,  0.86297371,
         1.09330187,  1.06220827, -1.27567806, -1.14907759, -1.11080177],
       [ 1.51321422,  0.77745956,  1.59220032,  0.98159516,  1.33270371,
         1.1963761 ,  1.14691954,  1.37427837, -1.09379231, -1.36755478],
       [-0.14713113, -0.18591113, -0.07735586,  0.35477986, -1.11445746,
        -1.08799379, -1.00554712, -0.55805445,  0.59173454,  0.40416194],
       [-0.14713113,  0.21353525,  0.00431305,  0.42175202,  0.70439478,
        -1.03247561, -1.29509352, -1.01313283, -0.5409395 ,  0.36788045],
       [-0.98698032, -0.26367548, -0.13730933, -1.45974857, -1.16371304,
        -0.82936033, -0.6114002 ,  0.91219878,  0.62812844,  0.09768488]])

In [67]:
X_test

array([[358.38  , 357.97  , 359.88  , 362.48  , 370.88  , 371.1499,
        368.87  , 378.09  , 380.11  , 381.64  ],
       [389.64  , 378.75  , 378.75  , 374.84  , 374.0788, 373.52  ,
        358.38  , 357.97  , 359.88  , 362.48  ],
       [374.0788, 373.52  , 358.38  , 357.97  , 359.88  , 362.48  ,
        370.88  , 371.1499, 368.87  , 378.09  ]])

In [68]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(random_state = 10) 

In [69]:
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test) 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [70]:
#This is kind of working like balance accuracy score
random_forest.score(X_test,y_test)

-0.31177527267735683

In [71]:
 print(random_forest.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

[371.214]


In [72]:
display(topic_sentiments)

Unnamed: 0,High,Low,Open,Close,Volume,stock_change,stock_change_scaled,goldman_avg
2021-06-08,385.79,386.4799,382.22,384.8,1946972,0.675004,-0.574854,0.130225
2021-06-09,383.44,384.27,378.88,382.78,1817307,1.02935,0.041268,0.03605
2021-06-10,389.08,389.64,372.345,373.75,3302197,0.377338,-1.092424,0.135205
2021-06-11,375.47,378.75,375.11,378.23,1692723,0.831756,-0.3023,0.08183
2021-06-14,377.43,378.75,370.7,373.07,2023651,0.639331,-0.636881,0.014035
2021-06-15,373.5,374.84,367.16,371.6,2028770,1.209282,0.354127,0.15956
2021-06-16,370.99,374.0788,365.25,371.07,2673244,1.593429,1.022067,0.084235
2021-06-17,373.52,373.52,356.55,361.5,3677802,1.388305,0.665404,0.10216
2021-06-18,356.72,358.38,348.125,348.79,4364156,0.191023,-1.416381,0.16972
2021-06-21,352.59,357.97,351.04,357.68,2295332,1.891522,1.540379,0.34239


# Random Forest Regressor without a window specified

In [73]:

# Split the X and y into X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

In [74]:
# Create the StandardScaler instance
data_scaler = StandardScaler()

In [75]:
# Fit the Standard Scaler with the training data
X_scaler = data_scaler.fit(X_train)

In [76]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [77]:
# Resample the training data with the RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(random_state = 10) 

In [78]:
random_forest.fit(X_train_scaled, y_train)
y_pred = random_forest.predict(X_test_scaled) 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [79]:
# This is reporting the r2 score 
random_forest.score(X_test_scaled, y_test)

0.7901725839569592

# Linear Regression model

In [57]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression

# Fit a SKLearn linear regression using just the training set (X_train, Y_train):

In [58]:
# Create the StandardScaler instance
#data_scaler = StandardScaler()

In [59]:
# Fit the Standard Scaler with the training data
#X_scaler = data_scaler.fit(X_train)

In [80]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression

# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
!
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [81]:
# Make a prediction of "y" values using just the test dataset

predicted_y_values = model.predict(X_test)

In [82]:
model.score(X, y)

0.7661404040835539

In [63]:
x_range = np.linspace(X.min(), X.max(), 10)
y_range = model.predict(x_range.reshape(-10,10))

In [83]:
import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure([
    go.Scatter(x=X_train_scaled.squeeze(), y=y_train, name='train', mode='markers'),
    go.Scatter(x=X_test_scaled.squeeze(), y=y_test, name='test', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction')
])
fig.show()