In [1]:
# Import Libraries
import pandas as pd 
import numpy as np 
import yfinance as yf
from datetime import datetime, timedelta
import os
from newsapi import NewsApiClient
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dotenv import load_dotenv, find_dotenv
load_dotenv("newsapi.env")
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter
import talib

### Stock Data Pull

In [2]:
# Get S&P500 symbols ranked by market cap 
sp500_by_marketcap = pd.read_csv('SP500_By_MarketCap.csv')
sp500_by_marketcap.head()

Unnamed: 0,Symbol,Name,Industry,Market Cap
0,AAPL,Apple Inc.,Electronic Technology,"$ 2,986,128,347,290.24"
1,MSFT,Microsoft Corporation,Technology Services,"$ 2,513,296,516,647.36"
2,GOOG,Alphabet Inc.,Technology Services,"$ 1,927,101,773,229.48"
3,GOOGL,Alphabet Inc.,Technology Services,"$ 1,923,705,624,039.54"
4,AMZN,"Amazon.com, Inc.",Retail Trade,"$ 1,728,404,755,739.39"


In [3]:
# Get tickers in a list

tickers = sp500_by_marketcap['Symbol'].head(2).tolist()
print(tickers)

['AAPL', 'MSFT']


In [4]:
# Set timeframe 
delta = 90
end = datetime.now()
start = datetime.now() - timedelta(delta)

In [5]:
# Get stock data from yfinance 
df_stock_data=pd.DataFrame()
for ticker in tickers:
    ticker_df = yf.download(ticker, start=start, end=end,interval="1D")
    ticker_df = ticker_df.reset_index()
    ticker_df["Stock"] = ticker
    # Creating a unique key by combining Ticker and Publish date
    ticker_df["unique_key"] = ticker_df["Stock"]+ticker_df["Date"].astype(str)
    df_stock_data =pd.concat([df_stock_data,ticker_df],axis=0)
df_stock_data = df_stock_data.set_index("unique_key")
df_stock_data.head()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume,Stock
unique_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAPL2022-01-13,2022-01-13,175.779999,176.619995,171.789993,172.190002,171.970901,84505800,AAPL
AAPL2022-01-14,2022-01-14,171.339996,173.779999,171.089996,173.070007,172.849792,80440800,AAPL
AAPL2022-01-18,2022-01-18,171.509995,172.539993,169.410004,169.800003,169.583939,90956700,AAPL
AAPL2022-01-19,2022-01-19,170.0,171.080002,165.940002,166.229996,166.018478,94815000,AAPL
AAPL2022-01-20,2022-01-20,166.979996,169.679993,164.179993,164.509995,164.300659,91420500,AAPL


In [6]:
# Create "Returns" column 
returns_df = pd.DataFrame()
returns_df = pd.DataFrame(df_stock_data["Adj Close"].pct_change().shift(-1))
returns_df = returns_df.rename(columns = {"Adj Close": "Returns"})
returns_df

Unnamed: 0_level_0,Returns
unique_key,Unnamed: 1_level_1
AAPL2022-01-13,0.005111
AAPL2022-01-14,-0.018894
AAPL2022-01-18,-0.021025
AAPL2022-01-19,-0.010347
AAPL2022-01-20,-0.012765
...,...
MSFT2022-04-06,0.006244
MSFT2022-04-07,-0.014600
MSFT2022-04-08,-0.039432
MSFT2022-04-11,-0.011218


In [7]:
df_stock_data = pd.concat([df_stock_data,returns_df],join = 'outer',axis = 1)
df_stock_data

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume,Stock,Returns
unique_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL2022-01-13,2022-01-13,175.779999,176.619995,171.789993,172.190002,171.970901,84505800,AAPL,0.005111
AAPL2022-01-14,2022-01-14,171.339996,173.779999,171.089996,173.070007,172.849792,80440800,AAPL,-0.018894
AAPL2022-01-18,2022-01-18,171.509995,172.539993,169.410004,169.800003,169.583939,90956700,AAPL,-0.021025
AAPL2022-01-19,2022-01-19,170.000000,171.080002,165.940002,166.229996,166.018478,94815000,AAPL,-0.010347
AAPL2022-01-20,2022-01-20,166.979996,169.679993,164.179993,164.509995,164.300659,91420500,AAPL,-0.012765
...,...,...,...,...,...,...,...,...,...
MSFT2022-04-06,2022-04-06,305.190002,307.000000,296.709991,299.500000,299.500000,40110400,MSFT,0.006244
MSFT2022-04-07,2022-04-07,296.660004,303.649994,296.350006,301.369995,301.369995,31411200,MSFT,-0.014600
MSFT2022-04-08,2022-04-08,300.440002,301.119995,296.279999,296.970001,296.970001,24347400,MSFT,-0.039432
MSFT2022-04-11,2022-04-11,291.790009,292.609985,285.000000,285.260010,285.260010,34569300,MSFT,-0.011218


### Sentiment Analysis

In [8]:
# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\denis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
load_dotenv()
find_dotenv()

Python-dotenv could not parse statement starting at line 7


'C:\\Users\\denis\\Desktop\\Fintech\\NW_Fintech_Project3\\.env'

In [10]:
# Reading the News API key enviroment variable
api_key = os.getenv("news_api")
type(api_key)

str

In [11]:
# Create a newsapi client
newsapi = NewsApiClient(api_key=api_key)

In [12]:
news_df = pd.DataFrame(index = pd.RangeIndex(start = 0, stop = 100))

for ticker in tickers:
    ticker_headlines = newsapi.get_everything(q=ticker,
                                              language="en",
                                              page_size=100,
                                              sort_by="relevancy"
                                             )
    ticker_sentiments = []
    
    for article in ticker_headlines["articles"]:
        try:
            text = article["content"]
            date = article["publishedAt"][:10]
            sentiment = analyzer.polarity_scores(text)
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]

            ticker_sentiments.append({
                "text": text,
                "Stock" : ticker,
                "date": date,
                "compound": compound,
                "positive": pos,
                "negative": neg,
                "neutral": neu

            })

        except AttributeError:
            pass
# Create DataFrame
    ticker_df = pd.DataFrame(ticker_sentiments)
    
# Reorder DataFrame columns
cols = ["date", "Stock", "text", "compound", "positive", "negative", "neutral"]
ticker_df = ticker_df[cols]
ticker_df.head()

Unnamed: 0,date,Stock,text,compound,positive,negative,neutral
0,2022-03-21,MSFT,One characteristic of a high-quality stock is ...,0.5106,0.102,0.0,0.898
1,2022-03-20,MSFT,Two tried and true ways to tame market uncerta...,0.7269,0.203,0.057,0.74
2,2022-04-06,MSFT,IZEA Worldwide (IZEA) has garnered significant...,0.5346,0.154,0.0,0.846
3,2022-03-23,MSFT,"Microsoft says the hacking group LAPSUS$, whic...",-0.1779,0.068,0.087,0.845
4,2022-03-23,MSFT,Rugged mobile devices provider Sonim Technolog...,0.8225,0.229,0.0,0.771


In [14]:
ticker_df.tail()

Unnamed: 0,date,Stock,text,compound,positive,negative,neutral
95,2022-04-08,MSFT,HJBC/iStock Editorial via Getty Images\r\nMicr...,-0.0258,0.0,0.034,0.966
96,2022-04-07,MSFT,Mario Tama/Getty Images News\r\nApple (NASDAQ:...,0.7184,0.212,0.0,0.788
97,2022-04-01,MSFT,When it comes to supremacy in the video game i...,-0.5267,0.062,0.121,0.817
98,2022-03-14,MSFT,Apple (AAPL) - Get Apple Inc. Report stock is...,0.0,0.0,0.0,1.0
99,2022-04-03,MSFT,Jeff Bezos became a billionaire and one of the...,0.6249,0.124,0.0,0.876


### Techincal Indicator Preparation

In [None]:
# Simple Moving Average for 90 day period
sma = talib.SMA(df_stock_data['Adj Close'], 90)
sma

In [None]:
# Exponential Moving Average for 90 day period
ema = talib.EMA(df_stock_data['Adj Close'], timeperiod=90)
ema

In [None]:
# Momentum for 90 day period
mom = talib.MOM(df_stock_data['Adj Close'],timeperiod=90)
mom

In [None]:
# Average Directional Movement
adx = talib.ADX(df_stock_data['High'],df_stock_data['Low'],df_stock_data['Adj Close'],timeperiod=90)
adx

In [None]:
# Normalized Average True Range for 90 day period
natr = talib.NATR(df_stock_data['High'],df_stock_data['Low'],df_stock_data['Adj Close'],timeperiod=90)
natr

In [None]:
# Linear Regression for 90 day period
linreg = talib.LINEARREG(df_stock_data['Adj Close'], timeperiod=90)
linreg

In [None]:
# Hilbert Transform Trend vs Cycle Mode
httrend = talib.HT_TRENDMODE(df_stock_data['Adj Close'])
httrend

In [None]:
# Relative Strength for 90 day period
rsi = talib.RSI(df_stock_data['Adj Close'], timeperiod=90)
rsi

In [None]:
# Typical Price 
typprice = talib.TYPPRICE(df_stock_data['High'],df_stock_data['Low'],df_stock_data['Adj Close'])
typprice

In [None]:
# MFI- Money Flow Index
mfi = talib.MFI(df_stock_data['High'],df_stock_data['Low'], df_stock_data['Adj Close'],df_stock_data['Volume'],timeperiod=90)
mfi

In [None]:
# ADOSC-Chaikin A/D Oscillator
adosc = talib.ADOSC(df_stock_data['High'],df_stock_data['Low'],df_stock_data['Adj Close'],df_stock_data['Volume'],fastperiod=3,slowperiod=10)
adosc

In [None]:
# Hilbert Transform - Dominant Cycle Period
domcycle= talib.HT_DCPERIOD(df_stock_data['Adj Close'])
domcycle

In [None]:
# Hilbert Transform - Dominant Cycle Phase
dom_cyc_phs = talib.HT_DCPHASE(df_stock_data['Adj Close'])
dom_cyc_phs

In [None]:
# Create DataFrame
indicator_df = pd.DataFrame

### Modeling Data Preperation

In [13]:
# Preparing Dataset X & y

target = 

X = final_df.copy()
X.drop(target, axis=1, inplace=True)
X.head()

y = all_df[target]
y.head()

SyntaxError: invalid syntax (<ipython-input-13-3d6ece077860>, line 3)

In [None]:
# Preparing Test & Train Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
#Check if there is a class imbalance 
Counter(y_train) 

In [None]:
# Scale data 
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Logistic Regression

In [None]:
# Fit a logistic regression model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred_lr = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred_lr)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

### Random Forest

In [None]:
# Fit a Random Forest Classifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf.fit(X_train, y_train)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred_brf = model.predict(X_test)
confusion_matrix(y_test, y_pred_brf)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred_brf))

In [None]:
# List the features sorted in descending order by feature importance
importances_sorted_brf = sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)
importances_sorted_brf

### Gradient Boost

In [None]:
# Choose learning rate
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=3,
        random_state=0)
    model.fit(X_train_scaled,y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        model.score(
            X_train_scaled,
            y_train.ravel())))
    print("Accuracy score (validation): {0:.3f}".format(
        model.score(
            X_test_scaled,
            y_test.ravel())))
    print()

In [None]:
# Create GradientBoostingClassifier model
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=1,
    max_features=5,
    max_depth=3,
    random_state=0)

# Fit the model
model.fit(X_train_scaled,y_train.ravel())

# Score the model
print("Accuracy score (training): {0:.3f}".format(
    model.score(
        X_train_scaled,
        y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    model.score(
        X_test_scaled,
        y_test)))

In [None]:
# Make predictions
predictions = model.predict(X_test_scaled)

# Generate accuracy score for predictions using y_test
accuracy_score(y_test, predictions)

In [None]:
# Generate classification report
print(classification_report(y_test, predictions))