In [10]:
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import finnhub

import yfinance as yf

import time
from textblob import TextBlob
import os
from dotenv import load_dotenv
load_dotenv()

True

In [11]:
# Symbol and date window used for the price download below.
ticker = "AAPL"
start_date = datetime(2020, 1, 1)
end_date = datetime(2022, 7, 30)

# Download the price history, remove the original "Close" column and let the
# "Adj Close" column take over the "Close" name.
stock_info = (
    yf.download([ticker], start=start_date, end=end_date)
    .drop(columns=["Close"])
    .rename(columns={"Adj Close": "Close"})
)
stock_info.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.059998,75.150002,73.797501,73.449394,135480400
2020-01-03,74.287498,75.144997,74.125,72.735321,146322800
2020-01-06,73.447502,74.989998,73.1875,73.314896,118387200
2020-01-07,74.959999,75.224998,74.370003,72.970085,108872000
2020-01-08,74.290001,76.110001,74.290001,74.14389,132079200


In [12]:
# Columns used as model features; add any new predictor column to this list.
predictors = [
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "Weekly Average",
    "Quarterly Average",
]

# Rolling means of the close. The windows count rows, and the price table has
# one row per trading day (weekends are absent), so 7 and 91 rows cover more
# than one calendar week / quarter.
# NOTE(review): confirm these window lengths are the intended ones.
stock_info["Weekly Average"] = stock_info["Close"].rolling(7).mean()
stock_info["Quarterly Average"] = stock_info["Close"].rolling(91).mean()

# Target is 1.0 when a row's close is above the previous row's close, 0.0
# otherwise, and NaN when there is no previous close to compare against.
# It is computed from the Close column only: rolling over the whole frame ran
# the Python callback on every column and failed on non-numeric columns.
close_change = stock_info["Close"].diff()
stock_info["Target"] = close_change.gt(0).astype(float).where(close_change.notna())

# Remove the warm-up rows for which the rolling windows are not yet full.
stock_info = stock_info.dropna()
stock_info.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Weekly Average,Quarterly Average,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-05-12,79.457497,79.922501,77.727501,76.541443,162301200,74.734664,71.013864,0.0
2020-05-13,78.037498,78.987503,75.802498,75.617271,200622400,75.271238,71.037687,0.0
2020-05-14,76.127502,77.447502,75.3825,76.081825,158929200,75.720097,71.074462,1.0
2020-05-15,75.087502,76.974998,75.052498,75.632004,166348400,75.997192,71.099924,0.0
2020-05-18,78.292503,79.125,77.580002,77.414001,135178400,76.419951,71.148759,1.0


In [13]:
# The Finnhub API key is read from the FINNHUB_API_KEY environment variable
# (raises KeyError when it is not set), so the token itself never has to be
# hardcoded in a notebook that might be shared with others.
FINNHUB_TOKEN = os.environ['FINNHUB_API_KEY']


def finnhub_data(ticker, start, end, delta):
    """Collect Finnhub company news for ``ticker`` between ``start`` and ``end``.

    The range is requested in consecutive windows of ``delta`` days; the last
    window is clamped so that it never reaches past ``end``.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``client.company_news``.
    start, end : datetime.datetime or datetime.date
        Inclusive bounds of the range to request.
    delta : int
        Window length in days; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per news item with columns ``date`` (a ``datetime.date`` in
        the machine's local time zone), ``headline`` and ``summary``. The
        frame is empty, with the same columns, when nothing was returned.

    Raises
    ------
    ValueError
        If ``delta`` is not positive (the window would never advance).
    """
    if delta <= 0:
        raise ValueError("delta must be a positive number of days")

    client = finnhub.Client(api_key=FINNHUB_TOKEN)

    step = timedelta(days=delta)
    one_day = timedelta(days=1)

    # Rows are gathered in a plain list and turned into a DataFrame once at
    # the end; concatenating a one-row frame per item is quadratic.
    records = []
    calls = 0

    window_start = start
    while window_start <= end:
        # Windows are inclusive on both sides, hence the one-day pull-back;
        # min() keeps the final window from overshooting `end`.
        window_end = min(window_start + step - one_day, end)
        news = client.company_news(
            ticker,
            _from=window_start.strftime("%Y-%m-%d"),
            to=window_end.strftime("%Y-%m-%d"),
        )
        calls += 1
        records.extend(
            {
                # Goes through `datetime` instead of the module-level `date`
                # name so the conversion still works if `date` is rebound
                # elsewhere in the notebook.
                "date": datetime.fromtimestamp(item["datetime"]).date(),
                "headline": item["headline"],
                "summary": item["summary"],
            }
            for item in news
        )
        window_start += step
        # Pause for 10 seconds after every 10th request
        # (presumably to stay under the API rate limit -- TODO confirm).
        if calls % 10 == 0:
            print(str(calls) + "api calls")
            time.sleep(10)

    return pd.DataFrame(records, columns=["date", "headline", "summary"])

In [14]:
def avg_sentiment(strings):
    """Return the mean TextBlob polarity over ``strings``.

    An empty collection yields 0 instead of dividing by zero.
    """
    count = len(strings)
    if count == 0:
        return 0
    polarity_sum = sum(TextBlob(text).sentiment.polarity for text in strings)
    return polarity_sum / count

In [15]:
# Columns used as model features, now including the news sentiment.
predictors = [
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "Weekly Average",
    "Quarterly Average",
    "Sentiment",
]
# make sure to add any new predictors into the predictors variable

# Sentiment given to trading days for which no headline was found.
# NOTE(review): avg_sentiment() returns 0 for an empty list; confirm that
# 0.1 is really the intended "no news" value here.
NO_NEWS_SENTIMENT = 0.1

news = finnhub_data(ticker, start_date, end_date, 3)

# Bucket the headlines by day once, instead of copying and scanning the whole
# news frame for every trading day.
headlines_by_day = {}
for news_day, headline in zip(news["date"], news["headline"]):
    headlines_by_day.setdefault(news_day, []).append(headline)

# The iteration variable is deliberately not called `date`: that would rebind
# the `date` class imported from datetime, which finnhub_data() relies on, and
# break any later re-run. Assigning the full list of floats at once also gives
# the column a numeric dtype instead of the object dtype left by a "" default.
stock_info["Sentiment"] = [
    avg_sentiment(headlines_by_day[trading_day.date()])
    if trading_day.date() in headlines_by_day
    else NO_NEWS_SENTIMENT
    for trading_day in stock_info.index
]

10api calls
20api calls
30api calls
40api calls
50api calls
60api calls
70api calls
80api calls
90api calls
100api calls
110api calls
120api calls
130api calls
140api calls
150api calls
160api calls
170api calls
180api calls
190api calls
200api calls
210api calls
220api calls
230api calls
240api calls
250api calls
260api calls
270api calls
280api calls
290api calls
300api calls
310api calls


In [16]:
# Show the first rows of stock_info, which now include the Sentiment column.
stock_info.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Weekly Average,Quarterly Average,Target,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-12,79.457497,79.922501,77.727501,76.541443,162301200,74.734664,71.013864,0.0,0.1
2020-05-13,78.037498,78.987503,75.802498,75.617271,200622400,75.271238,71.037687,0.0,0.1
2020-05-14,76.127502,77.447502,75.3825,76.081825,158929200,75.720097,71.074462,1.0,0.1
2020-05-15,75.087502,76.974998,75.052498,75.632004,166348400,75.997192,71.099924,0.0,0.1
2020-05-18,78.292503,79.125,77.580002,77.414001,135178400,76.419951,71.148759,1.0,0.1
