Uncommon dependencies

In [49]:
# ! pip install openai

# Current Application of Increase Prediction Model

This notebook applies the trained model (`./model/increase_pred.pkl`) to current-day data. To score sentiment about the company, news articles are pulled from NewsAPI.org. The headlines are then sent to ChatGPT, which rates each one out of 100 for how positive it is for the company. Finally, the sentiment data is combined with Yahoo Finance data, features are created, and the result is fed to the model.

Note: This notebook is currently hard-coded for GOOGL; it has not yet been generalized to work with other companies.

# Get Headline Data for the past month

In [86]:
import requests
import json
import datetime

import os

# Query window: the 30 days ending today.
today = datetime.date.today()
one_month_ago = today - datetime.timedelta(days=30)

# Start of the window formatted as year-month-day for the API query.
date_str = one_month_ago.strftime("%Y-%m-%d")

# Read the NewsAPI key from the environment so the credential is never stored
# in the notebook or its version history.
news_api_key = os.environ.get("NEWSAPI_KEY")
if not news_api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

# Endpoint and query parameters are kept separate so that requests performs
# the URL encoding instead of hand-built string interpolation.
url = "https://newsapi.org/v2/everything"
query_params = {
    "q": "google",
    "from": date_str,
    "to": today.isoformat(),
    "sortBy": "popularity",
    "apiKey": news_api_key,
}

# A timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# failure when the body is parsed later.
response = requests.get(url, params=query_params, timeout=30)
response.raise_for_status()

In [87]:
# Echo the response object; its repr shows the HTTP status code of the request.
response

<Response [200]>

In [88]:
# Decode the JSON text of the HTTP response into Python objects.
raw_body = response.text
results = json.loads(raw_body)

In [89]:
# Pull the headline and the publication timestamp out of every returned
# article; the two lists stay index-aligned because both walk the same
# sequence in the same order.
articles = [entry['title'] for entry in results["articles"]]
dates = [entry['publishedAt'] for entry in results["articles"]]

### Create a new dataframe with headlines and correctly formatted dates

In [90]:
from datetime import datetime
import pandas as pd

# Keep only the first 10 characters of each timestamp, i.e. the part that is
# parsed below as year-month-day.
formatted_dates = [stamp[:10] for stamp in dates]

# Parse the trimmed strings into datetime objects.
date_objs = list(
    map(lambda day_text: datetime.strptime(day_text, '%Y-%m-%d'), formatted_dates)
)

# One row per article: the headline next to its publication date.
headline_rows = list(zip(articles, date_objs))
df = pd.DataFrame(headline_rows, columns=['Articles', 'Dates'])

# Report how many distinct values each column holds.
num_unique = df.nunique()

print(num_unique)


Articles    100
Dates        26
dtype: int64


### Filter out unrelated headlines

In [91]:
import re

# Company and product terms; a headline is kept if it contains at least one of
# them. Matching is a case-sensitive substring match.
keywords = ['Google', 'Alphabet', 'Search', 'Advertising', 'Android', 'YouTube', 'Chrome', 'Maps', 'Gmail', 'Pixel', 'Assistant', 'Cloud', 'Drive', 'Play', 'Nexus', 'Chromebook']

# Escape each term so it is always matched literally, even if a keyword
# containing regex metacharacters is added to the list later.
keyword_pattern = '|'.join(re.escape(word) for word in keywords)

# na=False treats a missing headline as "no match" instead of producing NaN in
# the mask (which cannot be used for boolean indexing). .copy() makes the
# result an independent frame, so columns assigned to it later do not trigger
# SettingWithCopyWarning.
df = df[df['Articles'].str.contains(keyword_pattern, na=False)].copy()

# ChatGPT Sentiment Ranking
Sends each headline to ChatGPT and has it rate how positive the headline is for the company.

In [92]:
# Authenticate with the OpenAI API
import openai
import time

#open ai api call removed for safety


# Throttle: at most `requests_per_window` completion calls are sent per
# `window_seconds` of wall-clock time.
requests_per_window = 60
window_seconds = 60

count = 0
start_time = time.time()
for index, row in df.iterrows():
    # When the current window is full, wait out its remainder *before*
    # handling this row, so the row is still scored. (Doing the wait in an
    # `else` branch would consume the row without ever sending it.)
    if count >= requests_per_window:
        elapsed_time = time.time() - start_time
        print(elapsed_time)
        # Clamp at zero: time.sleep raises ValueError for a negative duration,
        # which would happen if the window already took longer than a minute.
        time.sleep(max(0.0, window_seconds - elapsed_time))
        count = 0
        start_time = time.time()

    count += 1
    prompt = f"On a scale of 0-100, how positive would you rate this headline for the related company? Be specific with your scale to the ones place. Headline: {row['Articles']}"

    completion = openai.Completion.create(
        engine="text-davinci-001",
        prompt=prompt,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Store the raw completion text; it is reduced to a number in a later cell.
    message = completion.choices[0].text
    df.loc[index, 'Sentiment'] = message





### Format sentiment results...
- Get rid of newline characters
- Remove character responses that ChatGPT sometimes generates

In [93]:
# Reduce each raw completion to a digit string, or '' when no unambiguous
# score is present.
#
# Every run of digits is collected first. A score is accepted only when the
# text contains exactly one run; joining all digits together would glue
# separate numbers into one (e.g. "0 to 5" -> "05", "75 out of 100" ->
# "75100") and could fabricate a value that looks valid. Newlines and other
# non-digit characters are ignored by the pattern, and missing values become
# the text 'nan', which has no digits and therefore maps to ''.
digit_runs = df['Sentiment'].astype(str).str.findall(r'\d+')
df['Sentiment'] = digit_runs.apply(lambda runs: runs[0] if len(runs) == 1 else '')
df

Unnamed: 0,Articles,Dates,Sentiment
0,Five Things Scammers Are Hoping You Google,2023-05-04,35
8,Google officially reveals the Pixel Fold,2023-05-04,75
9,Google reportedly halts construction of its gi...,2023-04-21,50
10,Google Authenticator finally syncs one-time co...,2023-04-24,70
11,'Godfather of AI' leaves Google amid ethical c...,2023-05-01,75
12,Google Maps is expanding Immersive View to routes,2023-05-10,75
13,Google Search Generative Experience preview: A...,2023-05-10,75
14,Google will reportedly release Pixel Watch 2 t...,2023-05-05,75
15,Google rolls out support for passkeys across i...,2023-05-03,60
...,...,...,...


In [94]:
# Count the distinct values in each column of the filtered, scored frame.
num_unique = df.nunique()

print(num_unique)

Articles     49
Dates        21
Sentiment    15
dtype: int64


### Combine sentiment and yahoo finance data 

In [95]:
from pandas_datareader import data as pdr
from datetime import date, timedelta
import yfinance as yf
import functions as funct

In [96]:
# Get stock info from Yahoo Finance.
# NOTE(review): pdr_override() presumably routes pandas_datareader's Yahoo
# reader through yfinance - confirm against the installed yfinance version.
yf.pdr_override()
ticker_symbol = 'GOOGL'
# Uses the same date window as the headline query (one_month_ago .. today).
# NOTE(review): verify whether `end` is inclusive for this reader.
data = pdr.get_data_yahoo(ticker_symbol, start=one_month_ago, end=today)

[*********************100%***********************]  1 of 1 completed


In [97]:
# Preview the downloaded price history.
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-17,104.660004,106.160004,104.519997,105.970001,105.970001,37571200
2023-04-18,106.489998,106.540001,104.070000,104.500000,104.500000,26596400
2023-04-19,103.580002,104.980003,103.070000,104.180000,104.180000,20905700
2023-04-20,103.910004,106.250000,103.870003,105.290001,105.290001,27820800
2023-04-21,105.470001,106.000000,104.779999,105.410004,105.410004,25800100
2023-04-24,105.489998,106.629997,104.699997,105.970001,105.970001,23542800
2023-04-25,105.830002,106.690002,103.839996,103.849998,103.849998,46664100
2023-04-26,104.919998,106.349998,102.629997,103.709999,103.709999,53347600
2023-04-27,104.449997,108.370003,103.540001,107.589996,107.589996,50089200
...,...,...,...,...,...,...


### Merge the two dataframes
This involves making sure that the indices are of the same type

In [98]:
# Move the publication date into the index so the headlines can be joined to
# the price data by date. The guard and the non-inplace call make the cell
# safe to re-run: once 'Dates' is already the index there is no such column
# left, and an unconditional set_index would raise KeyError.
if 'Dates' in df.columns:
    df = df.set_index('Dates')
df.index = pd.to_datetime(df.index)

# Both indexes should report the same type before merging.
print(type(data.index))
print(type(df.index))

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [99]:
# Inner join on the date index: only dates present in both the price data and
# the headline data are kept, and a date with several headlines yields several
# rows.
result = pd.merge(data, df, how='inner', left_index=True, right_index=True)

In [100]:
# Display the merged price and headline rows.
result

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Articles,Sentiment
2023-04-17,104.660004,106.160004,104.519997,105.970001,105.970001,37571200,Google's Pixel 7a may cost $50 more than its p...,68
2023-04-18,106.489998,106.540001,104.070000,104.500000,104.500000,26596400,All the rumors about Google’s upcoming Pixel h...,75
2023-04-19,103.580002,104.980003,103.070000,104.180000,104.180000,20905700,Google Fi adds a one-week eSIM trial and 'Wire...,75
2023-04-19,103.580002,104.980003,103.070000,104.180000,104.180000,20905700,Google Meet now lets you turn off distracting ...,75
2023-04-20,103.910004,106.250000,103.870003,105.290001,105.290001,27820800,Google reportedly plans to let companies use A...,50
2023-04-21,105.470001,106.000000,104.779999,105.410004,105.410004,25800100,Google reportedly halts construction of its gi...,50
2023-04-21,105.470001,106.000000,104.779999,105.410004,105.410004,25800100,The Chromecast with Google TV (HD) is down to ...,88
2023-04-24,105.489998,106.629997,104.699997,105.970001,105.970001,23542800,Google Authenticator finally syncs one-time co...,70
2023-04-24,105.489998,106.629997,104.699997,105.970001,105.970001,23542800,Chromebooks' short lifespans are creating 'pil...,0
...,...,...,...,...,...,...,...,...


Next, clean the data for the model, which involves:
- removing the articles column 
- making sure there is only one article per day
- and making sure GPT didn't make bad sentiment guesses

In [101]:
# Build the cleaned frame in one chain so every step works on a fresh object.
# This avoids assigning to a filtered slice (SettingWithCopyWarning) and lets
# the cell be re-run without failing on the already-removed column.
result = (
    result
    # The headline text is not a model feature.
    .drop(columns='Articles', errors='ignore')
    # '' marks headlines with no usable score; it cannot be converted to int.
    .loc[lambda frame: frame['Sentiment'] != '']
    .assign(Sentiment=lambda frame: frame['Sentiment'].astype('int'))
    # Keep only scores on the requested 0-100 scale (bounds inclusive).
    .loc[lambda frame: frame['Sentiment'].between(0, 100)]
)


In [102]:
# Keep a single row per date: the first occurrence of each index value is
# retained and later repeats are discarded. When every date is already unique
# the mask is all False and nothing is removed.
is_repeat_date = result.index.duplicated(keep='first')
result = result[~is_repeat_date]

In [103]:
# Display the cleaned frame with one row per date.
result

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Sentiment
2023-04-17,104.660004,106.160004,104.519997,105.970001,105.970001,37571200,68
2023-04-18,106.489998,106.540001,104.07,104.5,104.5,26596400,75
2023-04-19,103.580002,104.980003,103.07,104.18,104.18,20905700,75
2023-04-20,103.910004,106.25,103.870003,105.290001,105.290001,27820800,50
2023-04-21,105.470001,106.0,104.779999,105.410004,105.410004,25800100,50
2023-04-24,105.489998,106.629997,104.699997,105.970001,105.970001,23542800,70
2023-04-25,105.830002,106.690002,103.839996,103.849998,103.849998,46664100,75
2023-04-26,104.919998,106.349998,102.629997,103.709999,103.709999,53347600,91
2023-04-28,107.040001,107.349998,105.089996,107.339996,107.339996,36139800,75
2023-05-01,106.839996,107.989998,106.82,107.199997,107.199997,26681700,75


### Create Features
This also includes dropping the features that weren't as useful

In [104]:
# Feature builders from the project-local `functions` module, applied in
# order; each takes the frame and returns the frame passed to the next step.
# NOTE(review): judging by the resulting columns these add a daily return,
# ratio features and lagged features - confirm in functions.py.
feature_steps = (
    funct.create_daily_return,
    funct.create_top_feature_ratios,
    funct.create_lag_featues,
)
for build_features in feature_steps:
    result = build_features(result)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Sentiment,daily_return,Sentiment/volume,dr/volume,...,Volume_lag5,Volume_lag6,Volume_lag7,daily_return_lag1,daily_return_lag2,daily_return_lag3,daily_return_lag4,daily_return_lag5,daily_return_lag6,daily_return_lag7
2023-04-17,104.660004,106.160004,104.519997,105.970001,105.970001,37571200,68,1.309998,1.809897e-06,3.486707e-08,...,,,,,,,,,,
2023-04-18,106.489998,106.540001,104.07,104.5,104.5,26596400,75,-1.989998,2.819931e-06,-7.482208e-08,...,,,,1.309998,,,,,,
2023-04-19,103.580002,104.980003,103.07,104.18,104.18,20905700,75,0.599998,3.587538e-06,2.870023e-08,...,,,,-1.989998,1.309998,,,,,
2023-04-20,103.910004,106.25,103.870003,105.290001,105.290001,27820800,50,1.379997,1.797216e-06,4.960308e-08,...,,,,0.599998,-1.989998,1.309998,,,,
2023-04-21,105.470001,106.0,104.779999,105.410004,105.410004,25800100,50,-0.059998,1.937977e-06,-2.325478e-09,...,,,,1.379997,0.599998,-1.989998,1.309998,,,
2023-04-24,105.489998,106.629997,104.699997,105.970001,105.970001,23542800,70,0.480003,2.973308e-06,2.038854e-08,...,37571200.0,,,-0.059998,1.379997,0.599998,-1.989998,1.309998,,
2023-04-25,105.830002,106.690002,103.839996,103.849998,103.849998,46664100,75,-1.980003,1.607231e-06,-4.243098e-08,...,26596400.0,37571200.0,,0.480003,-0.059998,1.379997,0.599998,-1.989998,1.309998,
2023-04-26,104.919998,106.349998,102.629997,103.709999,103.709999,53347600,91,-1.209999,1.705794e-06,-2.268142e-08,...,20905700.0,26596400.0,37571200.0,-1.980003,0.480003,-0.059998,1.379997,0.599998,-1.989998,1.309998
2023-04-28,107.040001,107.349998,105.089996,107.339996,107.339996,36139800,75,0.299995,2.075274e-06,8.300971e-09,...,27820800.0,20905700.0,26596400.0,-1.209999,-1.980003,0.480003,-0.059998,1.379997,0.599998,-1.989998
2023-05-01,106.839996,107.989998,106.82,107.199997,107.199997,26681700,75,0.360001,2.810915e-06,1.349242e-08,...,25800100.0,27820800.0,20905700.0,0.299995,-1.209999,-1.980003,0.480003,-0.059998,1.379997,0.599998


In [105]:
# Columns removed before the frame is handed to the model.
unused_columns = [
    'Low', 'Open_lag2', 'Open_lag3', 'Open_lag7', 'High', 'Close',
    'Open_lag5', 'Open_lag4', 'Sentiment_lag2', 'Volume_lag7', 'Open',
    'Open_lag1', 'Open_lag6',
]
result = result.drop(columns=unused_columns)

In [106]:
# List the columns that remain in the feature frame.
result.columns

Index(['Adj Close', 'Volume', 'Sentiment', 'daily_return', 'Sentiment/volume',
       'dr/volume', 'Sentiment_lag1', 'Sentiment_lag3', 'Sentiment_lag4',
       'Sentiment_lag5', 'Sentiment_lag6', 'Sentiment_lag7', 'Volume_lag1',
       'Volume_lag2', 'Volume_lag3', 'Volume_lag4', 'Volume_lag5',
       'Volume_lag6', 'daily_return_lag1', 'daily_return_lag2',
       'daily_return_lag3', 'daily_return_lag4', 'daily_return_lag5',
       'daily_return_lag6', 'daily_return_lag7'],
      dtype='object')

### Use data from the last day for the model's prediction

In [107]:
# Take the final row of the feature frame as the single observation to predict on.
last_day = result.iloc[-1]

### Format Data and Deserialize the model

In [108]:
# Turn the selected row into a one-row, all-float frame: the Series becomes a
# single column, is transposed into a single row, then cast to float.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
input = last_day.to_frame().T.astype(float)

In [109]:
# Display the one-row frame that will be passed to the model.
input

Unnamed: 0,Adj Close,Volume,Sentiment,daily_return,Sentiment/volume,dr/volume,Sentiment_lag1,Sentiment_lag3,Sentiment_lag4,Sentiment_lag5,...,Volume_lag4,Volume_lag5,Volume_lag6,daily_return_lag1,daily_return_lag2,daily_return_lag3,daily_return_lag4,daily_return_lag5,daily_return_lag6,daily_return_lag7
2023-05-12,117.510002,41073800.0,78.0,0.830002,2e-06,2.020757e-08,70.0,0.0,75.0,35.0,...,26625100.0,23419500.0,21795400.0,1.169998,3.779999,-1.040001,0.75,-0.799995,-0.119995,-1.82


In [110]:
import pickle
import os

# Location of the serialized model, relative to the notebook's working directory.
model_path = './model/increase_pred.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Run the restored model on the one-row input frame.
predictions = model.predict(input)

In [116]:
# Interpret the first prediction (the input frame holds a single row):
# 1 means an increase is predicted, 0 means it is not.
prediction = predictions[0]
contains_1 = prediction == 1
contains_0 = prediction == 0

if contains_1:
    print("Opening price is predicting to increase tomorrow! : ) ")
elif contains_0:
    print("Opening price is NOT predicted to be an increase : ( ")
else:
    # Say so explicitly instead of finishing silently when the model returns
    # something other than the two expected labels.
    print(f"Unexpected model output {prediction!r}; expected 0 or 1.")

Opening price is NOT predicted to be an increase : ( 
