# Generating Model Data

The main purpose of this notebook is to use our new dataset (located in data/motley-fool-data.pkl) to create new training data for our model.

In [119]:
import importlib
import pandas as pd
import requests
import yfinance as yf
import os
import re
from datetime import timedelta, datetime
import time
from transformers import pipeline


import sys
project_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_dir)


from models import time_series
import stockinfo as si
importlib.reload(si)
from models import sentiment
# Reload to see changes


In [140]:
# Globals
EARNINGS_CALL_FILE = "../../data/motley-fool-data.pkl"
SENTIMENT_SAVE_LOCATION = "../../data/2021-motley-fool-sentiment.csv"

In [154]:
# Load in dataset
earnings_call_df = pd.read_pickle(EARNINGS_CALL_FILE)

In [155]:
earnings_call_df

Unnamed: 0,date,exchange,q,ticker,transcript
0,"Aug 27, 2020, 9:00 p.m. ET",NASDAQ: BILI,2020-Q2,BILI,"Prepared Remarks:\nOperator\nGood day, and wel..."
1,"Jul 30, 2020, 4:30 p.m. ET",NYSE: GFF,2020-Q3,GFF,Prepared Remarks:\nOperator\nThank you for sta...
2,"Oct 23, 2019, 5:00 p.m. ET",NASDAQ: LRCX,2020-Q1,LRCX,Prepared Remarks:\nOperator\nGood day and welc...
3,"Nov 6, 2019, 12:00 p.m. ET",NASDAQ: BBSI,2019-Q3,BBSI,"Prepared Remarks:\nOperator\nGood day, everyon..."
4,"Aug 7, 2019, 8:30 a.m. ET",NASDAQ: CSTE,2019-Q2,CSTE,Prepared Remarks:\nOperator\nGreetings and wel...
...,...,...,...,...,...
18750,"Nov 9, 2021, 1:00 p.m. ET",NYSE: SWX,2021-Q3,SWX,Prepared Remarks:\nOperator\nLadies and gentle...
18751,"Nov 18, 2021, 12:00 p.m. ET",NYSE: PNNT,2021-Q4,PNNT,"Prepared Remarks:\nOperator\nGood morning, and..."
18752,"Feb 08, 2022, 11:00 a.m. ET",NYSE: TDG,2022-Q1,TDG,Prepared Remarks:\nOperator\nThank you for sta...
18753,"Feb 28, 2022, 4:30 p.m. ET",NASDAQ: DVAX,2021-Q4,DVAX,"Prepared Remarks:\nOperator\nGood day, ladies ..."


We're only going to use sp_500 earnings calls since this dataset is large, and it takes a while to generate sentiment scores. Within this subset, we're also going to use only 500 earnings calls.

In [172]:
# Filter out non-sp500 tickers
sp500_tickers = si.get_sp500_tickers()
sp500_earnings_call_df = earnings_call_df[earnings_call_df['ticker'].isin(sp500_tickers)]
# sp500_earnings_call_df = sp500_earnings_call_df[sp500_earnings_call_df['q'].str.startswith("2021")]


# Randomly select 500
sp500_earnings_call_small_df = sp500_earnings_call_df.sample(n=500, random_state=1)

# Shape the dataset so it works well with finbert handler
# Extract just the date portion and combine with ticker
sentiment_df = sp500_earnings_call_small_df.copy()
sentiment_df['ticker'] = sentiment_df['ticker'] + '_' + sentiment_df['date'].str.split(',').str[0] + sentiment_df['date'].str.split(',').str[1]
sentiment_df = sentiment_df[['ticker', 'transcript']].rename(columns={'transcript': 'text'})
sentiment_df['text'] = sentiment_df['text'].apply(si.clean_transcript)

In [None]:
# Load file if it exists, if not runs sentiment (could take upwards of an hour)
if os.path.exists(SENTIMENT_SAVE_LOCATION):
    sentiment_df = pd.read_csv(SENTIMENT_SAVE_LOCATION, index_col=0)

sentiment_df = sentiment.finbert_handler(sentiment_df)
sentiment_df.to_csv(SENTIMENT_SAVE_LOCATION, index=True)

Processing documents:   1%|          | 3/500 [00:11<31:37,  3.82s/document]


KeyboardInterrupt: 

In [130]:
sentiment_df

Unnamed: 0,ticker,finbert_sentiment,sentiment_intensity,mean_neg_prob,mean_neu_prob,mean_pos_prob
4351,AMZN_Jul 30 2020,positive,0.309188,0.063605,0.563601,0.372793
7691,DOC_Feb 25 2021,positive,0.272611,0.056223,0.614944,0.328834
14542,AXON_Nov 15 2021,positive,0.300484,0.040746,0.618025,0.341230
14036,OXY_Aug 04 2021,positive,0.311152,0.034610,0.619629,0.345761
10361,SPGI_Apr 28 2020,negative,0.144456,0.123690,0.608163,0.268147
...,...,...,...,...,...,...
9120,EQR_Oct 28 2020,negative,0.051895,0.091274,0.765557,0.143169
14726,LULU_Dec 09 2021,positive,0.383679,0.054998,0.506325,0.438677
9769,NWS_May. 9 2019,positive,0.266291,0.113507,0.506695,0.379798
13983,AIG_Aug 06 2021,positive,0.347043,0.036450,0.580057,0.383493


Here we do some date conversion to prepare it for the `collect_all_features` function.

In [131]:
earnings_calls_with_sentiment_df = sentiment_df.rename(columns={'ticker': 'sentiment_ticker'}).join(sp500_earnings_call_small_df[['date', 'ticker']])
# convert date to datetime format
earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
earnings_calls_with_sentiment_df.dropna(inplace=True)

# Convert date to start_date in YYYY-MM-DD format
earnings_calls_with_sentiment_df['start_date'] = earnings_calls_with_sentiment_df['date'].dt.strftime('%Y-%m-%d')

# Create end_date by adding 90 days to the date
earnings_calls_with_sentiment_df['prediction_date'] = (earnings_calls_with_sentiment_df['date'] + pd.Timedelta(days=90)).dt.strftime('%Y-%m-%d')

  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_format=True, errors='coerce')
  earnings_calls_with_sentiment_df['date'] = pd.to_datetime(earnings_calls_with_sentiment_df['date'], infer_datetime_f

In [132]:
from stockinfo import collect_all_features

dataset = {}

# For each row in df, collect all features during that time period
for index, row in earnings_calls_with_sentiment_df.iterrows():
    try:
        ticker = row['ticker']
        start_date = row['start_date']
        prediction_date = row['prediction_date']

        features = collect_all_features(ticker, start_date, prediction_date, None, None, None)
        features['Earnings_Call_Sentiment'] = row['sentiment_intensity']
        features['sentiment_mean_neg_prob'] = row['mean_neg_prob']
        features['sentiment_mean_pos_prob'] = row['mean_pos_prob']
        features['sentiment_mean_neu_prob'] = row['mean_neu_prob']
        features.dropna(inplace=True)
        dataset[row['sentiment_ticker']] = features
    except:
        continue

Now we shape it into a single numpy array in sequnec format (using `prepare_sequence_data`) and save it.

In [134]:
target_column = 'Close'
target_idx = 3
look_back = 10
look_forward = 5
processed_data = {}

for id, data in dataset.items():
    X, y = time_series.prepare_sequence_data(data.values, look_back, look_forward, target_idx)
    if len(X.shape) != 3:
        print(f"Skipping {ticker} (unexpected X shape: {X.shape})")
        continue
    processed_data[id] = {'X': X, 'y': y}

In [135]:
import numpy as np
test_data = processed_data.popitem()
X_test = test_data[1]['X']
y_test = test_data[1]['y']

TRAINING_DATA_DIR = '../../models/training_data'

np.save(f'{TRAINING_DATA_DIR}/motley_X_test.npy', X_test)
np.save(f'{TRAINING_DATA_DIR}/motley_y_test.npy', y_test)

X_combined = np.vstack([seq['X'] for seq in processed_data.values()])
y_combined = np.concatenate([seq['y'] for seq in processed_data.values()])

np.save(f'{TRAINING_DATA_DIR}/motley_X_combined.npy', X_combined)
np.save(f'{TRAINING_DATA_DIR}/motley_y_combined.npy', y_combined)