In [6]:
# Chris Clifford
# CS 613 - Final Project
# 12/10/20

In [7]:
import yaml
import pandas
import pytz
import requests
import csv
import numpy as np
import time
from datetime import datetime

In [8]:
# Config
with open(r'config/config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
    print(config['ameritrade-api'])

{'url': 'https://api.tdameritrade.com/v1'}


In [9]:
# Secret config
# Email crc339@drexel.edu for access
with open(r'config/config.secret.yaml') as file:
    secret = yaml.load(file, Loader=yaml.FullLoader)
    print(secret['news-api'])

{'key': 'd0d910b673284f7a8668ab9a3a12a664'}


In [15]:
DATA_FILE = 'data/analyst_ratings_processed.csv'
data = pandas.read_csv(DATA_FILE).to_numpy()
# Remove index column
data = data[:,1:]
data.shape

(1399178, 3)

In [16]:
data

array([['Stocks That Hit 52-Week Highs On Friday',
        '2020-06-05 10:30:00-04:00', 'A'],
       ['Stocks That Hit 52-Week Highs On Wednesday',
        '2020-06-03 10:45:00-04:00', 'A'],
       ['71 Biggest Movers From Friday', '2020-05-26 04:30:00-04:00',
        'A'],
       ...,
       ['UPDATE: Oppenheimer Color on China Zenix Auto Initiation',
        '2011-06-21 08:26:00-04:00', 'ZX'],
       ['Oppenheimer Initiates China Zenix At Outperform, $8 PT',
        '2011-06-21 05:59:00-04:00', 'ZX'],
       ['China Zenix Auto International Opens For Trading at $6.00; IPO Price Set at $6.00',
        '2011-05-12 09:36:00-04:00', 'ZX']], dtype=object)

In [17]:
np.random.seed(0)
np.random.shuffle(data)

In [19]:
# Remove news headlines that appear before/after market hours (US) or near open/close
# This helps us get cleaner data (we don't care about volatility spikes during open)
def during_market_hours(row):
    date_str = row[1]
    utc_time = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S%z').astimezone(pytz.utc)
    hr = utc_time.hour
    return 14 <= hr <= 19

mask = np.array([during_market_hours(d) for d in data])
data = data[mask]
data.shape

(525008, 3)

In [20]:
# Convert datetime to ms since epoch for Ameritrade API
def to_ms_since_epoch(date_str):
    try:
        EPOCH = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.UTC)

        utc_time = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S%z')
        return int((utc_time - EPOCH).total_seconds() * 1000)
    except TypeError or ValueError:
        print(date_str)
        print(f'Failed to convert datetime "{date_str}" to MS since epoch. Continuing...')

data = np.array([[row[0], to_ms_since_epoch(row[1]), row[2]] for row in data])
data.shape

(525008, 3)

In [21]:
# Returns (start - duration, start + duration)
def timeframe(start_ms, duration, duration_type):
    
    actual_duration_ms = 0
    
    if duration_type == 'ms':
        actual_duration_ms = duration
    elif duration_type == 'sec':
        actual_duration_ms = duration * 1000
    elif duration_type == 'min':
        actual_duration_ms = duration * 1000 * 60
    elif duration_type == 'hr':
        actual_duration_ms = duration * 1000 * 60 * 60
    elif duration_type == 'day':
        actual_duration_ms = duration * 1000 * 60 * 60 * 24
        
    end_ms = start_ms + actual_duration_ms
    
    return start_ms - actual_duration_ms, start_ms + actual_duration_ms

In [22]:
# Gets the volume data for the time period for a ticker
# Returns [] if there is some kind of error
def get_volume_data(start, end, ticker):
    MIN_DATE = 1585229820000
    
    if start < MIN_DATE:
        return []
    URL = config['ameritrade-api']['url']
    API_KEY = secret['ameritrade-api']['key']
    ENDPOINT = f'/marketdata/{ticker}/pricehistory'
    
    url = (f'{URL}'
           f'{ENDPOINT}?'
           f'apikey={API_KEY}&'
            'frequency=10&'
           f'startDate={start}&'
           f'endDate={end}')
       
    response = requests.get(url)
    json = response.json()
    
    if 'error' in json:
        # print(start, end, json['error'])
        return []
    
    d = np.array([(d['datetime'], d['volume']) for d in json['candles'] if start <= d['datetime'] < end])
    
    try:
        dates = d[:,[0]]
    except IndexError:
        # print('No data found for time frame.')
        return []
    
    vols = d[:,[1]]
    d_std = (vols - np.mean(vols)) / np.std(vols)
    time.sleep(.5)
    return np.concatenate((dates, d_std), axis=1)

LIMIT = 100000
TIME_SPAN = 1
# Get the volume data for each headline
vols = [(title, int(date), get_volume_data(timeframe(int(date), 1, 'hr')[0], timeframe(int(date), 1, 'hr')[1], ticker)) for title, date, ticker in zip(data[:LIMIT, 0], data[:LIMIT, 1], data[:LIMIT, 2])]
# Toss out bad data
volume_data = np.array([row for row in vols if len(row[2]) > 0])
volume_data.shape



(3228, 3)

In [23]:
# Identify the small, medium, and large volume spikes within the duration
def vol_spikes(vold, start, duration, duration_type):
    median = np.median(vold[:,1])
    stddev = np.std(vold[:,1])

    # print('Median: {:.3f}'.format(median))
    # print('Std Dev: {:.3f}'.format(stddev))

    end = timeframe(start, duration, duration_type)[1]

    tf_vold = vold[(start <= vold[:,0]) & (vold[:,0] <= end)]
    volumes = tf_vold[:,1]
    if len(volumes) == 0:
        return np.array([None, None, None])
    initial = volumes[0]

    upper = initial + stddev
    lower = initial - stddev
    spike65 = (volumes > upper).any() or (volumes < lower).any()
    #print('65%: {:.3f} - {:.3f}'.format(lower, upper))
    #print(f'Spike? {"Yes" if spike65 else "No"}')

    upper = initial + 2*stddev
    lower = initial - 2*stddev
    spike95 = (volumes > upper).any() or (volumes < lower).any()
    #print('95%: {:.3f} - {:.3f}'.format(lower, upper))
    #print(f'Spike? {"Yes" if spike95 else "No"}')

    upper = initial + 3*stddev
    lower = initial - 3*stddev
    spike99 = (volumes > upper).any() or (volumes < lower).any()
    #print('99%: {:.3f} - {:.3f}'.format(lower, upper))
    #print(f'Spike? {"Yes" if spike99 else "No"}')
    #print('')
    
    return np.array([1 if spike65 else 0, 1 if spike95 else 0, 1 if spike99 else 0], dtype=None)

temp = np.vstack([np.append(row[0], vol_spikes(row[2], row[1], 15, 'min')) for row in volume_data])
out_data = temp[temp[:,1] != None]



In [28]:
# Write the small, medium, and large volume spikes out to data files
out_data_with_headers = np.concatenate((np.array([['title', '65%', '95%', '99%']]), out_data))

out65 = out_data_with_headers[:,[0,1]]

OUT_FILE = 'data/processed65.csv'
with open(OUT_FILE, 'w', encoding='utf-8') as file:
    csv.writer(file).writerows(out65)

OUT_FILE = 'data/processed95.csv'
out95 = out_data_with_headers[:,[0,2]]
with open(OUT_FILE, 'w', encoding='utf-8') as file:
    csv.writer(file).writerows(out95)

OUT_FILE = 'data/processed99.csv'
out99 = out_data_with_headers[:,[0,3]]
with open(OUT_FILE, 'w', encoding='utf-8') as file:
    csv.writer(file).writerows(out99)

[["58 Stocks Moving In Wednesday's Mid-Day Session" '1' '0' '0']
 ['Shares of several airline companies are trading lower on continued weakness. The sector has been hurt by a decline in travel demand and Warren Buffett recently announced the sale of his holding in the space.'
  '1' '1' '1']
 ['Shares of several basic materials companies are trading higher as equities gain driven by a rebound in oil.'
  '0' '0' '0']
 ...
 ['Fitch Affirms Ratings On Deutsche Bank; Outlook Negative' '1' '0' '0']
 ['7 Starbucks Analysts On How The Coffee Giant Is Weathering A Pandemic'
  '0' '0' '0']
 ['Shares of several oil and gas companies are trading lower after API reported an 8.42 million barrel build in crude oil inventories.'
  '0' '0' '0']]
