# stock2vec

Create vectors for stocks, word2vec-style, from the stocks that sit next to them when each day's stocks are ranked by price-to-earnings (P/E) ratio.

In [1]:
import csv
import datetime
import math
import multiprocessing as mp
import numpy as np
import pandas as pd
import os
import random
import sys
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import time
from functools import partial
from tqdm import tqdm

# %config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
%matplotlib notebook

Load the daily adjusted close price, diluted earnings per share (EPS) and price-to-earnings (P/E) ratio for each ticker.

In [2]:
# Where to download the prices CSV from.
# The notebook used to hard-code a pre-signed S3 URL here. That URL embedded
# temporary AWS credentials (X-Amz-Security-Token / X-Amz-Credential) and was
# only valid for 300 seconds after 2017-03-28T01:07:00Z, so it leaked secrets
# without being usable. Supply a fresh (e.g. newly signed) URL through the
# STOCK2VEC_PRICES_URL environment variable when the plain object URL is not
# publicly readable.
CSV_URL = os.environ.get('STOCK2VEC_PRICES_URL',
                         'https://s3.amazonaws.com/perl-ml/prices.csv')
FILE_NAME = 'input/prices.csv'                   # local copy of the CSV
LOG_DIR = 'output'                               # checkpoints + TensorBoard files
MODEL_PATH = os.path.join(LOG_DIR, "model.ckpt")
STOCK_PATH = os.path.join(LOG_DIR,'stock.tsv')   # embedding metadata (tickers)

# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(LOG_DIR, exist_ok=True)

In [3]:
from urllib.request import urlretrieve
from os.path import isfile, isdir

# Create the directory that will hold the downloaded CSV. It is derived from
# FILE_NAME so the two cannot drift apart, and exist_ok makes re-running the
# cell (or a concurrent creation) harmless.
os.makedirs(os.path.dirname(FILE_NAME) or '.', exist_ok=True)

class DLProgress(tqdm):
    """Progress bar whose ``hook`` method can be passed to ``urlretrieve``."""

    # Block count seen on the previous ``hook`` call.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar by the data transferred since the previous call.

        block_num  -- number of blocks reported so far
        block_size -- size of one block
        total_size -- overall size; stored as the bar's total on every call
        """
        blocks_since_last_call = block_num - self.last_block
        self.total = total_size
        self.update(blocks_since_last_call * block_size)
        self.last_block = block_num

# Download the prices CSV unless a local copy already exists.
# The transfer is written to a ".part" file and renamed only after
# urlretrieve returns: an interrupted or failed download therefore never
# leaves a truncated FILE_NAME behind that the isfile() check would accept
# as complete on the next run.
if not isfile(FILE_NAME):
    partial_path = FILE_NAME + '.part'
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Prices') as pbar:
        urlretrieve(CSV_URL, partial_path, pbar.hook)
    os.replace(partial_path, FILE_NAME)

In [4]:
chunksize = 1000000
price_rows = 9191528  # expected number of rows; only sizes the progress bar

# Read the CSV in chunks so progress can be reported while loading.
price_reader = pd.read_csv(FILE_NAME,
                           header=None,
                           parse_dates=[1],
                           chunksize=chunksize,
                           iterator=True)

# Collect the chunks and concatenate once at the end. Appending each chunk to
# a growing DataFrame re-copies all earlier rows every time, and
# DataFrame.append no longer exists in pandas 2.x.
price_chunks = []

with tqdm(total=price_rows, desc='rows') as pbar:
    for chunk in price_reader:
        price_chunks.append(chunk)
        # Advance by the rows actually read; the last chunk is usually short.
        pbar.update(len(chunk))

df_prices = pd.concat(price_chunks)
df_prices.columns = ['adj_close', 'date', 'ticker', 'epsdil', 'pe']

# Sort by date, then by P/E: within one day, neighbouring rows are then the
# stocks with the most similar P/E, which is what get_window relies on.
df_prices = df_prices.sort_values(['date', 'pe'])

print(df_prices.head())

# Plain Python lists, indexed by row position in the sorted frame.
prices = df_prices['adj_close'].values.tolist()
dates = df_prices['date'].values.tolist()
tickers = df_prices['ticker'].values.tolist()
pes = df_prices['pe'].values.tolist()

rows: 10000000it [00:14, 683810.80it/s]                            


         adj_close       date ticker  epsdil         pe
45526    13.745073 2001-06-18   OLED   -0.87 -15.798935
3046709   6.147269 2001-06-18    YUM    0.69   8.909085
1778402  43.419875 2001-06-18    SWY    2.31  18.796483
315864   29.645033 2001-06-18    PEP    1.50  19.763356
1743245  11.213455 2001-06-18    SVU    0.47  23.858415


In [9]:
# Histogram of rounded P/E values over one year of data, scaled down to an
# approximate per-day count.
TRADING_DAYS = 250  # divisor used to turn yearly counts into per-day counts

# Use pandas Timestamps for the bounds: the 'date' column is parsed as
# datetimes, and comparing it against datetime.date objects is not supported
# by recent pandas releases.
window_start = pd.Timestamp(year=2001, month=3, day=3)
window_end = pd.Timestamp(year=2002, month=3, day=3)
in_window = (df_prices['date'] > window_start) & (df_prices['date'] < window_end)
day = df_prices.loc[in_window, 'pe'].tolist()

frequency = {}

for pe in day:
    # NaN / infinite ratios cannot be rounded to an int; leave them out.
    if not math.isfinite(pe):
        continue
    rounded_pe = int(round(pe))
    frequency[rounded_pe] = frequency.get(rounded_pe, 0) + 1

# Counts are non-negative integers, so floor division matches int(count / 250).
frequency = {rounded_pe: count // TRADING_DAYS
             for rounded_pe, count in frequency.items()}

fig, ax = plt.subplots()
ax.bar(list(frequency.keys()), list(frequency.values()), align='center')
ax.set_xlabel('P/E (rounded)')
ax.set_ylabel('stocks per day (approx.)')
ax.set_title('P/E distribution, 2001-03-03 to 2002-03-03')

plt.show()

<IPython.core.display.Javascript object>

## Build context

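For each stock, find up to C stocks whose price-to-earnings (P/E) ratio is closest to that stock's on the same day. Rows are sorted by date and then by P/E, so these are simply the neighbouring rows that share the stock's date.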

In [None]:
# Two-way vocabulary between ticker symbols and dense integer ids. Both dicts
# are filled lazily by get_ticker_int, in order of first lookup.
ticker_to_int = {}
int_to_ticker = {}

def get_ticker_int(idx):
    """Return the integer id of the ticker stored at position ``idx`` of ``tickers``.

    A ticker seen for the first time is given the next unused id (the current
    vocabulary size) and recorded in both lookup dicts.
    """
    symbol = tickers[idx]
    if symbol in ticker_to_int:
        return ticker_to_int[symbol]

    new_id = len(ticker_to_int)
    ticker_to_int[symbol] = new_id
    int_to_ticker[new_id] = symbol
    return new_id

def get_window(idx, total, window_size=5):
    """Return the ticker ids found in a random-sized window around row ``idx``.

    A radius R is drawn uniformly from 1..window_size and the rows
    idx-R .. idx+R (clipped to [0, total)) are scanned. A neighbour is kept
    only when it has the same date as row ``idx`` and a different ticker id.

    idx         -- row position of the centre stock
    total       -- number of rows available
    window_size -- largest radius that may be drawn

    Returns a list of ticker ids (possibly empty, possibly with repeats).
    Every scanned row is passed through get_ticker_int, so unseen tickers are
    assigned ids as a side effect.
    """
    R = np.random.randint(1, window_size+1)
    start = max(idx - R, 0)
    # range() excludes its stop, so the upper bound is idx + R + 1. Without
    # the +1 the window reached R rows back but only R - 1 rows forward.
    stop = min(idx + R + 1, total)

    stock_int = get_ticker_int(idx)
    stock_date = dates[idx]

    window = []

    for i in range(start, stop):
        nearby_stock_int = get_ticker_int(i)
        nearby_stock_date = dates[i]
        if nearby_stock_int != stock_int and nearby_stock_date == stock_date:
            window.append(nearby_stock_int)

    return window

# Spot check: show the context window of rows 0, 9 and 18.
total_rows = len(tickers)
for idx in (0, 9, 18):
    print('window for', idx, tickers[idx], get_ticker_int(idx))
    for nearby_int in get_window(idx, total_rows, 5):
        print(nearby_int, int_to_ticker[nearby_int])

In [None]:
batch_size = 10000   # rows of df_prices handled per batch
window_size = 10     # largest context radius passed to get_window

total_prices = len(prices)

# One progress tick per batch. Round up so that the final, shorter batch is
# counted as well; truncating left the bar one tick short whenever
# total_prices is not a multiple of batch_size.
pbar = tqdm(total=math.ceil(total_prices / batch_size))

def get_batch(start):
    """Build the (input, label) id pairs for rows ``start`` .. ``start + batch_size``.

    For each row, the row's ticker id is repeated once per id returned by
    get_window, so the two lists line up element by element. The range is
    clipped to ``total_prices``. Ticks the module-level progress bar once.

    Returns ``[x, y]``: x holds the centre ids, y the context ids.
    """
    stop = min(start + batch_size, total_prices)

    x, y = [], []
    for row in range(start, stop):
        centre_id = get_ticker_int(row)
        context_ids = get_window(row, total_prices, window_size)
        y += context_ids
        x += [centre_id] * len(context_ids)

    pbar.update()

    return [x, y]

def get_batches():
    """Return the list of all batches, one per ``batch_size`` rows of the data."""
    return [get_batch(start) for start in range(0, total_prices, batch_size)]

# Materialise every batch up front; the training loop iterates over this list.
batches = get_batches()

In [None]:
# Save embedding metadata
# Line i of the file must name the ticker whose id is i, because the ids are
# used as row indices into the embedding matrix. Write the tickers by id
# instead of relying on the iteration order of the dict.
with open(STOCK_PATH, 'w') as out:
    out.write('\n'.join(int_to_ticker[i] for i in range(len(int_to_ticker))))

## Build the Graph

In [None]:
# Size of the embedding table: one row per distinct ticker, n_embedding columns.
n_embedding = 400 # Number of embedding features 
n_stocks = len(df_prices['ticker'].unique())

# All ops of the model live in their own graph so later cells can reopen it.
train_graph = tf.Graph()
with train_graph.as_default():
    # Ticker ids of the centre stocks, one per training pair.
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    # Ticker ids of the context stocks; the training cell feeds these as a
    # single column, i.e. shape (batch, 1).
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    # Embedding matrix, randomly initialised with values between -1 and 1.
    embedding = tf.Variable(tf.random_uniform((n_stocks, n_embedding), -1, 1), name='stock_embedding')
    # Rows of the embedding matrix selected by `inputs`.
    embed = tf.nn.embedding_lookup(embedding, inputs)

## Negative sampling

In [None]:
# Number of negative labels to sample
# NOTE(review): presumably this must not exceed n_stocks - confirm for small
# ticker universes.
n_sampled = 100

with train_graph.as_default():
    # Output-layer parameters: one weight row and one bias per stock.
    softmax_w = tf.Variable(tf.truncated_normal((n_stocks, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_stocks))
    
    # Calculate the loss using negative sampling
    # (sampled softmax over n_sampled sampled classes out of n_stocks).
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      labels, embed,
                                      n_sampled, n_stocks)
    
    # Scalar training objective: the mean of `loss`, minimised with Adam at
    # its default settings.
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [None]:
with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Number of stocks to evaluate similarity on.
    valid_window = 100
    # Ticker ids are handed out in order of first appearance (see
    # get_ticker_int), so - unlike in the word2vec original - a low id does
    # not mean "more frequent".
    if n_stocks >= 1000 + valid_window:
        # Pick 8 ids from [0, 100) and 8 from [1000, 1100).
        valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
        valid_examples = np.append(valid_examples,
                                   random.sample(range(1000,1000+valid_window), valid_size//2))
    else:
        # Fewer than 1100 stocks: ids from the fixed ranges would fall outside
        # the embedding matrix, so sample from the ids that actually exist.
        valid_size = min(valid_size, n_stocks)
        valid_examples = np.array(random.sample(range(n_stocks), valid_size))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # We use the cosine distance:
    # rows are scaled to unit length, so the matrix product below holds the
    # cosine similarity of each validation stock to every stock.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

## Training

In [None]:
epochs = 10
log_every = 100      # iterations between training-loss reports
eval_every = 10000   # iterations between nearest-neighbour reports
top_k = 8            # number of nearest neighbors

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    # Loss accumulated since the last report. Kept under its own name so the
    # `loss` tensor defined when the graph was built is not overwritten.
    running_loss = 0
    sess.run(tf.global_variables_initializer())

    for e in range(1, epochs+1):
        start = time.time()
        for batch in batches:
            x, y = batch

            # Labels are fed as a column: one context id per input id.
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

            running_loss += train_loss

            if iteration % log_every == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(running_loss/log_every),
                      "{:.4f} sec/batch".format((end-start)/log_every))
                running_loss = 0
                start = time.time()

            # NOTE(review): if epochs * len(batches) is smaller than
            # eval_every, this report never runs - lower eval_every if the
            # neighbour listing is wanted.
            if iteration % eval_every == 0:
                # note that this is expensive (~20% slowdown if computed every 500 steps)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_stock = int_to_ticker[valid_examples[i]]
                    # Position 0 of the ranking is skipped; the following
                    # top_k entries are reported.
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s:' % valid_stock
                    for k in range(top_k):
                        try:
                            close_stock = int_to_ticker[nearest[k]]
                            log = '%s %s,' % (log, close_stock)
                        except KeyError:
                            # Embedding row whose id never received a ticker.
                            print('nearest[k]', nearest[k])
                    print(log)

            iteration += 1
    # Persist the trained variables and keep the unit-length embeddings for
    # the visualisation cells.
    save_path = saver.save(sess, MODEL_PATH)
    embed_mat = sess.run(normalized_embedding)

In [None]:
# Save the embedding for tensorboard

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, MODEL_PATH)

    # Describe which tensor to visualise and where its labels are stored.
    config = projector.ProjectorConfig()

    viz_embedding = config.embeddings.add()
    viz_embedding.tensor_name = embedding.name
    # NOTE(review): STOCK_PATH already contains LOG_DIR; depending on the
    # TensorBoard version this path may be resolved relative to the log
    # directory - confirm that the ticker labels load in the projector.
    viz_embedding.metadata_path = STOCK_PATH
    summary_writer = tf.summary.FileWriter(LOG_DIR)
    try:
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # Release the writer even if writing the projector config fails.
        summary_writer.close()

    saver.save(sess, MODEL_PATH)

In [None]:
from sklearn.manifold import TSNE

# Project at most the first 1000 embeddings to 2-D. The count is capped at
# the number of rows in embed_mat so that the plotting cell, which loops over
# range(viz_stocks), cannot index past the end of the projection.
viz_stocks = min(1000, len(embed_mat))
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_stocks, :])

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
# Never plot more points than the projection actually holds.
n_points = min(viz_stocks, len(embed_tsne))
# Draw all points with one scatter call instead of one call per stock.
ax.scatter(embed_tsne[:n_points, 0], embed_tsne[:n_points, 1], color='steelblue')
# Label every point with its ticker (row index == ticker id).
for idx in range(n_points):
    ax.annotate(int_to_ticker[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)