In [15]:
#from nltk.tokenize import word_tokenize
import json
import requests
import time
import os, os.path
from sys import stdout
import math

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

In [16]:
# returns python object representation of JSON in response
def get_response(symbol, older_than, retries=5, timeout=10.0):
    """Fetch one page of the StockTwits stream for `symbol`.

    The query is sent with ``max=older_than-1`` (presumably the API then
    returns only messages at or below that id -- confirm against the API docs).

    Args:
        symbol: ticker symbol interpolated into the stream URL.
        older_than: integer message id; the request's ``max`` is one less.
        retries: maximum number of HTTP attempts.
        timeout: seconds each HTTP attempt may take before it is abandoned.

    Returns:
        The decoded JSON body on HTTP 200. None if the API answered 429
        (the response body is printed) or if no attempt succeeded.
    """
    url = 'https://api.stocktwits.com/api/2/streams/symbol/%s.json?max=%d' % (symbol, older_than-1)
    for attempt in range(retries):
        try:
            # without a timeout a stalled connection would block forever
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # connection problem or timeout: count it as a failed attempt
            response = None
        if response is not None:
            if response.status_code == 200:
                return json.loads(response.content)
            if response.status_code == 429:
                # rate limited: report what the server said and give up
                print(response.content)
                return None
        # pause before the next attempt, but not after the final one
        if attempt < retries - 1:
            time.sleep(1.0)
    # couldn't get response
    return None

In [17]:
# extends the current dataset for a given symbol with more tweets
def get_older_tweets(symbol, num_queries):
    """Extend the saved dataset for `symbol` with up to `num_queries` more pages.

    Loads ./data/SYMBOL.json when it exists (otherwise starts from an empty
    list), repeatedly calls get_response() asking for messages older than the
    last stored one, appends each page's 'messages' list, and writes the
    combined list back to the same file.

    Args:
        symbol: ticker symbol; also names the JSON file under ./data.
        num_queries: maximum number of get_response() calls to make.

    Querying stops early when get_response() returns None or when a page
    contains no messages. The file is written in every case, so pages fetched
    before a failure are kept.
    """
    path = './data/%s.json' % symbol
    if os.path.exists(path):
        # extending an existing json file
        with open(path, 'r') as f:
            data = json.load(f)
    else:
        # creating a new json file
        data = []
    # continue below the last stored id, or start from "any huge number"
    older_than = data[-1]['id'] if data else 1000000000000

    for i in range(num_queries):
        content = get_response(symbol, older_than)
        if content is None:
            print('Error, an API query timed out')
            break
        messages = content['messages']
        if not messages:
            # an empty page leaves older_than unchanged (or data empty), so
            # further queries would only repeat this one and use up the quota
            break
        data.extend(messages)
        older_than = data[-1]['id']
        stdout.write('\rSuccessfully made query %d' % (i+1))
        stdout.flush()
        # sleep to make sure we don't get throttled
        time.sleep(0.5)

    # write the new data to the JSON file, creating the directory if needed
    directory = os.path.dirname(path)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(path, 'w') as f:
        json.dump(data, f)
    # blank line terminates the '\r' progress line
    print('')
    print('Done')
    

In [18]:
# get some data
# apparently a client can only make 200 requests an hour, so we can't get all the data at once

# make data directory if needed
if not os.path.exists('./data'):
    os.mkdir('./data')

symbols = ['AAPL', 'TSLA', 'NVDA', 'AMD', 'JNUG', 'LABU']
tweets_per_symbol = 3000
# number of tweets each query is budgeted to return when sizing the download
tweets_per_query = 30
for symbol in symbols:
    path = './data/%s.json' % symbol
    # count what is already stored so only the shortfall is requested
    if os.path.exists(path):
        with open(path, 'r') as f:
            num_tweets = len(json.load(f))
    else:
        num_tweets = 0
    # ceiling of shortfall / tweets_per_query; `//` keeps the result an
    # integer regardless of how the interpreter treats `/`
    num_queries = (tweets_per_symbol - num_tweets - 1)//tweets_per_query + 1
    if num_queries > 0:
        print('Getting tweets for symbol %s' % symbol)
        get_older_tweets(symbol, num_queries)

Getting tweets for symbol JNUG
{"response":{"status":429},"errors":[{"message":"Rate limit exceeded. Client may not make more than 200 requests an hour."}]}
Error, an API query timed out

Done
Getting tweets for symbol LABU
{"response":{"status":429},"errors":[{"message":"Rate limit exceeded. Client may not make more than 200 requests an hour."}]}
Error, an API query timed out

Done


In [35]:
# check that we're doing the querying and appending correctly without getting duplicates
# and that message IDs are in descending order
symbol = 'TSLA'
with open('./data/%s.json' % symbol, 'r') as f:
    data = json.load(f)
# ids visited so far, used to detect repeats
seen_ids = set()
# upper bound the first id is compared against
previous_id = 1000000000000
for message in data:
    current_id = message['id']
    # no id may appear twice
    assert current_id not in seen_ids
    # each id must be strictly smaller than the one before it
    assert current_id < previous_id
    seen_ids.add(current_id)
    previous_id = current_id
print('Passed')

Passed


In [2]:
def get_tweets_and_labels(data):
    """Extract the labeled tweets and their binary labels from `data`.

    Args:
        data: iterable of message dicts; each is read via
            message['entities']['sentiment'] and message['body'].

    Returns:
        (tweets, labels): two lists of equal length. `tweets` holds the body
        of every message whose sentiment is not None; `labels` holds 0 for
        'Bearish' and 1 for 'Bullish', in the same order.

    Raises:
        ValueError: if a message's sentiment['basic'] is neither 'Bearish'
            nor 'Bullish'.
    """
    label_for = {'Bearish': 0, 'Bullish': 1}
    tweets = []
    labels = []
    # one pass building real lists, so the result does not depend on whether
    # map/filter return lists or lazy iterators
    for message in data:
        sentiment = message['entities']['sentiment']
        # filter out messages without a bullish/bearish tag
        if sentiment is None:
            continue
        basic = sentiment['basic']
        if basic not in label_for:
            raise ValueError('Got unexpected sentiment')
        tweets.append(message['body'])
        labels.append(label_for[basic])
    return tweets, labels

In [3]:
# get all tweets and labels available
tweets = []
labels = []
all_tweets = []
# sorted() pins the file order, which os.listdir does not guarantee; the
# extension check skips anything in ./data that is not a JSON dataset
for filename in sorted(os.listdir('./data')):
    if not filename.endswith('.json'):
        continue
    path = './data/%s' % filename
    with open(path, 'r') as f:
        data = json.load(f)
    # every message body, labeled or not
    all_tweets.extend(m['body'] for m in data)
    labeled_tweets, labeled_targets = get_tweets_and_labels(data)
    tweets.extend(labeled_tweets)
    labels.extend(labeled_targets)
assert len(tweets) == len(labels)
print('%d labeled examples extracted ' % len(tweets))

4227 labeled examples extracted 


In [4]:
def tfidf_matrix(tweets, all_tweets=None):
    """Return the TF-IDF transform of `tweets`.

    Args:
        tweets: documents to transform.
        all_tweets: optional documents to fit the vectorizer on instead of
            `tweets` (e.g. labeled plus unlabeled tweets, to learn vocab and
            tfidf weights from more text).

    Returns:
        Whatever TfidfVectorizer.transform returns for `tweets`.
    """
    vectorizer = TfidfVectorizer()
    # identity check: `!= None` compares elementwise on numpy arrays and then
    # fails inside the `if`
    corpus = tweets if all_tweets is None else all_tweets
    vectorizer.fit(corpus)
    return vectorizer.transform(tweets)

In [5]:
def train_svm(X, y):
    """Fit a LinearSVC (l2 penalty, hinge loss, C=1.0) on X and y; return the fitted model."""
    # alternative kept for reference: svm.SVC(C=1.0, kernel='rbf')
    hyperparams = dict(penalty='l2', loss='hinge', C=1.0)
    classifier = svm.LinearSVC(**hyperparams)
    classifier.fit(X, y)
    return classifier

In [14]:
X = tfidf_matrix(tweets, all_tweets)
y = np.array(labels)
print(X.shape)
print(y.shape)

N = X.shape[0]
# 90% of the examples go to training, the rest to testing
num_train = int(math.floor(N*0.9))
# seeded generator so the shuffle, and therefore the split and the scores
# computed from it, come out the same on every run
rng = np.random.RandomState(0)
P = rng.permutation(N)
train_idx, test_idx = P[:num_train], P[num_train:]
X_tr = X[train_idx]
y_tr = y[train_idx]
X_te = X[test_idx]
y_te = y[test_idx]
print('Training set size is %d' % num_train)
print('Percent bullish = %f%%' % (100*y.mean()))

(4227, 11691)
(4227,)
Training set size is 3804
Percent bullish = 76.271587%


In [247]:
# fit on the training split, then report mean accuracy on both splits
model = train_svm(X_tr, y_tr)
train_accuracy = model.score(X_tr, y_tr)
print('Training set accuracy = %f' % train_accuracy)
test_accuracy = model.score(X_te, y_te)
print('Test set accuracy = %f' % test_accuracy)

Training set accuracy = 0.915615
Test set accuracy = 0.825059
