In [1]:
import requests
import requests.auth
import datetime as dt
import time

import pandas as pd

# Pushshift API wrapper; `api` is the shared client object used by the query cells
from psaw import PushshiftAPI
api = PushshiftAPI()

# pretty-printer configured with a 4-space indent
import pprint
pp = pprint.PrettyPrinter(indent=4)

# natural language processing libraries
# The nltk.download calls fetch the named corpora/models into the local
# nltk_data directory (presumably required by the TextBlob analyzer and
# noun-phrase extractors used later -- TODO confirm which cell needs which).
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.probability import FreqDist
nltk.download('brown')
nltk.download('conll2000')

# textblob imports
from textblob import Blobber
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import FastNPExtractor
# `tb` is a Blobber configured with a NaiveBayesAnalyzer instance; calling
# tb(text) is how later cells build blobs for sentiment scoring
tb = Blobber(analyzer=NaiveBayesAnalyzer())

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/shujinkou/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shujinkou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/shujinkou/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/shujinkou/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


# Pushshift API wrapper query
* automatic rate limiting
* faster queries, since it doesn't query reddit's servers directly
* better searching

### Pushshift submission API request

In [3]:
# Query submissions from the bitcoinmarkets subreddit, oldest first, starting
# at 2013-01-01 and capped at 100000 results. `gen` is a generator: nothing is
# materialised until it is iterated.
# Note: the datetime below is naive, so .timestamp() interprets it in the
# machine's local timezone.
# The original cell carried a disabled option,
#   filter=['url', 'author', 'title', 'subreddit']
# described as choosing which dictionary keys to display.
start_epoch = int(dt.datetime(2013, 1, 1).timestamp())
gen = api.search_submissions(
    subreddit='bitcoinmarkets',  # the subforum to search
    after=start_epoch,
    sort='asc',
    limit=100000,
)

In [4]:
# Drain the generator into a list of plain dictionaries.
# Each submission object exposes its dictionary form through the `d_` attribute.
results = [submission.d_ for submission in gen]

In [5]:
# Tabulate the submission dictionaries and persist them as CSV and JSON.
df = pd.DataFrame(results)
df.to_csv('bitcionmarkets_threads.csv')
# The original call passed an empty string as the target path, which names no
# file to write. Use a real file name that shares the CSV's stem (the stem's
# spelling is kept as-is so both exports sit side by side).
df.to_json('bitcionmarkets_threads.json')

### Pushshift comment API request

In [4]:
# searching for comments in the bitcoinmarkets subreddit

def query_comments(search, max_response_cache, subreddit='bitcoinmarkets', client=None):
    """ Finds comments within a particular subreddit containing a phrase

    Keyword arguments:
    search (string) -- phrase to search for
    max_response_cache (int or None) -- maximum number of comments to retrieve;
        None retrieves every result the query yields, and a value <= 0
        returns an empty list without issuing a query
    subreddit (string) -- subreddit to search (default 'bitcoinmarkets')
    client -- object exposing search_comments(q=..., subreddit=...); defaults
        to the notebook-level `api` client

    Returns a list with the `d_` dictionary of each retrieved comment, in the
    order the query yields them.
    """
    unlimited = max_response_cache is None

    # Checking the cap only after the first append would hand back one comment
    # even when zero (or fewer) were requested, so short-circuit here.
    if not unlimited and max_response_cache <= 0:
        return []

    source = api if client is None else client
    gen = source.search_comments(q=search, subreddit=subreddit)
    cache = []

    for c in gen:
        cache.append(c.d_)

        # Stop pulling from the generator as soon as the cap is reached so no
        # extra results are requested.
        if not unlimited and len(cache) >= max_response_cache:
            break

    return cache

In [3]:
# Fetch up to 100000 comments for each search term, then report how many
# came back (one count per line: bullish first, bearish second).
bullish = query_comments(search="bullish", max_response_cache=100000)
bearish = query_comments(search="bearish", max_response_cache=100000)
print(len(bullish), len(bearish), sep='\n')

25400
15797


### Natural Language Processing. Sentiment and noun phrase extraction

In [2]:
def get_sentiment(comment):
    """ Returns the TextBlob sentiment analysis of a comment's body

    Keyword arguments:
    comment (dict) -- element from a query_comments array; only its 'body'
        entry is read

    The blob is built with TextBlob(...) directly, without passing an
    analyzer argument, and its `sentiment` attribute is returned.
    """
    body_text = comment['body']
    blob = TextBlob(body_text)
    return blob.sentiment

In [13]:
from multiprocessing.dummy import Pool as ThreadPool

# Score every bullish comment with the `tb` blobber, spreading the calls over
# a pool of worker threads. multiprocessing.dummy exposes the Pool API on top
# of threads (not processes), so this parallelises the calls but does not by
# itself put every CPU core to work.
# The `with` block releases the pool's workers even if one of the calls raises
# (for example on a comment without a 'body' key); pool.map blocks until every
# result is available, so nothing is cut short when the block exits.
with ThreadPool() as pool:
    bullish_sentiments = pool.map(lambda x: tb(x['body']).sentiment, bullish)

In [14]:
# keep only the p_pos field of each sentiment result, in the same order
bullish_polarity = [sentiment.p_pos for sentiment in bullish_sentiments]

In [69]:
import numpy as np
# argmin gives the position of the smallest value in bullish_polarity;
# print the body of the bullish comment at that position
print(bullish[np.asarray(bullish_polarity).argmin()]['body'])

I'm insanely bullish about the price in the medium term and I'd love to leverage those gains up. But im a bit thick and even if I wasnt I dont think low leverage is a good idea at the moment. 60/40 seems crazy irresponsible. But I can get a lot of Yolos for that price even with randomly selected my way in I feel like I'm much more likely to come right. One random shot every week for the next few months at say 50 times seems very likely to come right at least once, and maybe right a lot of times. Of course the price actually needs to rise, but if it drops for months low leverage is dead anyway. I'm making excuses for greed, but I wonder if there is not a way to make this about as safe as holding alone. Say 10 percent fiat ashedge/rebut at the bottom. And a random leveraged punt every week for a year. So much

Upside, and the odds seem good? I'm a bit clueless though. Can still get stopped out I guess. What are the odds of Getting unlucky is the price is rising at a good rate? If I was d

In [67]:
# Render the created_utc epoch value of the last bearish comment as a date
# string. fromtimestamp() is called without a tz argument, so the result is
# expressed in the machine's local timezone.
format(dt.datetime.fromtimestamp(bearish[-1]['created_utc']), "%m/%d/%Y, %H:%M:%S")

'04/21/2013, 12:01:00'

### example noun phrase extraction
* This can be used for a bubble word-frequency chart, or a co-occurrence graph

In [5]:
# fetch at most 1000 comments that match the search term "FOMO"
fomo = query_comments(search="FOMO", max_response_cache=1000)

In [6]:
# Concatenate the body text of every comment into one space-separated string
# so TextBlob can process it in a single pass.
all_fomo = ' '.join(comment['body'] for comment in fomo)

In [8]:
# TextBlob offers two noun-phrase extractors: FastNPExtractor and ConllExtractor.
# NOTE(review): the earlier note here described Conll as the faster, less
# accurate model; that trade-off is not verified in this notebook -- confirm
# against the TextBlob documentation before relying on it.
from textblob.np_extractors import ConllExtractor
extractor = ConllExtractor()  # swap in FastNPExtractor() to try the other model

# tb_noun is a "blobber": calling it on a string builds a TextBlob that uses
# the extractor chosen above
tb_noun = Blobber(np_extractor=extractor)

# Count the noun phrases, keep the 50 most common as (phrase, count) rows,
# label the two columns, and write one JSON record per row for the bubble chart.
df = (
    pd.DataFrame(FreqDist(tb_noun(all_fomo).noun_phrases).most_common(50))
    .rename(columns={0: "word", 1: "frequency"})
)
df.to_json('./bubble_chart/data/word_frequency.json', orient="records")