In [80]:
# Bootstrap step: findspark.init() is called before any pyspark import in this
# notebook. Per the findspark documentation it locates the Spark installation
# and makes pyspark importable (presumably via SPARK_HOME / sys.path -- confirm
# for the local environment).
import findspark
findspark.init()

In [117]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date
import time
from IPython.core.display import display, HTML
from string import Template
import json
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
# Plotly account credentials. They are taken from the environment so a real API
# key never has to be saved inside the notebook; when the variables are unset
# the original literal values are used, so existing behaviour is unchanged.
import os
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'poying'),
    api_key=os.environ.get('PLOTLY_API_KEY', '--')
)

def date_converter(d):
    """Turn a date into a POSIX timestamp; pass anything else through.

    Used below as the ``default`` hook of ``json.dumps``.

    Args:
        d: any value. ``datetime.date`` instances (including ``datetime``
            subclasses) are converted, every other value is returned as is.

    Returns:
        ``time.mktime`` of the value's time tuple (seconds, interpreted in
        the local time zone) for dates, otherwise ``d`` unchanged.
    """
    if not isinstance(d, date):
        return d
    return time.mktime(d.timetuple())

# Create (or reuse, via getOrCreate) a Spark session with master "local[4]"
# and the driver memory setting "4g".
spark = (
    SparkSession.builder
    .master("local[4]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [6]:
# Load every file matching ../data/*.parquet into one DataFrame. No format is
# passed to load(), so Spark's default data source is used (parquet unless
# spark.sql.sources.default has been overridden).
df = spark.read.load("../data/*.parquet")

In [7]:
def vout_types(li):
    """Return the distinct ``"type"`` values found in a list of outputs.

    Args:
        li: iterable of mapping-like entries that can be indexed with
            ``"type"`` (presumably the rows of the ``vout`` column -- confirm
            against the parquet schema). ``None`` is accepted and treated as
            "no outputs", because a null column value reaches a Python UDF
            as ``None`` and iterating it would otherwise raise ``TypeError``.

    Returns:
        A list containing each type exactly once, in no particular order.
    """
    # A null input contributes no types instead of failing the whole job.
    if li is None:
        return []
    return list(set(entry["type"] for entry in li))

def tx_date(s):
    """Convert a POSIX timestamp into a calendar date.

    Args:
        s: timestamp as accepted by ``date.fromtimestamp``.

    Returns:
        The corresponding ``datetime.date``; ``date.fromtimestamp`` resolves
        it in the local time zone of the machine running the code.
    """
    day = date.fromtimestamp(s)
    return day

def count_types(types):
    """Tally how often each item occurs across a list of lists.

    Args:
        types: iterable of iterables of hashable items.

    Returns:
        A dict mapping every item to its total number of occurrences over
        all inner iterables.
    """
    tally = {}
    # Walk the nested structure as one flat stream of items.
    for name in (item for group in types for item in group):
        if name in tally:
            tally[name] += 1
        else:
            tally[name] = 1
    return tally

# Reduce the raw frame to one row per date carrying a {type: count} map.
# The steps are applied in the same order as before, written as one chain.
df = (
    df
    # Distinct types of each row's "vout" column.
    .withColumn("vout_types", udf(vout_types, returnType=ArrayType(StringType()))("vout"))
    # Calendar date derived from "received_time".
    .withColumn("date", udf(tx_date, returnType=DateType())("received_time"))
    # Gather every row's type list under its date.
    .groupBy("date")
    .agg(collect_list("vout_types").alias("vout_types_nested"))
    .select("date", "vout_types_nested")
    # Collapse the nested lists into per-type counts.
    .withColumn("types", udf(count_types, returnType=MapType(StringType(), IntegerType()))("vout_types_nested"))
    .select("date", "types")
)
# Show the first aggregated row as the cell output.
df.head()

Row(date=datetime.date(2018, 9, 9), types={u'witness_v0_keyhash': 13282, u'witness_v0_scripthash': 470, u'nulldata': 7212, u'scripthash': 59246, u'pubkeyhash': 83032, u'nonstandard': 21, u'multisig': 3, u'pubkey': 138})

In [8]:
# Materialise the DataFrame on the driver; collect() returns a Python list of
# Row objects, which the following cells serialise to JSON.
data = df.collect()

In [97]:
# JavaScript meant to run in the browser. It reshapes the collected rows into
# plot-ready series and assigns them back into the Python kernel as `dates`,
# `typeNames` and `types` through IPython.notebook.kernel.execute:
#   - rows are sorted ascending by their first element,
#   - `dates` are JS Date objects built from that element times 1000,
#   - `typeNames` is the de-duplicated union of the keys of each row's second
#     element,
#   - `types` holds one series per type name, with 0 where a row lacks it.
# NOTE(review): `IPython.notebook` looks like the classic Notebook front-end
# API, so the three kernel variables presumably only exist after the browser
# has executed this cell's output -- confirm before relying on Run All or on
# headless execution of the notebook.
js_text_template = Template('''
{
let data = $python_data;
data = data.sort((a, b) => a[0] - b[0]);

const dates = data.map(d => new Date(d[0] * 1000));

let typeNames = data
  .map(d => Object.keys(d[1]))
  .reduce((a, b) => a.concat(b), []);
typeNames = Array.from(new Set(typeNames));

const types = typeNames.map(typeName => data.map(d => d[1][typeName] || 0));

IPython.notebook.kernel.execute('dates=' + JSON.stringify(dates));
IPython.notebook.kernel.execute('typeNames=' + JSON.stringify(typeNames));
IPython.notebook.kernel.execute('types=' + JSON.stringify(types));

}
''')

# Minimal HTML wrapper that places the generated JavaScript in a <script> tag.
html_template = Template('''
<script> $js_text </script>
''')

# Serialise `data` to JSON and splice it into the script. date_converter is
# json's fallback for values it cannot encode itself and turns dates into
# timestamps.
js_text = js_text_template.substitute({'python_data': json.dumps(data, default=date_converter)})
# The HTML object is the cell's output; displaying it is presumably what makes
# the front end run the script.
HTML(html_template.substitute({'js_text': js_text}))

In [118]:
# One line trace per type: shared x axis (dates), that type's series on y,
# labelled with the type name found at the same index in typeNames.
lines = []
for index, series in enumerate(types):
    trace = go.Scatter(x=dates, y=series, mode='lines', name=typeNames[index])
    lines.append(trace)

# Send the traces to Plotly under the name 'line-mode' and render the plot
# inline as the cell output.
py.iplot(lines, filename='line-mode')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~poying/0 or inside your plot.ly account where it is named 'line-mode'
