In [1]:
from cStringIO import StringIO

import fastavro

In [2]:
import pyspark

In [3]:
sc = pyspark.SparkContext()

URL normalization
---------------------

####Option 1: Upload pickle files into Spark

In [24]:
# load and distribute pickle files into spark
# create RDD
pickled_page_views = sc.pickleFile("../SuperWebAnalytics/master/page_view/")

In [38]:
pickled_page_views.take(1)

[{u'dataunit': {u'page_view': {u'nonce': 909462739,
    u'page': {u'url': u'http://mysite.com/'},
    u'person': {u'cookie': u'KLMNO'}}},
  u'pedigree': {u'true_as_of_secs': 1438381257}}]

####Option 2: Upload binary files into Spark

In [26]:
# load and distribute binary files into spark 
raw_page_views = sc.binaryFiles("../SuperWebAnalytics/master/page_view/*")

In [27]:
import fastavro
from cStringIO import StringIO

# transforms raw data into string object (?)
avro_page_views = raw_page_views.map(lambda t: StringIO(t[1]))

# deserializing avro data
json_page_views = avro_page_views.flatMap(fastavro.reader)

In [40]:
page_views = raw_page_views.flatMap(lambda x: x[1].split("\n")).filter(len)#.values().map(eval)

In [42]:
page_views.take(2)

['SEQ\x06!org.apache.hadoop.io.NullWritable"org.apache.hadoop.io.BytesWritable\x00\x00\x00\x00\x00\x00\x7f8\x11\xc3\x04\xe0\xa0\xa9\x0c\xeeXN\xf4"\x15o',
 'SEQ\x06!org.apache.hadoop.io.NullWritable"org.apache.hadoop.io.BytesWritable\x00\x00\x00\x00\x00\x00\x03\x8fP\xa3\xdc\x8b\xcc\xde\xa2\xbe\x85~\xf6\xf2S\x8e']

####This lab will use the RDD for pickle files 

In [35]:
from requests.utils import urlparse
def normalize_url(datum):
    pr = urlparse(datum['dataunit']['page_view']['page']['url'])
    datum['dataunit']['page_view']['page']['url'] = pr.hostname + pr.path
    return datum

###Explore normalize_url function

In [45]:
datum = pickled_page_views.take(1)

In [46]:
# isolate the data unit's content
datum[0]['dataunit']

{u'page_view': {u'nonce': 909462739,
  u'page': {u'url': u'http://mysite.com/'},
  u'person': {u'cookie': u'KLMNO'}}}

In [47]:
# isolate the page_view edge's content
datum[0]['dataunit']['page_view']

{u'nonce': 909462739,
 u'page': {u'url': u'http://mysite.com/'},
 u'person': {u'cookie': u'KLMNO'}}

In [48]:
# isolate the nest that contains the url
datum[0]['dataunit']['page_view']['page']

{u'url': u'http://mysite.com/'}

In [49]:
# isolate the url 
datum[0]['dataunit']['page_view']['page']['url']

u'http://mysite.com/'

In [53]:
from requests.utils import urlparse

In [56]:
# catagorize characters in url
urlparse(datum[0]['dataunit']['page_view']['page']['url'])

ParseResult(scheme=u'http', netloc=u'mysite.com', path=u'/', params='', query='', fragment='')

In [55]:
pr = urlparse(datum[0]['dataunit']['page_view']['page']['url'])

In [57]:
pr.hostname

u'mysite.com'

In [58]:
pr.path

u'/'

In [60]:
datum[0]['dataunit']['page_view']['page']['url'] = pr.hostname + pr.path

In [62]:
# This function re-formates the url 
# This output is returned by the function for each url passed in
datum[0]['dataunit']['page_view']['page']['url']

u'mysite.com/'

####Normalize url

In [36]:
normalized_page_views = pickled_page_views.map(normalize_url)

In [37]:
normalized_page_views.first()

{u'dataunit': {u'page_view': {u'nonce': 909462739,
   u'page': {u'url': u'mysite.com/'},
   u'person': {u'cookie': u'KLMNO'}}},
 u'pedigree': {u'true_as_of_secs': 1438381257}}

#####Unnormalized url

In [63]:
pickled_page_views.first()

{u'dataunit': {u'page_view': {u'nonce': 909462739,
   u'page': {u'url': u'http://mysite.com/'},
   u'person': {u'cookie': u'KLMNO'}}},
 u'pedigree': {u'true_as_of_secs': 1438381257}}

Deduplicate pageviews
---------------------

In [66]:
import simplejson as json
distinct_normalized_page_views = normalized_page_views.map(json.dumps)\ # serializes data
                                                      .distinct()\
                                                      .map(json.loads)  # deserializes data

In [65]:
distinct_normalized_page_views.first()

{'dataunit': {'page_view': {'nonce': 695934296,
   'page': {'url': 'mysite.com/blog'},
   'person': {'user_id': 3409}}},
 'pedigree': {'true_as_of_secs': 1438381343}}

Pageviews over time
---------------------

In [82]:
granularity = {'h': 60 * 60}
granularity['d'] = granularity['h'] * 24
granularity['w'] = granularity['d'] * 7
granularity['m'] = granularity['w'] * 4
granularity

{'d': 86400, 'h': 3600, 'm': 2419200, 'w': 604800}

####Explore how users are grouped into time blocks 
    ie. by hours, days, weeks, ...

In [68]:
print "There are {} seconds in 1 hour".format(60*60)

There are 3600 seconds in 1 hour


In [72]:
one_hr  = 3600 # seconds
ts = 1.5 * one_hr # let this be a ts for some user

In [73]:
ts % one_hr # this user is half way into the 2nd hour block 

1800.0

In [None]:
# 1 week hour range
# 1 hour (60 * 60)
# 1 day = 24 * 1 hour
# 1 week = 7 * 1 day
# fraction n * 1 week range = n * 1 week
# present time + n * 1 week

In [78]:
# hence, this user has been identified as being 
print 1438913491 - 1438913491 % (60*60)
print "This user has been identified as being {:.3} into an hour block".format(2800.0/3600)

1438912800
This user has been identified as being 0.778 into an hour block


In [83]:
def hourly(datum):
    url = datum['dataunit']['page_view']['page']['url']
    true_as_of_secs = datum['pedigree']['true_as_of_secs']
    return (url, 'h', true_as_of_secs - true_as_of_secs % granularity['h']), datum

In [94]:
datum[0]

{u'dataunit': {u'page_view': {u'nonce': 909462739,
   u'page': {u'url': u'mysite.com/'},
   u'person': {u'cookie': u'KLMNO'}}},
 u'pedigree': {u'true_as_of_secs': 1438381257}}

In [93]:
# the pedigree is reformated
hourly(datum[0]) # = (reformated pedigree, dataunit)

((u'mysite.com/', 'h', 1438380000),
 {u'dataunit': {u'page_view': {u'nonce': 909462739,
    u'page': {u'url': u'mysite.com/'},
    u'person': {u'cookie': u'KLMNO'}}},
  u'pedigree': {u'true_as_of_secs': 1438381257}})

####hourly function returns key_value tuple and datum

In [119]:
key_value ,dat = hourly(datum[0])

In [121]:
key_value

(u'mysite.com/', 'h', 1438380000)

In [122]:
dat

{u'dataunit': {u'page_view': {u'nonce': 909462739,
   u'page': {u'url': u'mysite.com/'},
   u'person': {u'cookie': u'KLMNO'}}},
 u'pedigree': {u'true_as_of_secs': 1438381257}}

In [84]:
# some how Spark knows to ignore the datum ?!?!?
hourly_rollup = distinct_normalized_page_views.map(hourly).countByKey()

In [111]:
hourly_rollup.items() # (key, value) pairs

[(('mysite.com/blog', 'h', 1438380000), 299527),
 (('mysite.com/', 'h', 1438380000), 299744)]

In [125]:
key, value = zip(*hourly_rollup.items()) # gives key value paris 

In [126]:
hourly_rollup.keys()

[('mysite.com/blog', 'h', 1438380000), ('mysite.com/', 'h', 1438380000)]

In [127]:
key # the keys 'mysite.com/blog' and 'mysite.com/' (with a timestampe)

(('mysite.com/blog', 'h', 1438380000), ('mysite.com/', 'h', 1438380000))

In [128]:
hourly_rollup.values()

[299527, 299744]

In [129]:
value # the values are the counts of the keys 

(299527, 299744)

In [130]:
hourly_rollup.keys()

[('mysite.com/blog', 'h', 1438380000), ('mysite.com/', 'h', 1438380000)]

In [131]:
import pandas as pd

####Display data in a table

In [132]:
pd.DataFrame({'count': c}, 
             index=pd.MultiIndex.from_tuples(i, names=('url', 'hour'))).reset_index()

ValueError: Length of names must match number of levels in MultiIndex.