In [1]:
# Soft-reset the interactive namespace without asking for confirmation.
reset -f -s

In [2]:
# List the variables in the interactive namespace (empty right after the reset).
whos

Interactive namespace is empty.


In [3]:
import json
import happybase
from os import listdir
import shutil

In [4]:
from data_gen import get_datum
import extract_data
from pyspark.sql.types import *
import pyspark

# Speed Layer

## HBase Storage

In [24]:
# Start HBase (absolute path to a local HBase 1.1.2 install)
! /Users/Alexander/hbase-1.1.2/bin/start-hbase.sh
# Start the HBase Thrift server as a daemon
! /Users/Alexander/hbase-1.1.2/bin/hbase-daemon.sh start thrift

starting master, logging to /Users/Alexander/hbase-1.1.2/bin/../logs/hbase-Alexander-master-Alexanders-MacBook-Pro.local.out
starting thrift, logging to /Users/Alexander/hbase-1.1.2/bin/../logs/hbase-Alexander-thrift-Alexanders-MacBook-Pro.local.out


In [23]:
# Stop HBase
! /Users/Alexander/hbase-1.1.2/bin/stop-hbase.sh
# Stop the HBase Thrift server daemon
! /Users/Alexander/hbase-1.1.2/bin/hbase-daemon.sh stop thrift

stopping hbase.....................
stopping thrift.


In [25]:
# Open a happybase connection to the HBase instance running on this machine.
connection = happybase.Connection(host='localhost')

In [17]:
# # delete all tables
# for name in table_names:
#     connection.disable_table(name)
#     connection.delete_table(name)

In [18]:
# family = {
#         'd': dict()   
# }

In [19]:
# # Only needs to be run once
# connection.create_table('user_name', family)
# connection.create_table('gender', family)
# connection.create_table('location', family)
# connection.create_table('review_edge', family)
# connection.create_table('purchase_edge', family)
# connection.create_table('item_name', family)

In [6]:
# Names of the HBase tables this notebook reads, one per kind of fact.
fact_labels = [
    "user_name",
    "gender",
    "location",
    "purchase_edge",
    "review_edge",
    "item_name",
]

In [5]:
# Count the rows currently stored in the "item_name" table by scanning it.
table = connection.table("item_name")
# The generator keeps its loop variable local, so the scan does not rebind
# the notebook-level `_`.
row_count = sum(1 for _ in table.scan())
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(row_count)
    

## Load data from HBase into RDDs

In [26]:
def get_userName_data(table):
    '''Scan ``table`` and return a list with one ``(row_key, name, ts)`` tuple per row.

    ``name`` and ``ts`` are read from the ``d:name`` and ``d:ts`` columns; a row
    missing either column raises KeyError.
    '''
    rows = []
    for row_key, columns in table.scan():
        rows.append((row_key, columns["d:name"], columns["d:ts"]))
    return rows

In [27]:
def get_userGender_data(table):
    '''Scan ``table`` and return a list with one ``(row_key, gender, ts)`` tuple per row.

    ``gender`` and ``ts`` are read from the ``d:gender`` and ``d:ts`` columns; a
    row missing either column raises KeyError.
    '''
    rows = []
    for row_key, columns in table.scan():
        rows.append((row_key, columns["d:gender"], columns["d:ts"]))
    return rows

In [28]:
def get_userLocation_data(table):
    '''Scan ``table`` and return one ``(row_key, city, state, country, ts)`` tuple per row.

    The values are read from the ``d:city``, ``d:state``, ``d:country`` and
    ``d:ts`` columns; a row missing any of them raises KeyError.
    '''
    rows = []
    for row_key, columns in table.scan():
        rows.append((
            row_key,
            columns["d:city"],
            columns["d:state"],
            columns["d:country"],
            columns["d:ts"],
        ))
    return rows

In [29]:
def get_purchaseEdge_data(table):
    '''Scan ``table`` and return a list with one ``(row_key, item_id, ts)`` tuple per row.

    ``item_id`` and ``ts`` are read from the ``d:item_id`` and ``d:ts`` columns.
    '''
    # Output layout: userId, itemId, ts
    edges = []
    for row_key, columns in table.scan():
        edges.append((row_key, columns["d:item_id"], columns["d:ts"]))
    return edges

In [30]:
def get_ratingEdge_data(table):
    '''Scan ``table`` and return one ``(row_key, item_id, rating, ts, review)`` tuple per row.

    The values are read from the ``d:item_id``, ``d:rating``, ``d:ts`` and
    ``d:review`` columns.
    '''
    # Output layout: userId, itemId, rating, ts, review
    edges = []
    for row_key, columns in table.scan():
        edges.append((
            row_key,
            columns["d:item_id"],
            columns["d:rating"],
            columns["d:ts"],
            columns["d:review"],
        ))
    return edges

In [31]:
def get_itemName_data(table):
    '''Scan ``table`` and return a list with one ``(row_key, name, ts)`` tuple per row.

    ``name`` and ``ts`` are read from the ``d:name`` and ``d:ts`` columns.
    '''
    # Output layout: itemId, name, ts
    items = []
    for row_key, columns in table.scan():
        items.append((row_key, columns["d:name"], columns["d:ts"]))
    return items

In [77]:
# Take the fact labels from the table names the HBase connection reports.
fact_labels = connection.tables()

### Transfer Data from HBase to Spark RDDs

In [4]:
# Create the Spark context and, on top of it, the Hive-enabled SQL context
# used for the DataFrame and SQL cells.
sc = pyspark.SparkContext()
sqlContext = pyspark.HiveContext(sc)

In [34]:
def get_current_property(values):
    '''Return the most recent record from a group of timestamped property records.

    ``values`` is an iterable of sequences whose LAST element is the timestamp
    and whose preceding elements are the property fields, for example
    ``(name, ts)`` or ``(city, state, country, ts)``.

    Returns the record with the largest timestamp as a tuple
    ``(field_1, ..., field_n, ts)``. Records may carry any number of fields.
    When several records share the largest timestamp, the one that appears
    last in ``values`` wins.

    Timestamps are compared with the ordinary comparison operators, so string
    timestamps are ordered lexicographically.

    Raises ValueError if ``values`` is empty.
    '''
    latest = None
    for prop in values:
        # ``>=`` (not ``>``) so a later record with an equal timestamp replaces
        # an earlier one.
        if latest is None or prop[-1] >= latest[-1]:
            latest = prop

    if latest is None:
        raise ValueError("get_current_property() requires at least one record")

    return tuple(latest)

In [98]:
    # Every column in every schema is a nullable string, so each schema is
    # built from its ordered tuple of column names.
    schema_gender = StructType(list(
        StructField(column, StringType(), True)
        for column in ('user_id', 'gender', 'timestamp')))

    schema_location = StructType(list(
        StructField(column, StringType(), True)
        for column in ('user_id', 'city', 'state', 'country', 'timestamp')))

    schema_name = StructType(list(
        StructField(column, StringType(), True)
        for column in ('user_id', 'name', 'timestamp')))

    schema_review = StructType(list(
        StructField(column, StringType(), True)
        for column in ('user_id', 'item_id', 'rating', 'timestamp', 'review')))

    schema_item_name = StructType(list(
        StructField(column, StringType(), True)
        for column in ('item_id', 'name', 'timestamp')))

    schema_purchase = StructType(list(
        StructField(column, StringType(), True)
        for column in ('user_id', 'item_id', 'timestamp')))

In [36]:
def get_names():
    '''Return an RDD of ``(user_id, name, ts)`` with each user's most recent name.

    Reads the "user_name" HBase table, groups the rows by user id and keeps,
    per user, the record selected by get_current_property.
    '''
    table = connection.table("user_name")
    user_name = sc.parallelize(get_userName_data(table))

    # Index-based lambdas: tuple parameters (``lambda (a, b): ...``) are
    # Python 2-only syntax, while indexing works on both Python 2 and 3.
    normalized_names = user_name.map(lambda row: (row[0], (row[1], row[2])))\
                                .groupByKey()\
                                .mapValues(get_current_property)\
                                .map(lambda line: (line[0], line[1][0], line[1][1]))
    return normalized_names

In [37]:
def get_genders():
    '''Return an RDD of ``(user_id, gender, ts)`` with each user's most recent gender.

    Reads the "gender" HBase table, groups the rows by user id and keeps, per
    user, the record selected by get_current_property.
    '''
    table = connection.table("gender")
    gender = sc.parallelize(get_userGender_data(table))

    # Index-based lambdas: tuple parameters (``lambda (a, b): ...``) are
    # Python 2-only syntax, while indexing works on both Python 2 and 3.
    normalized_genders = gender.map(lambda row: (row[0], (row[1], row[2])))\
                               .groupByKey()\
                               .mapValues(get_current_property)\
                               .map(lambda line: (line[0], line[1][0], line[1][1]))
    return normalized_genders

In [38]:
def get_locations():
    '''Return an RDD of ``(user_id, city, state, country, ts)`` per user.

    Reads the "location" HBase table, groups the rows by user id and keeps,
    per user, the record selected by get_current_property.
    '''
    table = connection.table("location")
    location = sc.parallelize(get_userLocation_data(table))

    # Index-based lambdas: tuple parameters (``lambda (a, b): ...``) are
    # Python 2-only syntax, while indexing works on both Python 2 and 3.
    normalized_locations = location.map(lambda row: (row[0], (row[1], row[2], row[3], row[4])))\
                                   .groupByKey()\
                                   .mapValues(get_current_property)\
                                   .map(lambda line: (line[0], line[1][0], line[1][1], line[1][2], line[1][3]))
    return normalized_locations

In [39]:
def get_review_edges():
    '''Return an RDD built from every row of the "review_edge" HBase table.

    Each element is the ``(row_key, item_id, rating, ts, review)`` tuple
    produced by get_ratingEdge_data.
    '''
    table = connection.table("review_edge")
    review_edge = sc.parallelize(get_ratingEdge_data(table))
    # Return the RDD explicitly; without this the function yields None.
    return review_edge

In [40]:
def get_purchase_edges():
    '''Return an RDD built from every row of the "purchase_edge" HBase table.

    Each element is the ``(row_key, item_id, ts)`` tuple produced by
    get_purchaseEdge_data.
    '''
    table = connection.table("purchase_edge")
    purchase_edge = sc.parallelize(get_purchaseEdge_data(table))
    # Return the RDD explicitly; without this the function yields None.
    return purchase_edge

In [41]:
def get_item_names():
    '''Return an RDD built from every row of the "item_name" HBase table.

    Each element is the ``(row_key, name, ts)`` tuple produced by
    get_itemName_data.
    '''
    table = connection.table("item_name")
    item_name = sc.parallelize(get_itemName_data(table))
    # Return the RDD explicitly; without this the function yields None.
    return item_name

## Run for updated batches

In [111]:
# Rebuild every RDD from the current contents of the HBase tables.
names = get_names()
genders = get_genders()
locations = get_locations()
review_edges = get_review_edges()
# Singular name: the purchase DataFrame cell reads `purchase_edge`.
purchase_edge = get_purchase_edges()
item_names = get_item_names()

In [112]:
# Sanity check: number of users that have a current name.
names.count()

914

In [74]:
# Peek at a few (user_id, name, ts) records.
names.take(3)

[('136639652273216189_2015_17_53', 'Perry Grace', '1445131894'),
 ('136639652273185269_2015_17_53', 'Derrick Derk', '1445375857'),
 ('136639652273111628_2015_17_53', 'Ferne Johnny', '1444956050')]

In [79]:
# Show the current fact labels.
fact_labels

['gender',
 'item_name',
 'location',
 'purchase_edge',
 'review_edge',
 'user_name']

In [84]:
# Reset the fact labels to a fixed, hand-picked order.
fact_labels = [
    'user_name',
    'gender',
    'location',
    'purchase_edge',
    'review_edge',
    'item_name',
]

In [90]:
# DataFrame of the current gender per user; columns: user_id, gender, timestamp
gender = sqlContext.createDataFrame(genders, schema_gender)

In [91]:
# DataFrame of the current location per user;
# columns: user_id, city, state, country, timestamp
location = sqlContext.createDataFrame(locations, schema_location)

In [94]:
# DataFrame of the current name per user; columns: user_id, name, timestamp
# Built from `names` (the result of get_names()): `user_name` exists only as a
# local variable inside get_names() and is not defined at notebook scope.
userName = sqlContext.createDataFrame(names, schema_name)

In [95]:
# DataFrame of reviews; columns: user_id, item_id, rating, timestamp, review
# Reads `review_edges`, the notebook-level name assigned from
# get_review_edges(); `review_edge` is never assigned at notebook scope.
review = sqlContext.createDataFrame(review_edges, schema_review)

In [96]:
# DataFrame of item names; columns: item_id, name, timestamp
# Reads `item_names`, the notebook-level name assigned from get_item_names();
# `item_name` is never assigned at notebook scope.
itemName = sqlContext.createDataFrame(item_names, schema_item_name)

In [3]:
# DataFrame of purchases; columns: user_id, item_id, timestamp
purchase = sqlContext.createDataFrame(purchase_edge, schema_purchase)

In [100]:
# Make the gender DataFrame queryable from SQL as table "gender".
gender.registerTempTable("gender")

In [101]:
# Make the location DataFrame queryable from SQL as table "location".
location.registerTempTable("location")

In [102]:
# Make the userName DataFrame queryable from SQL as table "user_name".
userName.registerTempTable("user_name")

In [103]:
# Make the review DataFrame queryable from SQL as table "review".
review.registerTempTable("review")

In [104]:
# Make the itemName DataFrame queryable from SQL as table "item_name".
itemName.registerTempTable("item_name")

In [101]:
# Make the purchase DataFrame queryable from SQL as table "purchase".
purchase.registerTempTable("purchase")

In [106]:
# Per-user profile: name, gender and location, sorted by name in descending
# order. The FULL OUTER JOINs keep users that lack a gender and/or a location
# (those columns come back as null).
# NOTE(review): both joins key on user_name.user_id, so a row that exists only
# in `gender` (no user_name row) cannot be matched to its location -- confirm
# this is acceptable.
sqlContext.sql("""SELECT user_name.name, gender.gender, location.city,location.state,location.country
                  FROM user_name
                  FULL OUTER JOIN gender
                  ON user_name.user_id = gender.user_id
                  FULL OUTER JOIN location
                  ON user_name.user_id = location.user_id
                  ORDER BY user_name.name
                  desc
                  """).show()

+-----------------+------+--------------+---------+-------+
|             name|gender|          city|    state|country|
+-----------------+------+--------------+---------+-------+
|    Zulema Torrey|  MALE|          null|     null|   null|
|   Zsazsa Heloise|  null|         44040|     Ohio|    USA|
|    Zonda Sigfrid|  MALE|          null|     null|   null|
|   Zitella Briana|  MALE|         63951| Missouri|    USA|
|       Zenia Lena|  MALE|         77624|    Texas|    USA|
|   Zahara Cherice|  null|   Grangeville|    Idaho|    USA|
|      Zack Deanne|FEMALE|          null|     null|   null|
|       Zack Arlyn|  null|         37338|Tennessee|    USA|
|        Yves Elna|  null|          null|     null|   null|
| Yovonnda Emeline|  null|         52630|     Iowa|    USA|
|    Yolane Traver|FEMALE|         85614|  Arizona|    USA|
|       Yoko Mayer|  null|          null|     null|   null|
|       Yard Dylan|  MALE|   Yuma County|  Arizona|    USA|
|  Yance Georgiana|  null|          null

In [1]:
# Per (country, state, city): number of distinct users and number of purchases,
# ordered by state. The LEFT JOIN keeps locations whose users made no
# purchases; COUNT(purchase.item_id) skips the resulting nulls, so such
# locations report 0 purchases.
sqlContext.sql("""SELECT location.country,location.state, location.city, 
                  COUNT (distinct location.user_id) AS users,
                  COUNT(purchase.item_id) AS purchases
                  FROM location
                  LEFT JOIN purchase
                  ON location.user_id = purchase.user_id
                  GROUP by location.country, location.state, location.city
                  ORDER BY location.state
                  """).show()