In [71]:
import os
import sys
import subprocess
import json
import re
import datetime



In [72]:
def setEnv(spark_home):
    """Point this Python process at a Spark installation.

    Exports ``SPARK_HOME`` and places Spark's ``python`` directory and its
    bundled py4j source archive at the front of ``sys.path`` so that
    ``pyspark`` and ``py4j`` can be imported afterwards.

    Args:
        spark_home: Path to the root of the Spark installation.

    Raises:
        ValueError: If ``spark_home`` is empty or ``None``.
    """
    if not spark_home:
        raise ValueError('SPARK_HOME environment variable is not set')
    os.environ['SPARK_HOME'] = spark_home

    python_dir = os.path.join(spark_home, 'python')
    lib_dir = os.path.join(python_dir, 'lib')

    # The py4j archive name carries a version number, so look for whatever
    # archive this installation ships instead of assuming one version.
    py4j_archives = []
    if os.path.isdir(lib_dir):
        py4j_archives = sorted(
            name for name in os.listdir(lib_dir)
            if name.startswith('py4j-') and name.endswith('-src.zip')
        )
    if py4j_archives:
        py4j_path = os.path.join(lib_dir, py4j_archives[-1])
    else:
        # Nothing found: keep the path this helper has always used.
        py4j_path = os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip')

    # Re-running the cell must not stack duplicate entries, so drop any
    # existing occurrence before putting the entry back at the front.
    # Final order: py4j archive first, then the python directory.
    for entry in (python_dir, py4j_path):
        while entry in sys.path:
            sys.path.remove(entry)
        sys.path.insert(0, entry)

In [101]:
def getSC(master, appName='test', cores='10', mem='10g', driver_mem='5g'):
    """Create a SparkContext and a HiveContext bound to it.

    Args:
        master: Spark master URL to connect to.
        appName: Application name passed to the Spark configuration.
        cores: Value for ``spark.cores.max``.
        mem: Value for ``spark.executor.memory``.
        driver_mem: Value for ``spark.driver.memory``. Defaults to the
            ``'5g'`` this helper previously hard-coded.

    Returns:
        A ``(sc, sqlContext)`` tuple: the SparkContext and the HiveContext
        built on top of it.

    Raises:
        ValueError: If ``master`` is empty or ``None``.
    """
    if not master:
        raise ValueError('master is not set')

    # Coerce every setting to str so numeric arguments (e.g. cores=80) are
    # accepted as well as the string form.
    sparkEnv = {
        "spark.cores.max": str(cores),
        "spark.driver.memory": str(driver_mem),
        "spark.executor.memory": str(mem),
    }

    conf = SparkConf()
    conf.setMaster(master)
    conf.setAppName(appName)
    conf.setAll(list(sparkEnv.items()))

    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    return sc, sqlContext

In [74]:
# setEnv() must run first: it puts the Spark installation's python directory
# and py4j archive on sys.path so the pyspark / py4j imports below resolve
# against that installation.
setEnv('/opt/spark-1.4.3-bin-cdh4')
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from py4j.java_gateway import Py4JJavaError


In [75]:
# Connect to the Spark master on bi-hd03 and keep both handles: the
# SparkContext and the SQL context built on top of it.
sc, sqlContext = getSC(
    master='spark://bi-hd03:7077',
    appName='dataframe',
    cores='80',
    mem='10g',
)

# Import data from JSON

In [76]:
# Load the demo ratings file from HDFS into a DataFrame. No schema is passed,
# so the reader derives one from the JSON records themselves.
df = sqlContext.read.json("hdfs://bi-hd01:9000/user/bryan/ratings_demo.json")

In [77]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- item: string (nullable = true)
 |    |-- ranking: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [78]:
df.show()

+---+-------+---------+
| ID|payload|timestamp|
+---+-------+---------+
|001|[301,5]| 20151201|
|002|[301,4]| 20151202|
|003|[302,2]| 20151202|
|004|[303,5]| 20151203|
|005|[303,5]| 20151203|
+---+-------+---------+



# Play with JSON

In [127]:
# ID is a string column, so compare it with a quoted string literal. The
# unquoted form (ID=001) compares against a number and only matches through
# an implicit cast.
df.filter("ID = '001'").show()

+---+-------+---------+
| ID|payload|timestamp|
+---+-------+---------+
|001|[301,5]| 20151201|
+---+-------+---------+



In [150]:
# Flatten the nested payload struct: pull item and ranking up as top-level
# columns next to ID.
# NOTE(review): the alias 'raking' looks like a typo for 'ranking' - confirm
# before anything downstream depends on the column name.
df.select(df.ID, df.payload.item.alias('item'), df.payload.ranking.alias('raking')).show()

+---+----+------+
| ID|item|raking|
+---+----+------+
|001| 301|     5|
|002| 301|     4|
|003| 302|     2|
|004| 303|     5|
|005| 303|     5|
+---+----+------+



In [151]:
# Select a single nested field without an alias; the resulting column name is
# then generated from the field access rather than chosen by us.
df.select(df.payload.ranking).show()

+----------------+
|payload[ranking]|
+----------------+
|               5|
|               4|
|               2|
|               5|
|               5|
+----------------+



In [135]:
# Filter on a nested field using a SQL expression string.
# NOTE(review): payload.ranking is a string column in the inferred schema, so
# this numeric comparison relies on an implicit cast - consider casting
# explicitly if the data may contain non-numeric rankings.
df.filter("payload.ranking<3").show()

+---+-------+---------+
| ID|payload|timestamp|
+---+-------+---------+
|003|[302,2]| 20151202|
+---+-------+---------+



# Generate a DataFrame with array and map columns

In [92]:
from pyspark.sql import Row
from pyspark.sql.functions import explode
# Build a one-row DataFrame holding a scalar, a list and a dict so that the
# array and map columns can be exploded in the cells below.
eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"301": "5","302":"4"})])



In [93]:
eDF.show()

+-+--------------------+--------------------+
|a|             intlist|            mapfield|
+-+--------------------+--------------------+
|1|ArrayBuffer(1, 2, 3)|Map(301 -> 5, 302...|
+-+--------------------+--------------------+



In [94]:
eDF.printSchema()

root
 |-- a: long (nullable = true)
 |-- intlist: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- mapfield: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



# Amazing Explode

In [95]:
# explode() on the array column emits one output row per list element.
eDF.select(explode(eDF.intlist).alias("anInt")).show()

+-----+
|anInt|
+-----+
|    1|
|    2|
|    3|
+-----+



In [96]:
# Exploding the map column yields two columns per entry (key and value),
# which is why alias() is given two names here.
# NOTE(review): 'raking' looks like a typo for 'ranking' - confirm before
# renaming, since later cells display this column name.
eDF_ex = eDF.select(eDF.a, explode(eDF.mapfield).alias("item","raking"))

In [97]:
eDF_ex.printSchema()

root
 |-- a: long (nullable = true)
 |-- item: string (nullable = false)
 |-- raking: string (nullable = true)



In [98]:
eDF_ex.show()

+-+----+------+
|a|item|raking|
+-+----+------+
|1| 301|     5|
|1| 302|     4|
+-+----+------+



# Save to HDFS (as Parquet)

In [64]:
# Collapse to one partition so a single part file is written, and replace any
# previous output at the target path.
# NOTE(review): the target directory is named eDF.json although the data is
# written in Parquet format.
(eDF.repartition(1)
    .write
    .format('parquet')
    .mode('overwrite')
    .save("hdfs://bi-hd01:9000/user/bryan/eDF.json"))

In [65]:
!hadoop fs -ls /user/bryan/eDF.json

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/camus-example-0.1.0-SNAPSHOT-shaded.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/zookeeper/lib/slf4j-log4j12-1.6.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
Found 4 items
-rw-r--r--   2 bryan bryan          0 2015-12-02 19:32 /user/bryan/eDF.json/_SUCCESS
-rw-r--r--   2 bryan bryan        545 2015-12-02 19:32 /user/bryan/eDF.json/_common_metadata
-rw-r--r--   2 bryan bryan       1033 2015-12-02 19:32 /user/bryan/eDF.json/_metadata
-rw-r--r--   2 bryan bryan       1068 2015-12-02 19:32 /user/bryan/eDF.json/part-r-00000-68398d16-b54c-4a91-ba2d-e27c501c12b1.gz.parquet


In [66]:
# Read the Parquet directory written above back into a new DataFrame to check
# that the nested array and map columns survive the round trip.
eDF2 = sqlContext.read.parquet("hdfs://bi-hd01:9000/user/bryan/eDF.json")

In [67]:
eDF2.printSchema()

root
 |-- a: long (nullable = true)
 |-- intlist: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- mapfield: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [68]:
eDF2.show()

+-+--------------------+--------------------+
|a|             intlist|            mapfield|
+-+--------------------+--------------------+
|1|ArrayBuffer(1, 2, 3)|Map(301 -> 5, 302...|
+-+--------------------+--------------------+



In [99]:
# Shut down the SparkContext now that the walkthrough is finished.
sc.stop()