In [1]:
import pyspark
sc = pyspark.SparkContext() 
print sc

<pyspark.context.SparkContext object at 0x104041b10>


In [2]:
sqlContext = pyspark.HiveContext(sc)
print sqlContext

<pyspark.sql.context.HiveContext object at 0x104041890>


####Instantiate a `HiveContext()` and load the Yelp business data in using `jsonFile()`. 

In [3]:

yelp_business_schema_rdd =\
sqlContext.jsonFile('s3n://AKIAIBZEDBZIIV7PUW5Q:D6hXJTLH6B6SIv3ZYBRKuTgHQL23CLMthPmNl8EC@sparkdatasets/yelp_academic_dataset_business.json')



####Register the `yelp_business_schema_rdd` as a table named `yelp_business`.

In [4]:
from pyspark.sql.types import *

In [5]:
# yelp_business_schema_rdd
yelp_business_schema_rdd.registerTempTable('yelp_business')

In [6]:
# cache table to avoid long loadings from disk
sqlContext.cacheTable('yelp_business');

In [7]:
yelp_business_schema_rdd.columns

['attributes',
 'business_id',
 'categories',
 'city',
 'full_address',
 'hours',
 'latitude',
 'longitude',
 'name',
 'neighborhoods',
 'open',
 'review_count',
 'stars',
 'state',
 'type']

Write a query that returns the `name` of entries that fulfill the following conditions:
   - Rated at 5 `stars`
   - In the city of Phoenix
   - Accepts credit card (Reference the `Accept Credit Card` field by ````attributes.`Accepts Credit Cards`````
   - And is under the `Restaurants` category

###What does this statement do?
    attributes.`Accepts Credit Cards`
    
    when I select attributes, Accepts Credit Cards does not appear as an element
    Is Accepts Credit Cards an alias?
    If so, why do I have to use the ` symbol to declare the alias??????? 

In [8]:
sqlContext.sql("""select name,cate, attributes.`Accepts Credit Cards`, city, stars
                  from yelp_business
                  LATERAL VIEW explode(categories) c as cate
                  where stars = 5 
                  AND city = 'Phoenix'
                  AND cate = 'Restaurants'
                  AND attributes.`Accepts Credit Cards` = 'true'
                  """).show()

+--------------------+-----------+--------------------+-------+-----+
|                name|       cate|Accepts Credit Cards|   city|stars|
+--------------------+-----------+--------------------+-------+-----+
|       Auslers Grill|Restaurants|                true|Phoenix|  5.0|
|Mulligan's Restau...|Restaurants|                true|Phoenix|  5.0|
|             Sunfare|Restaurants|                true|Phoenix|  5.0|
|              Subway|Restaurants|                true|Phoenix|  5.0|
|           Lil Cal's|Restaurants|                true|Phoenix|  5.0|
|                Ed's|Restaurants|                true|Phoenix|  5.0|
|Frenchys Caribbea...|Restaurants|                true|Phoenix|  5.0|
|           WY Market|Restaurants|                true|Phoenix|  5.0|
|       Pollo Sabroso|Restaurants|                true|Phoenix|  5.0|
|Queen Creek Olive...|Restaurants|                true|Phoenix|  5.0|
|Gluten Free Creat...|Restaurants|                true|Phoenix|  5.0|
|Panini Bread and ..

##Part 2: Spark and SparkSQL in Practice 

Now we have a basic knowledge of how SparkSQL works, let's try dealing with a real-life scenario where some data manipulation is required in a regular Spark RDD before querying the data with SparkSQL.

<br>

Load the `user` and `transaction` datasets into 2 separate RDDs with the following code. 

   ```python
   user_rdd = sc.textFile('s3n://[YOUR ACCESS_KEY]:[YOUR SECRET_KEY]@sparkdatasets/users.txt')
   transaction_rdd = sc.textFile('s3n://[YOUR ACCESS_KEY]:[YOUR SECRET_KEY]@sparkdatasets/transactions.txt')
   ```

In [15]:
user_rdd = sc.textFile('s3n://AKIAIBZEDBZIIV7PUW5Q:D6hXJTLH6B6SIv3ZYBRKuTgHQL23CLMthPmNl8EC@sparkdatasets/users.txt')

In [16]:
transaction_rdd = sc.textFile('s3n://AKIAIBZEDBZIIV7PUW5Q:D6hXJTLH6B6SIv3ZYBRKuTgHQL23CLMthPmNl8EC@sparkdatasets/transactions.txt')

In [17]:
user_rdd.cache

<bound method RDD.cache of MapPartitionsRDD[21] at textFile at NativeMethodAccessorImpl.java:-2>

In [18]:
transaction_rdd.cache

<bound method RDD.cache of MapPartitionsRDD[23] at textFile at NativeMethodAccessorImpl.java:-2>

In [13]:
#NOTE: Some users have multiple phone numbers and multiple emails
# This must be delt with in the make rows function
user_rdd.take(50)

[u'1106214172;Prometheus Barwis;prometheus.barwis@me.com;(533) 072-2779',
 u'527133132;Ashraf Bainbridge;ashraf.bainbridge@gmail.com;',
 u'1290614884;Alain Hennesey;alain.hennesey@facebook.com,alain.hennesey@me.com;(942) 208-8460,(801) 938-2376',
 u'1700818057;Hamed Fingerhuth;hamed.fingerhuth@msn.com,hamed.fingerhuth@me.com;',
 u'17378782;Annamae Leyte;annamae.leyte@msn.com,annamae.leyte@facebook.com;',
 u'1723254379;Chao Peachy;chao.peachy@me.com,chao.peachy@gmail.com;(510) 121-0098',
 u'1946358537;Somtochukwu Mouritsen;somtochukwu.mouritsen@me.com;(669) 504-8080',
 u'33663453;Elisabeth Berry;elisabeth.berry@facebook.com;(802) 973-8267',
 u'1329323232;Jalan Blakely;jalan.blakely@gmail.com;',
 u'68524725;Lyric Boddy;lyric.boddy@yahoo.com;(273) 077-4039',
 u'629898066;Emilygrace Bossence;emilygrace.bossence@me.com;',
 u'1980300225;Warner Eddy;warner.eddy@gmail.com,warner.eddy@yahoo.com;(213) 586-6234,(618) 671-7611',
 u'1044067626;Kienan Drummond;kienan.drummond@aol.com,kienan.drummond

Each row in the `user` RDD represent the user with his/her `user_id, name, email, phone`. Each row in the 
   `transaction` RDDs has the columns  `user_id, amount_paid, date`. Map functions to the RDDs to make each row in 
   the RDDs a json **string** such as `{user_id: XXX, name: XXX, email:XXX, phone:XXX}` (use `json.dumps()`).

   **P.S.: Strip the `$` sign in the `amount_paid` column in the `transaction` RDD so it would be recognize as a   
   float when read into a SchemaRDD.**

In [62]:
def make_user_rows(line):
    # get all user info
    try:
        Id, name, email, phone = line.split(";")
    except ValueError:
        # try to get email
        try:
            Id, name, email = line.split(";")
            phone = "null"
        # if email not provided
        except ValueError:
            Id = line.split(";")[0]
            name = line.split(";")[1]
            email = "null"
            phone = "null"
    return {"user_id": int(Id), "name": str(name), "email":str(email), "phone":str(phone)}

In [40]:
def make_transaction_rows(line):
    Id, amount, date = line.split(";")
    return {"user_id": int(Id), "amount": float(amount.lstrip("$")), "data":str(date)}

In [63]:
user_json = user_rdd.map(lambda line: make_user_rows(line))

In [64]:
transaction_json = transaction_rdd.map(lambda line: make_transaction_rows(line))

In [23]:
user_json.take(1)

[{'email': 'prometheus.barwis@me.com',
  'name': 'Prometheus Barwis',
  'phone': '(533) 072-2779',
  'user_id': 1106214172}]

In [24]:
transaction_json.take(1)

[{'amount': 144.82, 'data': '2015-09-05', 'user_id': 815581247}]

Convert the `user` and `transaction` RDDs to SchemaRDDs. Print the schemas to make sure the conversion is 
   successful. Register the SchemaRDDs as separate tables and print the first couple of rows with SQL queries.
   
####We will use the term DataFrame to refer to both SchemaRDDs and DataFrames

In [65]:
df_trans = sqlContext.createDataFrame(transaction_json)
df_trans.show()

+------+----------+----------+
|amount|      data|   user_id|
+------+----------+----------+
|144.82|2015-09-05| 815581247|
|140.93|2014-03-11|1534673027|
|104.26|2014-05-06| 842468364|
| 194.6|2015-08-24|1720001139|
|307.72|2015-09-25|1397891675|
| 36.69|2014-10-24| 926282663|
| 39.59|2014-11-26| 694853136|
|430.94|2015-06-12| 636287877|
|  31.4|2014-12-05|1396310477|
|180.69|2015-03-26|1279939289|
|383.35|2014-06-06| 859061953|
| 256.2|2015-09-28|1983919868|
|930.56|2014-09-21| 589339046|
|423.77|2015-05-18|1559785598|
|309.53|2015-10-11| 347589978|
|299.19|2014-04-06| 963722938|
|426.21|2015-09-10|1808365853|
|732.27|2015-09-30| 417552135|
|186.33|2015-12-30| 744965566|
| 925.8|2014-10-06|1513020241|
+------+----------+----------+



In [66]:
df_user = sqlContext.createDataFrame(user_json)
df_user.show()

+--------------------+--------------------+--------------------+----------+
|               email|                name|               phone|   user_id|
+--------------------+--------------------+--------------------+----------+
|prometheus.barwis...|   Prometheus Barwis|      (533) 072-2779|1106214172|
|ashraf.bainbridge...|   Ashraf Bainbridge|                    | 527133132|
|alain.hennesey@fa...|      Alain Hennesey|(942) 208-8460,(8...|1290614884|
|hamed.fingerhuth@...|    Hamed Fingerhuth|                    |1700818057|
|annamae.leyte@msn...|       Annamae Leyte|                    |  17378782|
|chao.peachy@me.co...|         Chao Peachy|      (510) 121-0098|1723254379|
|somtochukwu.mouri...|Somtochukwu Mouri...|      (669) 504-8080|1946358537|
|elisabeth.berry@f...|     Elisabeth Berry|      (802) 973-8267|  33663453|
|jalan.blakely@gma...|       Jalan Blakely|                    |1329323232|
|lyric.boddy@yahoo...|         Lyric Boddy|      (273) 077-4039|  68524725|
|emilygrace.

In [67]:
df_trans.cache
df_user.cache

<bound method DataFrame.cache of DataFrame[email: string, name: string, phone: string, user_id: bigint]>

In [68]:
# register tables with SparkSQL context
df_user.registerTempTable("users")
df_trans.registerTempTable("trans")

Write a SQL query to return the names and the amount paid for the users with the **top 10** transaction amount.

In [69]:
result = sqlContext.sql("""SELECT users.name,
                           SUM(trans.amount) 
                           AS total 
                           FROM users 
                           JOIN trans 
                           ON users.user_id = trans.user_id 
                           GROUP BY users.name 
                           ORDER BY total 
                           DESC 
                           LIMIT 10""")

In [70]:
result.show()

+--------------------+------------------+
|                name|             total|
+--------------------+------------------+
|  Kashawn Macpherson|21945.300000000003|
|       Brysten Jeffs|          21773.51|
|      Martez Carlyle|21120.549999999996|
|         Jaivyn Hoks|20641.109999999997|
|       Bryanne Stopp|          20380.16|
|Leanthony Waldegrave|          20322.11|
| Roosevelt Gooderham|20230.059999999998|
|       Demont Howell|20172.169999999995|
|      Nasteha Bister|20163.909999999996|
|    Analaura Beetham|19998.190000000002|
+--------------------+------------------+

