In [1]:
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import os

In [2]:
# getOrCreate() reuses an already-running SparkContext, so re-running this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 15:46:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Obtain the session through the builder: it attaches to the active SparkContext
# (`sc`) and returns the existing session when this cell is re-run.
spark = SparkSession.builder.getOrCreate()

### 1. Execute SQL query

#### 1.1. `createOrReplaceTempView` method
`createOrReplaceTempView`: registers the DataFrame as a temporary view (replacing any existing view with the same name) so that it can be queried with SQL.

In [4]:
# Build the path to the Walmart stock CSV relative to the notebook's directory.
csv_file_path = os.path.join('..', 'src-data', 'walmart_stock.csv')

# Load the CSV using its first row as column names (no schema inference requested),
# then preview the first five rows.
schema_1 = spark.read.csv(csv_file_path, header=True)
schema_1.show(n=5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



In [5]:
# Expose the DataFrame to Spark SQL under the view name 'walmart_stock'.
schema_1.createOrReplaceTempView(name='walmart_stock')

#### 1.2. `spark.sql(<query>)`

`SELECT`

In [6]:
# SQL text: project only the Date and Volume columns from the temp view.
query = (
    'SELECT Date, Volume '
    'FROM walmart_stock'
)

In [7]:
# Run the SQL text held in `query` and preview the first five rows of the result.
query_result_1 = spark.sql(sqlQuery=query)
query_result_1.show(n=5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-06| 8069400|
|2012-01-09| 6679300|
+----------+--------+
only showing top 5 rows



`WHERE`

In [8]:
# SQL text: same projection, restricted to rows with Volume of at least 9,000,000.
query = (
    'SELECT Date, Volume '
    'FROM walmart_stock '
    'WHERE Volume >= 9000000'
)

In [9]:
# Run the filtered query and preview the first five matching rows.
query_result_2 = spark.sql(sqlQuery=query)
query_result_2.show(n=5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-19| 9234600|
|2012-01-20|10378800|
+----------+--------+
only showing top 5 rows



Use with `SUBSTRING(<col_name>, <position>, <length>)` — `<position>` is 1-based (Spark treats `0` the same as `1`).

In [10]:
# SUBSTRING positions are 1-based in Spark SQL; start at 1 (not 0) to take the
# four-character year prefix of Date.
query = 'SELECT SUBSTRING(Date, 1, 4) AS Year, Volume FROM walmart_stock WHERE Volume >= 9000000'

In [11]:
# Run the SUBSTRING query and preview the first five rows (Year, Volume).
query_result_3 = spark.sql(sqlQuery=query)
query_result_3.show(n=5)

+----+--------+
|Year|  Volume|
+----+--------+
|2012|12668800|
|2012| 9593300|
|2012|12768200|
|2012| 9234600|
|2012|10378800|
+----+--------+
only showing top 5 rows



### 2. Compare SQL vs DataFrame API

#### 2.1. Select

DataFrame API

In [12]:
# DataFrame API projection: keep Date and Volume, then preview five rows.
schema_1.select('Date', 'Volume').show(n=5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-06| 8069400|
|2012-01-09| 6679300|
+----------+--------+
only showing top 5 rows



SQL

In [13]:
# SQL projection of Date and Volume from the temp view, previewing five rows.
query = (
    'SELECT Date, Volume '
    'FROM walmart_stock'
)
spark.sql(sqlQuery=query).show(n=5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-06| 8069400|
|2012-01-09| 6679300|
+----------+--------+
only showing top 5 rows



#### 2.2. Where

DataFrame API

In [14]:
# Use >= (not >) so this filter matches the SQL version's `Volume >= 9000000`.
schema_1.where(schema_1.Volume >= 9000000).select(['Date', 'Volume']).show(5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-19| 9234600|
|2012-01-20|10378800|
+----------+--------+
only showing top 5 rows



SQL

In [21]:
# SQL filter: rows with Volume of at least 9,000,000, previewing five rows.
query = (
    'SELECT Date, Volume '
    'FROM walmart_stock '
    'WHERE Volume >= 9000000'
)
spark.sql(sqlQuery=query).show(n=5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2012-01-03|12668800|
|2012-01-04| 9593300|
|2012-01-05|12768200|
|2012-01-19| 9234600|
|2012-01-20|10378800|
+----------+--------+
only showing top 5 rows



#### 2.3. Like

In [15]:
# DataFrame API LIKE filter: keep rows whose Date starts with '2013',
# then project Date and Volume and preview five rows.
(
    schema_1
    .where(schema_1['Date'].like('2013%'))
    .select('Date', 'Volume')
    .show(5)
)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2013-01-02|10390800|
|2013-01-03| 8910100|
|2013-01-04| 6438000|
|2013-01-07| 6201400|
|2013-01-08| 5866900|
+----------+--------+
only showing top 5 rows



In [16]:
# Quote the LIKE pattern with single quotes: that is the standard SQL string-literal
# form, whereas double quotes can be parsed as identifiers under ANSI settings.
query = "SELECT Date, Volume FROM walmart_stock WHERE Date LIKE '2013%'"
spark.sql(query).show(5)

+----------+--------+
|      Date|  Volume|
+----------+--------+
|2013-01-02|10390800|
|2013-01-03| 8910100|
|2013-01-04| 6438000|
|2013-01-07| 6201400|
|2013-01-08| 5866900|
+----------+--------+
only showing top 5 rows



#### 2.4. Substring

DataFrame API

In [17]:
from pyspark.sql.functions import substring

# substring() positions are 1-based, so pos=1 with len=4 takes the leading
# four characters (the year) of Date and stores them in a new 'year' column.
schema_1.withColumn('year', substring('Date', pos=1, len=4)).show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+----+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|year|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|2012|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|2012|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|2012|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|2012|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|2012|
+----------+------------------+---------+---------+------------------+--------+------------------+----+
only showing top 5 rows



SQL

In [18]:
# SUBSTRING positions are 1-based in Spark SQL; start at 1 (not 0) to take the
# four-character year prefix of Date.
query = 'SELECT SUBSTRING(Date, 1, 4) AS Year, Volume FROM walmart_stock WHERE Volume >= 9000000'
spark.sql(query).show(5)

+----+--------+
|Year|  Volume|
+----+--------+
|2012|12668800|
|2012| 9593300|
|2012|12768200|
|2012| 9234600|
|2012|10378800|
+----+--------+
only showing top 5 rows

