In order to use dataframes and the SQL engine in Apache Spark (SparkSQL), we need a Spark Session.


In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the SparkSession for this notebook, or reuse one that already exists.
# 'Practice' is the application name passed to the builder.
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

24/04/17 02:41:32 WARN Utils: Your hostname, elena-VB resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/04/17 02:41:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/17 02:41:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Bare expression as the last line of the cell, so the notebook displays the session object.
spark

## Spark Dataframes

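A Spark DataFrame is a distributed, two-dimensional collection of rows organized into named columns. Columns can be of different data types. DataFrames can be created from many data inputs, including files (JSON, CSV, Parquet), existing RDDs, and pandas DataFrames. Unlike pandas DataFrames, Spark DataFrames have no row index (row labels): rows are not addressed by numbers, dates, or strings/tuples, but are selected through column expressions and filters.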

### Read dataset as Spark dataframe from file

In [54]:
# Load people.json into a Spark DataFrame using the generic reader with an explicit format.
df = spark.read.format('json').load('people.json')

In [55]:
# Inspect the DataFrame: contents, first rows, schema and column types.
df.show()
# Only the last expression of a cell is displayed automatically, so the result of
# head(2) has to be displayed explicitly; otherwise it is silently discarded.
display(df.head(2))
df.printSchema()
# Last expression of the cell: shown as the cell's output.
df.dtypes

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
+---+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



[('age', 'bigint'), ('name', 'string')]

In [28]:
# Load people.csv into a Spark DataFrame: the first line is the header row and
# column types are inferred from the data.
# NOTE: this rebinds `df`, replacing the DataFrame that was read from people.json.
df = spark.read.csv('people.csv', header=True, inferSchema=True)

In [29]:
# Inspect the DataFrame: contents, first rows, schema and column types.
df.show()
# Only the last expression of a cell is displayed automatically, so the result of
# head(2) has to be displayed explicitly; otherwise it is silently discarded.
display(df.head(2))
df.printSchema()
# Last expression of the cell: shown as the cell's output.
df.dtypes

+-------+---+
|   name|age|
+-------+---+
|Michael| 25|
|   Andy| 24|
| Justin| 19|
| George| 26|
|   Jeff| 30|
+-------+---+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



[('name', 'string'), ('age', 'int')]

In [17]:
# Select a single column by passing its name as a string, then print the result.
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
| George|
|   Jeff|
+-------+



In [24]:
# Select the `name` column by passing the column object df["name"] rather than a string name.
df.select(df["name"]).show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
| George|
|   Jeff|
+-------+



In [27]:
# Keep only the rows whose age is greater than 21 and print them.
df.where(df.age > 21).show()

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 26| George|
| 30|   Jeff|
+---+-------+



In [28]:
# Aggregate: count how many rows share each distinct age value.
(
    df.groupby("age")
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 26|    1|
| 19|    1|
| 25|    1|
| 30|    1|
| 24|    1|
+---+-----+



### Add and drop a column

In [56]:
# Add a derived column `name_length` computed by applying `length` to the `name` column.
from pyspark.sql.functions import length

# withColumn returns a new DataFrame, so the result is assigned back to `df`.
df = df.withColumn(colName="name_length", col=length(df["name"]))
df.show()

+---+-------+-----------+
|age|   name|name_length|
+---+-------+-----------+
| 25|Michael|          7|
| 24|   Andy|          4|
| 19| Justin|          6|
| 26| George|          6|
| 30|   Jeff|          4|
+---+-------+-----------+



In [58]:
# Drop a column.
# The result is assigned back to `df`, so from this cell on `df` refers to the
# DataFrame without `name_length`.

df = df.drop('name_length')
df.show()

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
+---+-------+



In [45]:
# The session is intentionally NOT stopped here: the "Missing Values" cells below
# still read data through `spark`, and stopping the session at this point makes
# them fail on Restart & Run All. Stop the session only after the last cell that
# uses it (run `spark.stop()` at the very end of the notebook).

### Missing Values

In [24]:
# Load class_grades.csv: the first line is the header row and column types are inferred.
df_missing = spark.read.csv('class_grades.csv', header=True, inferSchema=True)

In [25]:
# Print the first 10 rows of the grades table.
df_missing.show(n=10)

+------+----------+--------+-------+--------+-----+
|Prefix|Assignment|Tutorial|Midterm|TakeHome|Final|
+------+----------+--------+-------+--------+-----+
|     5|     57.14|   34.09|  64.38|   51.48| 52.5|
|     8|     95.05|  105.49|   67.5|   99.07|68.33|
|     8|      83.7|   83.17|   30.0|   63.15|48.89|
|     7|     81.22|   96.06|  49.38|  105.93|80.56|
|     8|     91.32|   93.64|   95.0|  107.41|73.89|
|     7|      95.0|   92.58|  93.12|   97.78|68.06|
|     8|     95.05|  102.99|  56.25|   99.07| 50.0|
|     7|     72.85|   86.85|   60.0|    NULL|56.11|
|     8|     84.26|    93.1|   47.5|   18.52|50.83|
|     7|      90.1|   97.55|  51.25|   88.89|63.61|
+------+----------+--------+-------+--------+-----+
only showing top 10 rows



In [26]:
# Drop every row that contains a null value.
# NOTE: this rebinds `df_missing`, so the unfiltered table is no longer reachable by name.
df_missing = df_missing.dropna()

In [27]:
# Print the first 10 rows again, now that rows containing nulls have been removed.
df_missing.show(n=10)

+------+----------+--------+-------+--------+-----+
|Prefix|Assignment|Tutorial|Midterm|TakeHome|Final|
+------+----------+--------+-------+--------+-----+
|     5|     57.14|   34.09|  64.38|   51.48| 52.5|
|     8|     95.05|  105.49|   67.5|   99.07|68.33|
|     8|      83.7|   83.17|   30.0|   63.15|48.89|
|     7|     81.22|   96.06|  49.38|  105.93|80.56|
|     8|     91.32|   93.64|   95.0|  107.41|73.89|
|     7|      95.0|   92.58|  93.12|   97.78|68.06|
|     8|     95.05|  102.99|  56.25|   99.07| 50.0|
|     8|     84.26|    93.1|   47.5|   18.52|50.83|
|     7|      90.1|   97.55|  51.25|   88.89|63.61|
|     7|     80.44|    90.2|   75.0|   91.48|39.72|
+------+----------+--------+-------+--------+-----+
only showing top 10 rows

