In order to use dataframes and the SQL engine in Apache Spark (SparkSQL), we need a Spark Session.


In [50]:
import pyspark

In [51]:
from pyspark.sql import SparkSession

In [52]:
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [53]:
spark

## Spark Dataframes

A DataFrame is two-dimensional. Columns can be of different data types. DataFrames accept many data inputs including series and other DataFrames. You can pass indexes (row labels) and columns (column labels). Indexes can be numbers, dates, or strings/tuples.

### Read dataset as Spark dataframe from file

In [54]:
# Read Dataset as Spark dataframe from json file
df = spark.read.json('people.json')

In [55]:
df.show()
df.head(2)
df.printSchema()
df.dtypes

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
+---+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



[('age', 'bigint'), ('name', 'string')]

In [28]:
# Read Dataset as Spark dataframe from csv file
df = spark.read.option('header','true').csv('people.csv', inferSchema=True)

In [29]:
df.show()
df.head(2)
df.printSchema()
df.dtypes

+-------+---+
|   name|age|
+-------+---+
|Michael| 25|
|   Andy| 24|
| Justin| 19|
| George| 26|
|   Jeff| 30|
+-------+---+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



[('name', 'string'), ('age', 'int')]

In [17]:
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
| George|
|   Jeff|
+-------+



In [24]:
df.select(df["name"]).show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
| George|
|   Jeff|
+-------+



In [27]:
# filtering
df.filter(df["age"] > 21).show()

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 26| George|
| 30|   Jeff|
+---+-------+



In [28]:
# aggregation of data
df.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 26|    1|
| 19|    1|
| 25|    1|
| 30|    1|
| 24|    1|
+---+-----+



In [56]:
# add a column

from pyspark.sql.functions import length

df = df.withColumn('name_length', length(df['name']))
df.show()

+---+-------+-----------+
|age|   name|name_length|
+---+-------+-----------+
| 25|Michael|          7|
| 24|   Andy|          4|
| 19| Justin|          6|
| 26| George|          6|
| 30|   Jeff|          4|
+---+-------+-----------+



In [58]:
# drop a column

df = df.drop('name_length')
df.show()

+---+-------+
|age|   name|
+---+-------+
| 25|Michael|
| 24|   Andy|
| 19| Justin|
| 26| George|
| 30|   Jeff|
+---+-------+



In [45]:
spark.stop()