# DataFrames

Starting to rock the world with Apache Spark 

## Create and Show a DF

In [None]:
# Obtain the notebook's SparkSession via the builder's getOrCreate().
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# SparkContext handle, used later for sc.parallelize(...)
sc = spark.sparkContext

In [None]:
# Three rows, two integer columns named A and B
df = spark.createDataFrame(
    data=[(1, 4), (2, 5), (3, 6)],
    schema=["A", "B"],
)

In [None]:
# Number of rows in df
df.count()

In [None]:
# Print the rows of df as a text table
df.show()

## Select a column

In [None]:
# Attribute access on a DataFrame returns a Column object
df.A

In [None]:
# Project only column A and print it
df.select('A').show()

## Add a new Column based on another

In [None]:
# Add a new column C computed as A + 1.
# The result is only displayed; df itself is not reassigned here.
df.withColumn('C',df.A+1).show()

## Add a new Column with constant values

In [None]:
# lit() wraps a Python literal as a Column so it can be used as a column value
from pyspark.sql.functions import lit
# Add a column C holding the constant 5 in every row (display only; df is not reassigned)
df.withColumn('C',lit(5)).show()

## Filter columns

In [None]:
# Keep A and add a column named "State" holding the result of the comparison A > 2
df.select('A',(df.A > 2).alias("State")).show()

In [None]:
# Indexing a DataFrame with a Column condition keeps only the rows where A > 2
df[(df.A > 2)].show()

## GroupBy

In [None]:
# Rebind df to a small (names, age) frame used by the groupBy examples
df = spark.createDataFrame(
    data=[('a', 33), ('b', 11), ('a', 22)],
    schema=['names', 'age'],
)

In [None]:
# Print the (names, age) rows
df.show()

In [None]:
# Group the rows by the names column; aggregations are applied to gdf in the cells below
gdf = df.groupBy(df.names)

In [None]:
# Dict form of agg: {column: aggregate function name}. Here: count the rows of each group.
gdf.agg({"*":"count"}).collect()

In [None]:
from pyspark.sql import functions as F
# Same data and grouping as the cells above, rebuilt so this cell runs on its own
df = spark.createDataFrame([('a',33), ('b',11), ('a',22)],['names','age'])
gdf = df.groupBy(df.names)

# Minimum age per name via a pyspark.sql.functions aggregate; the collected rows
# are passed through sorted() before being displayed
sorted(gdf.agg(F.min(df.age)).collect())

In [None]:
# Same minimum-age-per-name query, using the grouped object's min() method
# instead of agg(F.min(...))
g2df = df.groupBy(df.names)
g2df.min('age').collect()

### Generate your own DataFrame
Create the `string_JSON_RDD` RDD of JSON strings and then convert it into a DataFrame by reading it with `spark.read.json`.

In [None]:
# Generate our own JSON data:
# an RDD of three strings, each string holding one JSON object
# with the keys id, name, age and eyeColor
string_JSON_RDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Argenis",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Liliana",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Ana",
    "age": 23,
    "eyeColor": "blue"
  }""")
)

In [None]:
# Create a DataFrame by reading the RDD of JSON strings
swimmers_JSON = spark.read.json(string_JSON_RDD)

In [None]:
# Register the DataFrame as the temporary view "swimmersJSON" so that
# spark.sql(...) queries can refer to it by that name
swimmers_JSON.createOrReplaceTempView("swimmersJSON")

In [None]:
# DataFrame API: print the rows directly from the DataFrame
swimmers_JSON.show()

In [None]:
# SQL query against the temporary view; collect() returns the result rows to the notebook
spark.sql("select * from swimmersJSON").collect()

In [None]:
# Same query without .show()/.collect(): the cell output is the DataFrame object itself
spark.sql("select * from swimmersJSON")

## Inferring the Schema Using Reflection
Note that Apache Spark is inferring the schema using reflection; i.e. it automatically determines the schema of the data based on reviewing the JSON data.

In [None]:
# Print the schema that was inferred when the JSON strings were read
swimmers_JSON.printSchema()

Notice that Spark was able to infer the schema (as shown when reviewing it with `.printSchema`).

But what if we want to programmatically specify the schema?

## Programmatically Specifying the Schema
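In this case, let's specify the schema explicitly for `CSV`-style records (id, name, age, eyeColor), held here in an in-memory RDD of tuples rather than read from a `CSV` text file.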

In [None]:
# Import only the type classes this cell needs instead of `import *`,
# so it is clear where each name comes from.
from pyspark.sql.types import LongType, StringType, StructField, StructType

# In-memory rows in the order (id, name, age, eyeColor)
string_CSV_RDD = sc.parallelize([(123, 'Argenis', 19, 'brown'), (234, 'Liliana', 22, 'green'), (345, 'Ana', 23, 'blue')])

# Define the schema explicitly with StructType/StructField using pyspark.sql.types.
# Each StructField is given as (column name, data type, nullable).
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True),
])

# Apply the schema to the RDD and create the DataFrame
swimmers = spark.createDataFrame(string_CSV_RDD, schema)

# Register the DataFrame as the temporary view "swimmers" for SQL queries
swimmers.createOrReplaceTempView("swimmers")

In [None]:
# Print the schema
#   Notice that id is declared as LongType in the explicit schema, whereas the
#   JSON example stored id as a quoted string
swimmers.printSchema()

In [None]:
# Query the "swimmers" view; no .show()/.collect() is called, so the cell
# output is the DataFrame object itself
spark.sql("select * from swimmers")

As you can see from above, we can programmatically apply the `schema` instead of allowing the Spark engine to infer the schema via reflection.

Additional Resources include:
* [PySpark API Reference](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)
* [Spark SQL, DataFrames, and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html#programmatically-specifying-the-schema): This is in reference to Programmatically Specifying the Schema using a `CSV` file.

## Querying with dataframe

In [None]:
# Query id and age for swimmers with age = 22 via DataFrame API,
# passing the condition to filter() as a SQL expression string
swimmers.select("id", "age").filter("age = 22").show()

In [None]:
# Query id and age for swimmers with age = 22 via DataFrame API in another way:
# columns and the condition are given as Column expressions instead of strings
swimmers.select(swimmers.id, swimmers.age).filter(swimmers.age == 22).show()

In [None]:
# Query id and age for swimmers with age = 22 in SQL, against the "swimmers" view
spark.sql("select id, age from swimmers where age = 22").show()

In [None]:
# Same SQL query without .show(): the cell output is the DataFrame object itself
spark.sql("select id, age from swimmers where age = 22")

In [None]:
# Query name and eye color for swimmers with eye color starting with the letter 'b'
# (SQL LIKE pattern 'b%')
spark.sql("select name, eyeColor from swimmers where eyeColor like 'b%'").show()

In [None]:
# Same SQL query without .show(): the cell output is the DataFrame object itself
spark.sql("select name, eyeColor from swimmers where eyeColor like 'b%'")

## Querying with the DataFrame API
With DataFrames, you can start writing your queries using the DataFrame API

In [None]:
# Show the rows of the swimmers DataFrame
swimmers.show()

In [None]:
# Get the number of rows in swimmers
swimmers.count()

In [None]:
# Get the id, age where age = 22 (filter condition given as a SQL expression string)
swimmers.select("id", "age").filter("age = 22").show()

In [None]:
# Get the name, eyeColor where eyeColor like 'b%' (eye colors starting with 'b')
swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()

## DataFrame Queries
* Understanding explode, selectExpr

In [None]:
# Import the Row class from pyspark.sql (explicitly, rather than `import *`,
# since Row is the only name this cell uses)
from pyspark.sql import Row

# Create Example Data - Departments and Employees

# Create the Departments (Rows built with keyword fields id and name)
department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
department3 = Row(id='345678', name='Theater and Drama')
department4 = Row(id='901234', name='Indoor Recreation')

# Create the Employees
# Employee is a Row template holding the field names; calling it with values
# produces a Row with those fields in the same order.
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee('favio', 'vazquez', 'no-reply@iron-ai.com', 100000)
employee11 = Employee('favio', 'vazquez', 'no-reply@bbva.com', 200000)
employee2 = Employee('argenis', 'leon', 'no-reply@iron-ai.com', 300000)
# Some employees deliberately have a missing first or last name (None);
# these are used by the missing-data examples later in the notebook.
employee3 = Employee('liliana', None, 'no-reply@iron-ai.com', 350000)
employee31 = Employee('liliana', None, 'no-reply@google.com', 180000)
employee4 = Employee(None, 'ferro', 'no-reply@iron-ai.com', 160000)

# Create the DepartmentWithEmployees instances from Departments and Employees:
# each Row nests one department Row and a list of employee Rows
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4, employee11])
departmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4, employee31])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])

In [None]:
# First DataFrame: departments 1 and 2 with their employees
departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
df1 = spark.createDataFrame(departmentsWithEmployeesSeq1)

# Second DataFrame: departments 3 and 4 with their employees
departmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]
df2 = spark.createDataFrame(departmentsWithEmployeesSeq2)

In [None]:
# Print df1 with the default show() settings
df1.show()

In [None]:
# truncate=False prints the full cell contents instead of shortening long values
df2.show(truncate=False)

In [None]:
# Combine the rows of df1 and df2 into a single DataFrame
unionDF = df1.union(df2)

In [None]:
# Print the combined departments DataFrame
unionDF.show()

In [None]:
from pyspark.sql.functions import explode

# explode() turns each element of the employees array into its own row (aliased "e"),
# keeping the department column alongside it. Note that this rebinds df.
df = unionDF.select("department",explode("employees").alias("e"))

In [None]:
# Print the exploded rows without truncating the nested values
df.show(truncate=False)

In [None]:
# Return the exploded rows to the notebook as Row objects
df.collect()

In [None]:
# selectExpr takes SQL expression strings; dotted paths pull the nested
# department and employee ("e") fields out as top-level columns
df.selectExpr("department.id","department.name","e.firstName", "e.lastName", "e.email", "e.salary").show()

In [None]:
from pyspark.sql.functions import explode

# Explode only the employees array (no department column); this rebinds df again
df = unionDF.select(explode("employees").alias("e"))

# Flatten the employee struct into four top-level columns.
# explodeDF is the base DataFrame for the filter/aggregation examples that follow.
explodeDF = df.selectExpr("e.firstName", "e.lastName", "e.email", "e.salary")
explodeDF.show()

In [None]:
# Employees whose first name is 'favio', sorted by salary
filterDF = explodeDF.filter( explodeDF.firstName == 'favio').sort(explodeDF.salary)

In [None]:
# Print the filtered, salary-sorted rows
filterDF.show()

In [None]:
# Different ways of calling a column: as an attribute of the DataFrame being
# filtered (explodeDF.firstName) or by name through col("firstName").
# Explicit imports are used instead of `import *`, because the star import of
# pyspark.sql.functions shadows Python builtins such as sum, min and max.
from pyspark.sql.functions import asc, col, desc

# The attribute reference must come from explodeDF (the frame being filtered),
# not from the previous filterDF, so this cell does not depend on the old value
# of the variable it reassigns.
filterDF = explodeDF.filter(
    (explodeDF.firstName == "favio") | (col("firstName") == "argenis")
).sort(desc("lastName"))
filterDF.show()

In [None]:
# where() used with the same kind of condition as filter() above, this time
# sorting ascending by lastName.
# col and asc must already have been imported from pyspark.sql.functions by an earlier cell.
whereDF = explodeDF.where((col("firstName") == "argenis") | (col("firstName") == "favio")).sort(asc("lastName"))
whereDF.show()

### Handling Missing Data

In [None]:
from pyspark.sql.functions import col, asc, desc
# Keep only employees that have both a first name and a last name, sorted by email
filterNonNullDF = explodeDF.filter(col("firstName").isNotNull()).filter(col("lastName").isNotNull()).sort("email")
filterNonNullDF.show()

In [None]:
from pyspark.sql.functions import countDistinct,count

# Group by (firstName, lastName) and count the distinct firstName values per group.
# Because firstName is itself one of the grouping keys, every group contains a
# single firstName value, so this distinct count can never exceed 1; it does not
# tell you how many rows each group has (compare with the next cell).
countDistinctDF = explodeDF.select("firstName", "lastName")\
  .groupBy("firstName", "lastName")\
  .agg(countDistinct("firstName"))

countDistinctDF.show()

In [None]:
# Careful: the variable keeps the name countDistinctDF, but here it holds a plain
# row count per (firstName, lastName) group via count("*"), not a distinct count
from pyspark.sql.functions import count

countDistinctDF = explodeDF.select("firstName", "lastName")\
  .groupBy("firstName", "lastName")\
  .agg(count("*"))
countDistinctDF.show()

In [None]:
# Summary statistics for the salary column
explodeDF.describe("salary").show()

For more information, please refer to:
* [Spark SQL, DataFrames and Datasets Guide](http://spark.apache.org/docs/latest/sql-programming-guide.html#sql)
* [PySpark SQL Module: DataFrame](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)
* [PySpark SQL Functions Module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions)

### DropDuplicates

In [None]:
# Sample data for the de-duplication examples. It contains:
#   - one fully repeated row (the id=3 row appears twice),
#   - two rows that differ only in id (ids 1 and 4),
#   - two different rows that share id 5.
df = spark.createDataFrame(
    data=[
        (1, 144.5, 5.9, 33, 'M'),
        (2, 167.2, 5.4, 45, 'M'),
        (3, 124.1, 5.2, 23, 'F'),
        (4, 144.5, 5.9, 33, 'M'),
        (5, 133.2, 5.7, 54, 'F'),
        (3, 124.1, 5.2, 23, 'F'),
        (5, 129.2, 5.3, 42, 'M'),
    ],
    schema=['id', 'weight', 'height', 'age', 'gender'],
)
df.show()

In [None]:
# Drop rows that are identical in every column; df is rebound to the result
df = df.dropDuplicates()
df.show()

In [None]:
# Row count after removing the exact duplicates
df.count()

In [None]:
# Drop rows that duplicate another row in every column except id:
# the subset passed to dropDuplicates is all column names other than 'id'
df = df.dropDuplicates(subset=[c for c in df.columns if c != 'id'])

In [None]:
# Print the rows remaining after the subset-based de-duplication
df.show()

In [None]:
# The column subset used by the de-duplication above: every column name except 'id'
[c for c in df.columns if c != 'id']

### Aggregation

In [None]:
import pyspark.sql.functions as F
# Compare the number of id values with the number of distinct id values;
# a difference between the two means some id is used by more than one row
df.agg(
  F.count('id').alias('count'),
  F.countDistinct('id').alias('distinct')
).show()

### More on Handling Missing Data

In [None]:
# Sample data with missing values (None) scattered across weight, age, gender
# and income; the row with id 3 is missing four of its six fields.
df_miss = spark.createDataFrame(
    data=[
        (1, 143.5, 5.6, 28,   'M',  100000),
        (2, 167.2, 5.4, 45,   'M',  None),
        (3, None , 5.2, None, None, None),
        (4, 144.5, 5.9, 33,   'M',  None),
        (5, 133.2, 5.7, 54,   'F',  None),
        (6, 124.1, 5.2, None, 'F',  None),
        (7, 129.2, 5.3, 42,   'M',  76000),
    ],
    schema=['id', 'weight', 'height', 'age', 'gender', 'income'],
)

In [None]:
# Print the rows, including the missing values
df_miss.show()

In [None]:
# Print the column types Spark inferred for df_miss
df_miss.printSchema()

In [None]:
# Summary statistics for df_miss (no column list is passed to describe())
df_miss.describe().show()

In [None]:
# Calculate the number of missing (None) values in each row, returned as
# (id, missing_count) pairs. list(row).count(None) is used rather than the
# builtin sum(), which may have been shadowed by an earlier
# `from pyspark.sql.functions import *`.
df_miss.rdd.map(lambda row: (row['id'], list(row).count(None))).collect()

In [None]:
# Inspect the row with id 3 (the one with the most missing fields in the sample data)
df_miss.where('id == 3').show()

In [None]:
import pyspark.sql.functions as F

# One count per listed column, followed by count('*') over all rows so the
# per-column counts can be compared against the total.
df_miss.agg(
    F.count('weight'),
    F.count('height'),
    F.count('age'),
    F.count('gender'),
    F.count('income'),
    F.count('*'),
).show()

In [None]:
import pyspark.sql.functions as F

# One count per column of df_miss, built from df_miss's own column list.
# (Iterating df.columns here would use a different DataFrame's columns and
# silently leave out 'income'.)
df_miss.agg(
    *[F.count(c) for c in df_miss.columns]
).show()

In [None]:
# Keep only the rows that have at least 3 non-null values (thresh=3)
df_miss.dropna(thresh=3).show()