In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql.utils import AnalysisException

# Start (or reuse) a Spark session that runs against a local master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

# Handle on the SparkContext that backs the session.
sc = spark.sparkContext

# 1. How to create a DataFrame

In [2]:
# Load the cars dataset from JSON, passing the inferSchema option to the reader.
first_df = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .load("data/cars")
)

# Preview the rows, then print the schema Spark came up with.
first_df.show()
first_df.printSchema()

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|        8|       302.0|       140|

In [None]:
# Declare the schema by hand instead of letting Spark infer it from the JSON.
cars_schema = StructType([
    StructField("Name", StringType()),
    StructField("Acceleration", DoubleType()),
    StructField("Cylinders", LongType()),
    StructField("Displacement", DoubleType()),
    StructField("Horsepower", IntegerType()),
    StructField("Miles_per_Gallon", DoubleType()),
    StructField("Origin", StringType()),
    StructField("Weight_in_lbs", LongType()),
    StructField("Year", StringType()),
])

# !!! Very important warning: LongType vs IntegerType.
# Horsepower is declared as IntegerType here, while Cylinders and
# Weight_in_lbs are LongType. Keep that difference in mind when this schema
# is compared with an inferred one (see the comparison notes below).

# Read the same data, this time applying the manual schema.
cars_df = (
    spark.read
    .format("json")
    .schema(cars_schema)
    .load("data/cars")
)

# "test" is not a field of cars_schema, so Spark cannot resolve this filter and
# raises AnalysisException. The error is caught and printed so that
# Restart & Run All does not stop on this cell.
# NOTE(review): presumably kept to illustrate the "parsed plan does not check
# fields" note below - confirm; if a real filter was intended, replace "test"
# with an existing column and keep the result.
try:
    cars_df.filter(col("test") > 7)
except AnalysisException as analysis_error:
    print(f"AnalysisException: {analysis_error}")

cars_df.show()

# --- Comparing DataFrames -----------------------------------------------------
#
# most_powered_df.explain(True)   # most_powered_df is built in the Catalyst cell
#
# Plan stages (author's notes):
#   Parsed plan           = does not check fields, functions, expressions
#   Analyzed plan         = TODO
#   Optimized logical plan = 30-40 optimisations
#   Physical plan         = TODO
#
# Ways to compare two DataFrames (author's notes):
#   1 ==> first_df == first_df_inferSchema
#   2 ==> counts plus union + distinct of first_df and first_df_inferSchema:
#         with count1 = 10 and count2 = 10, union (union all) + distinct gives
#         either 20 or 10; an anti join is another option
#   3 ==> TODO

first_df_inferSchema = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .load("data/cars")
)

# Uncomment to compare the inferred schema with the manual one:
# first_df_inferSchema.printSchema()
# cars_df.printSchema()
# assert first_df_inferSchema.schema == cars_df.schema

# 3. Catalyst

In [None]:
# Catalyst pipeline (author's notes):
#   DAG => parsed plan => analyzed plan => optimized plan => physical plan
#       => code generation
# Logical plans:
#   - parsed plan
#   - analyzed plan
#   - optimized plan

# Cars with more than 4 cylinders, plus a derived column "new"
# (Acceleration + 10), ordered by Horsepower descending and then by
# Acceleration ascending.
most_powered_df = (
    cars_df
    .where(cars_df.Cylinders > 4)
    .withColumn("new", expr("Acceleration + 10"))
    .sort(cars_df.Horsepower.desc(), cars_df.Acceleration.asc())
)

# Extended explain: prints the logical plans as well as the physical plan.
most_powered_df.explain(True)

most_powered_df.show()

# Print the generated code for the physical plan.
# queryExecution().debug().codegen() is the Scala Dataset API; a PySpark
# DataFrame has no queryExecution attribute, so the explain "codegen" mode
# (Spark 3.0+) is used instead.
most_powered_df.explain(mode="codegen")