In [None]:
                                                            # Structured APIs
# Chapter 4. Structured API Overview
'''
The Structured APIs are a tool for manipulating all sorts of data, from unstructured log files to semi-structured CSV files and highly structured Parquet files. 
These APIs refer to three core types of distributed collection APIs:
1. Datasets
2. DataFrames
3. SQL tables and views

Note: Spark is a distributed programming model in which the user specifies transformations. Multiple transformations build up a directed acyclic graph of instructions. 
An action begins the process of executing that graph of instructions, as a single job, by breaking it down into stages and tasks to execute across the cluster.
The logical structures that we manipulate with transformations and actions are DataFrames and Datasets.
To create a new DataFrame or Dataset, you call a transformation. To start computation or convert to native language types, you call an action.

DataFrames and Datasets: DataFrames and Datasets are (distributed) table-like (i.e. structured) collections with well-defined rows and columns.
Tables and views are basically the same thing as DataFrames. We just execute SQL against them instead of DataFrame code

Schemas: A schema defines the column names and types of a DataFrame. You can define schemas manually or read a schema from a data source (often called schema on read). Schemas consist of
types, meaning that you need a way of specifying what lies where.

DataFrames vs Datasets: A DataFrame is "untyped": its columns do have types, but they are only checked at run time, whereas a Dataset's types must be defined up front.
Therefore, with schema-on-read, where the schema is inferred from only a few records, the inferred data type may not accurately represent every run-time value,
e.g. the first 5 records read may all hold decimal values while later records hold character values. Datasets are only available in Scala and Java (JVM-based).

Spark has its own internal data types, as it is effectively a programming language of its own, e.g. ByteType, IntegerType, LongType, DateType, etc.

Overview of Structured API Execution: 
1. Write DataFrame/Dataset/SQL Code.
2. If valid code, Spark converts this to a Logical Plan.
3. Spark transforms this Logical Plan to a Physical Plan, checking for optimizations along
the way.
4. Spark then executes this Physical Plan (RDD manipulations) on the cluster.

Execution: Upon selecting a physical plan, Spark runs all of this code over RDDs, the lower-level programming interface of Spark. Spark performs further
optimizations at runtime, generating native Java bytecode that can remove entire tasks or stages during execution. Finally the result is returned to the user.
'''

In [None]:
# Chapter 5. Basic Structured Operations

from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SparkInJupyter")
    .master("local[*]")
    .getOrCreate()
)

# Keep the session as the last expression so Jupyter renders its summary.
spark

In [14]:
# Load the 2015 flight summary once; `df` is reused by the cells below.
# NOTE(review): later cells read from "./data/flight-data/..." rather than
# "./SparkBasics/data/flight-data/..." -- confirm which root is correct for
# the notebook's working directory.
df = spark.read.format("json").load("./SparkBasics/data/flight-data/json/2015-summary.json")

# Show the inferred schema from the DataFrame we already have instead of
# reading (and schema-inferring) the same file a second time.
df.schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [None]:

from pyspark.sql.types import StructField, StructType, StringType, LongType

# Declare the schema by hand instead of letting the JSON reader infer it.
# The third field is declared non-nullable and carries column metadata.
myManualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
        StructField("count", LongType(), nullable=False, metadata={"hello":"world"}),
    ]
)

# Read the file with the explicit schema applied, then preview one row.
df = (
    spark.read.format("json")
    .schema(myManualSchema)
    .load("./data/flight-data/json/2015-summary.json")
)
df.show(1)


In [None]:

from pyspark.sql.functions import col, column, expr

# Two constructors that build a column reference from a name.
col("someColumnName")
column("someColumnName")

# A whole expression can also be given as a SQL-style string.
expr("(((someCol + 5) * 200) - 6) < otherCol")

In [None]:

from pyspark.sql import Row

# A Row built from positional values; fields are accessed by index.
myRow = Row("Hello", None, 1, False)

myRow[0]
myRow[2]

# Reload the flight data with an inferred schema and register it as a
# temporary view named "dfTable".
df = (
    spark.read.format("json")
    .load("./data/flight-data/json/2015-summary.json")
)
df.createOrReplaceTempView("dfTable")
df.show(1)

In [None]:

from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Build a one-row DataFrame from an in-memory Row and a hand-written schema.
myManualSchema = StructType(
    [
        StructField("some", StringType(), nullable=True),
        StructField("col", StringType(), nullable=True),
        StructField("names", LongType(), nullable=False),
    ]
)
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], schema=myManualSchema)
myDf.show(1)


In [None]:


# Selecting columns by name.
df.select("DEST_COUNTRY_NAME").show(2)
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

from pyspark.sql.functions import expr, col, column

# expr(), col() and column() references can be mixed in a single select().
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"),
).show(2)

# Rename inside the expression string ...
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

# ... and then alias the resulting column again with .alias().
df.select(
    expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
).show(2)

# selectExpr() takes the expression strings directly.
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

# Keep `.show(2)` attached to the closing parenthesis: a method call on its
# own indented line, with no line continuation, is a syntax error that stops
# the whole cell from running.
df.selectExpr(
    "*",  # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
).show(2)

# Aggregate expressions over the entire DataFrame.
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

In [None]:

from pyspark.sql.functions import lit

# Add a constant column: once through select(), once through withColumn().
df.select(expr("*"), lit(1).alias("One")).show(2)
df.withColumn("numberOne", lit(1)).show(2)

# Add a column computed from an expression over two existing columns.
df.withColumn(
    "withinCountry",
    expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"),
).show(2)

df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

# A column name containing spaces and a dash: it is passed as a plain string
# to withColumn(), and backtick-quoted inside expression strings.
dfWithLongColName = df.withColumn("This Long Column-Name", expr("ORIGIN_COUNTRY_NAME"))

dfWithLongColName.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`",
).show(2)

dfWithLongColName.select(expr("`This Long Column-Name`")).columns

In [None]:

# Two chained filters.
(
    df.where(col("count") < 2)
    .where(col("ORIGIN_COUNTRY_NAME") != "Croatia")
    .show(2)
)

# Number of distinct origin/destination pairs, then of distinct origins.
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

df.select("ORIGIN_COUNTRY_NAME").distinct().count()

# Seeded random sample of the rows.
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).count()

# Seeded random split with weights 0.25 / 0.75; compare the two sizes.
dataFrames = df.randomSplit([0.25, 0.75], seed=seed)
dataFrames[0].count() > dataFrames[1].count() # False

In [None]:

from pyspark.sql import Row

# Build a small DataFrame that shares `df`'s schema, going through an RDD
# of hand-written rows.
schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1),
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema=schema)

In [None]:
# Append the new rows to the flight data, then filter the combined result.
(
    df.union(newDF)
    .where("count = 1")
    .where(col("ORIGIN_COUNTRY_NAME") != "United States")
    .show()
)

In [None]:

# Sorts with the default direction, by one or several columns.
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

from pyspark.sql.functions import desc, asc

# Use desc() / Column.desc() for a descending sort. The string form
# expr("count desc") is parsed as the column `count` aliased to `desc`,
# so it does not sort in descending order.
df.orderBy(desc("count")).show(2)
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)

# Sort within each partition of the multi-file read; the resulting
# DataFrame is not assigned or displayed here.
spark.read.format("json").load("./data/flight-data/json/*-summary.json")\
  .sortWithinPartitions("count")

# Keep only the first rows, without and then with an explicit ordering.
df.limit(5).show()
df.orderBy(desc("count")).limit(6).show()

In [None]:
# Current number of partitions behind `df`.
df.rdd.getNumPartitions() # 1

# Repartition by a target count, by a column, or by both.
# None of these results is assigned to a name.
df.repartition(5)
df.repartition(col("DEST_COUNTRY_NAME"))
df.repartition(5, col("DEST_COUNTRY_NAME"))

# Repartition into 5 by column, then coalesce the result to 2.
(
    df.repartition(5, col("DEST_COUNTRY_NAME"))
    .coalesce(2)
)

In [None]:
# Work on a 10-row slice before bringing rows back to the driver.
collectDF = df.limit(10)

collectDF.take(5)  # take works with an Integer count
collectDF.show()  # this prints it out nicely
collectDF.show(5, truncate=False)
collectDF.collect()
