## This notebook covers the following concepts
- Basic structured (DataFrame) operations

In [1]:
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.types import StructField, StructType, StringType, LongType
from pyspark.sql.functions import *
from pyspark.sql import Row

In [2]:
# Obtain the session for this notebook: getOrCreate() hands back an already
# running SparkSession if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("spark_definitive_guide")
    .getOrCreate()
)

22/10/07 22:22:08 WARN Utils: Your hostname, AS-MAC-0006.local resolves to a loopback address: 127.0.0.1; using fd01:db8:1111:0:0:0:0:3 instead (on interface lo0)
22/10/07 22:22:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/07 22:22:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Explicit schema for the flight-summary JSON: two string columns and a long
# counter that also carries a small metadata dictionary.
# NOTE(review): `count` is declared non-nullable here, but the schema shown
# after loading the file reports it as nullable -- confirm whether the JSON
# reader is expected to honor this flag.
manualSchema = (
    StructType()
    .add("DEST_COUNTRY_NAME", StringType(), nullable=True)
    .add("ORIGIN_COUNTRY_NAME", StringType(), nullable=True)
    .add("count", LongType(), nullable=False, metadata={"hello": "world"})
)

In [5]:
# Read the 2015 flight summary with the hand-written schema instead of
# letting Spark infer one. The path is relative to the notebook's directory.
path = "./../src/flight-data/json/2015-summary.json"
df = spark.read.schema(manualSchema).json(path)

In [7]:
# Inspect the structure of the DataFrame: its schema and its column names.
# A notebook cell only echoes its *last* expression, so the schema is printed
# explicitly; otherwise it would be evaluated and silently discarded.
print(df.schema)
df.columns



StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

### Difference between `select` and `selectExpr`
- `select(*cols: ColumnOrName) -> DataFrame`
- `selectExpr(*expr: str) -> DataFrame`

In [34]:
# select vs. selectExpr
# The statements below are API examples; each one builds a new DataFrame and
# leaves `df` untouched. Only the last expression of the cell is echoed.

# select takes column names or Column objects
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME", "count")
df.select(expr("DEST_COUNTRY_NAME as dcount"))

# selectExpr takes SQL expression strings; several can be passed at once
df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as sameCountry",
)
# aggregate functions are resolved inside the SQL string, so count/avg do not
# have to be imported from pyspark.sql.functions for this call
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))")


# Column names containing spaces
dfWithLongColName = df.withColumn(
    "This colName have spaces",
    expr("count"),
)
# select expects a plain column name here, so no backticks are needed
dfWithLongColName.select("This colName have spaces")
# inside an expression the backticks separate the column name from the rest
dfWithLongColName.selectExpr("`This colName have spaces` * 10")

# Renaming: every column is kept, only the named one changes
df.withColumnRenamed("DEST_COUNTRY_NAME", "new_name_dest_country")
(
    df
    .withColumnRenamed("DEST_COUNTRY_NAME", "dest_new_name")
    .withColumnRenamed("ORIGIN_COUNTRY_NAME", "orig_new_name")
)

DataFrame[(This colName have spaces * 10): bigint]

### Difference between filter and where
- `filter(condition: ColumnOrName) -> DataFrame`
- `where(condition: ColumnOrName) -> DataFrame`

Both are exactly the same; each is an alias of the other. `where` exists for people coming from SQL who prefer the familiar name.

In [49]:
# The bare expressions below are API examples; apart from the final show(),
# their results are not displayed.

# filter / where accept either a Column condition or a SQL string
df.filter(col("count") < 5)
df.where("count < 5")
df.where("count < 5").where("ORIGIN_COUNTRY_NAME == 'Croatia'") # using multiple conditions, one applied after the other

df.drop('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME') # returns another df, does not remove the columns in place

# distinct finds distinct rows based on the combination of all the selected columns
df.select('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').distinct().count() # 256
df.select('DEST_COUNTRY_NAME').distinct().count() # 132
df.distinct().count() # 256

# sort and orderBy: both are the same, orderBy is an alias of sort for SQL people
# Pitfall: expr("count desc") is parsed as the column `count` aliased to the
# name "desc", so it sorts ASCENDING. Use Column.desc() (or desc("count"))
# to actually get a descending sort.
df.sort(col("count").desc())
df.sort(col("count").asc(), col("ORIGIN_COUNTRY_NAME").desc())
df.orderBy(desc_nulls_last("ORIGIN_COUNTRY_NAME"), asc_nulls_first("DEST_COUNTRY_NAME"))
# It is better to use asc_nulls_first, asc_nulls_last, desc_nulls_first, desc_nulls_last
# to state explicitly where null values should be placed

df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+



In [16]:
# Appending rows: DataFrames cannot be modified in place, so the new rows are
# turned into a second DataFrame that reuses df's schema and the two are
# combined with union().
schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 2", 1),
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDf = spark.createDataFrame(data=parallelizedRows, schema=schema)

df2 = df.union(newDf)

+-----------------+--------------------+-----+
|DEST_COUNTRY_NAME| ORIGIN_COUNTRY_NAME|count|
+-----------------+--------------------+-----+
|    United States|              Angola|   13|
|    United States|            Anguilla|   38|
|    United States| Antigua and Barbuda|  117|
|    United States|           Argentina|  141|
|    United States|               Aruba|  342|
|    United States|           Australia|  258|
|    United States|             Austria|   63|
|    United States|          Azerbaijan|   21|
|    United States|             Bahrain|    1|
|    United States|            Barbados|  130|
|    United States|             Belgium|  228|
|    United States|              Belize|  193|
|    United States|             Bermuda|  193|
|    United States|             Bolivia|   13|
|    United States|Bonaire, Sint Eus...|   59|
|    United States|              Brazil|  619|
|    United States|British Virgin Is...|   80|
|    United States|            Bulgaria|    1|
|    United S