# Lab 8: Exercises for Spark Streaming

In [None]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext 
import pyspark.sql.functions as F
from pyspark import SparkConf

# Build (or reuse) the Spark entry points used by every cell below.
conf = SparkConf().setAppName("lab8_exercise")
# SparkSession.builder...getOrCreate() is the documented way to obtain a
# session: it returns the existing session when there is one, so re-running
# this cell does not construct a fresh SparkSession object each time.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for code that needs the underlying SparkContext.
sc = spark.sparkContext

### Q1

Data is in `data/flights/departuredelays.csv`

(1) Calculate the average and the approximate median of the `delay` column.

(2) Group the data by the `origin` and `destination` columns, and filter out the groups whose average delay is less than or equal to zero.

(3) Write the resulting three columns (`origin`, `destination`, and the average delay) to disk in Parquet format.

In [None]:
## Q1 (1): load the flight-delay data and summarise the `delay` column.

# Read the CSV with a header row and let Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/shareddata/data/flights/departuredelays.csv")
)

df.show(5)
df.printSchema()

# Compute both statistics in one native aggregation instead of converting the
# whole DataFrame to the pandas-on-Spark API: a single action yields both
# values, and no pandas/pyarrow installation is needed on the driver.
# percentile_approx(col, 0.5) is Spark's approximate median.
delay_stats = df.agg(
    F.mean("delay").alias("avg_delay"),
    F.percentile_approx("delay", 0.5).alias("median_delay"),
).first()

print(f"The average value of delay column is {delay_stats['avg_delay']}")
print(f"The approximate median value of delay column is {delay_stats['median_delay']}")

In [None]:
# Q1 (2): mean delay for every (origin, destination) pair.
route_delay = df.groupBy("origin", "destination").agg(
    F.mean("delay").alias("avg_delay")
)
# Keep only the pairs whose mean delay is positive, largest mean first.
df1 = route_delay.where(F.col("avg_delay") > 0).orderBy(F.col("avg_delay").desc())
df1.show(5)

In [None]:
# Q1 (3): save the per-route averages as Parquet, overwriting earlier output.
## change the save path to your local path
output_path = "/shareddata/data/flights/avg_delay.parquet"
df1.write.mode("overwrite").parquet(output_path)

### Q2

The dataframe is already created. 

(1) Group the data by the `state` column and compute the sum of the `salary` column, naming it `sum_salary`. (Use two different methods.)

(2) Keep only the rows whose `sum_salary` is larger than 100000, and sort the result by `sum_salary` in descending order. (Use both the DataFrame API and SQL.)


In [None]:
# Q2 input: one tuple per employee, fields in the order declared in `schema`.
schema = "employee_name STRING, department STRING, state STRING, salary INT, age INT, bonus INT"

simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NV", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "DE", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "NV", 80000, 25, 18000),
    ("Kumar", "Marketing", "NJ", 91000, 50, 21000),
]

# Build the DataFrame, then print its schema and its first five rows.
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(5)


In [None]:
## Part 1: total salary per state, computed in two ways.

salary_by_state = df.groupBy("state")

# Method 1: sum() on the grouped data, then rename the generated
# "sum(salary)" column to "sum_salary".
dfGroup = salary_by_state.sum("salary")
dfGroup = dfGroup.withColumnRenamed("sum(salary)", "sum_salary")
dfGroup.show()

# Method 2: agg() with an explicitly aliased aggregate expression.
dfGroup = salary_by_state.agg(F.sum("salary").alias("sum_salary"))
dfGroup.show()


In [None]:
## Part 2: states whose total salary exceeds 100000, largest total first.

from pyspark.sql.functions import desc

# DataFrame API, step by step, starting from the Part 1 result.
dfFilter = dfGroup.where(F.col("sum_salary") > 100000)
dfFilter.show()
dfFilter.orderBy(desc("sum_salary")).show()

# DataFrame API, as one pipeline starting from the employee data.
state_totals = df.groupBy("state").agg(F.sum("salary").alias("sum_salary"))
(
    state_totals
    .where(state_totals.sum_salary > 100000)
    .orderBy(desc("sum_salary"))
    .show()
)

# SQL: the same query against a temporary view of the employee data.
df.createOrReplaceTempView("EMP")
query = (
    "select state, sum(salary) as sum_salary from EMP "
    "group by state having sum_salary > 100000 "
    "order by sum_salary desc"
)
spark.sql(query).show()


### Q3

Join the two dataframes `emp` and `dept` with an inner join, using both the DataFrame API and SQL.

In [None]:

# Q3 input: employees. Note that emp_dept_id is given as a string here,
# while dept_id in the department data below is an integer.
empColumns = [
    "emp_id", "name", "superior_emp_id", "year_joined",
    "emp_dept_id", "gender", "salary",
]
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]

empDF = spark.createDataFrame(emp, empColumns)
empDF.printSchema()
empDF.show(truncate=False)


# Q3 input: departments.
deptColumns = ["dept_name", "dept_id"]
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

deptDF = spark.createDataFrame(dept, deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)


In [None]:
## Use dataframe API

# An employee row matches a department row when emp_dept_id equals dept_id.
same_dept = empDF.emp_dept_id == deptDF.dept_id

# The inner join comes first; the remaining join types are run afterwards
# with the same condition so their results can be compared.
join_types = (
    "inner",
    "outer", "full", "fullouter",
    "left", "leftouter",
    "right", "rightouter",
    "leftsemi",
    "leftanti",
)
for how in join_types:
    empDF.join(deptDF, same_dept, how).show(truncate=False)
   


In [None]:
# Use SQL
# Register both DataFrames as temporary views so SQL can refer to them by name.
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")

# Assign the DataFrame first and display it in a separate statement:
# show() returns None, so chaining it onto the assignment would leave
# joinDF / joinDF2 bound to None instead of the join result.

# Inner join written as an implicit join with a WHERE condition.
joinDF = spark.sql("select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id")
joinDF.show(truncate=False)

# The same inner join written with explicit INNER JOIN ... ON syntax.
joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id")
joinDF2.show(truncate=False)

# END


Thank you 