# Spark API Exercises

1. Create a Spark dataframe that contains your favorite programming languages.

     > 1. The name of the column should be language  
     > 2. View the schema of the dataframe  
     > 3. Output the shape of the dataframe  
     > 4. Show the first 5 records in the dataframe  

In [None]:
import pandas as pd

import pyspark
from pyspark.sql.functions import *

# Create the Spark session used by every cell below. getOrCreate() hands back
# the session that is already running, if any, instead of starting a second one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

> A. Create dataframe of languages with 1 column named language

In [None]:
# Start from a named pandas Series so the single column is called "language",
# then hand the resulting one-column pandas frame to Spark.
pd_df = pd.Series(
    ["r", "python", "sql", "julia", "scala", "php", "html", "css", "javascript"],
    name="language",
).to_frame()
df = spark.createDataFrame(pd_df)

> B. View the schema

In [None]:
# Print the dataframe's schema (column names and their data types).
df.printSchema()

> C. The shape of the dataframe

In [None]:
# Spark dataframes have no .shape attribute, so build it by hand:
# rows come from count(), columns from the length of the column list.
n_rows = df.count()
n_cols = len(df.columns)
print("DataFrame shape: ", n_rows, " x ", n_cols)

> D. The first 5 records

In [None]:
# Display the first 5 rows of the language dataframe.
df.show(5)

2. Load the mpg dataset as a spark dataframe.  

    > A. Create 1 column of output that contains a message like the following for each vehicle: *The 1999 audi a4 has a 4 cylinder engine.*  
    > B. Transform the trans column so that it only contains either manual or auto.


In [None]:
import pydataset

# Load the mpg dataset from pydataset and convert it to a Spark dataframe,
# then preview the first 5 rows.
mpg = spark.createDataFrame(data=pydataset.data("mpg"))
mpg.show(n=5)

> A. Create 1 column of output that contains a message like the following for each vehicle: *The 1999 audi a4 has a 4 cylinder engine.*  

In [None]:
# Build the sentence by concatenating literal text with the year,
# manufacturer, model and cyl columns, then keep only that one column.
# truncate=False prints the full sentence instead of cutting it off.
(
    mpg
    .withColumn(
        "vehicle_cylinder_desc",
        concat(
            lit("The "), "year",
            lit(" "), "manufacturer",
            lit(" "), "model",
            lit(" has a "), "cyl",
            lit(" cylinder engine."),
        ),
    )
    .select("vehicle_cylinder_desc")
    .show(truncate=False)
)

> B. Transform the trans column so that it only contains either manual or auto. This can be done in many ways. We will demonstrate it using regexp_extract, regexp_replace, and when. 

In [None]:
# Preview the first 10 rows to see what the raw trans values look like.
mpg.show(10)

In [None]:
# Three ways to reduce trans to its leading word, shown side by side:
#   1. regexp_extract - capture the word characters that precede the "("
#   2. regexp_replace - delete everything from the first "(" to the end
#   3. when + like    - label values starting with "auto" as auto, else manual
mpg.select(
    col("trans"),
    regexp_extract(col("trans"), r"^(\w+)\(", 1).alias("regexp_extract"),
    regexp_replace(col("trans"), r"\(.+$", "").alias("regexp_replace"),
    when(col("trans").like("auto%"), "auto")
    .otherwise("manual")
    .alias("when + like"),
).show()

3. Load the tips dataset as a spark dataframe.

    > A. What percentage of observations are smokers?  
    > B. Create a column that contains the tip percentage  
    > C. Calculate the average tip percentage for each combination of sex and smoker.  

In [None]:
# Load the tips dataset from pydataset into a Spark dataframe and preview it
tips = spark.createDataFrame(data=pydataset.data("tips"))
tips.show(n=5)

> A. What percentage of observations are smokers? 

In [None]:
# Raw number of observations for each smoker value.
tips.groupBy("smoker").count().show()

In [None]:
# Turn each group's count into a share of all rows: divide by the total row
# count, scale to 0-100, round to a whole number and append a "%" sign.
(
    tips
    .groupby("smoker")
    .count()
    .withColumn(
        "percent",
        concat(
            round(col("count") / tips.count() * 100, 0).cast("int"),
            lit("%"),
        ),
    )
    .show()
)

> B. Create a column that contains the tip percentage

In [None]:
# tip_percentage is the tip divided by the total bill. It is left as a ratio
# (not multiplied by 100).
(
    tips
    .withColumn("tip_percentage", col("tip") / col("total_bill"))
    .show()
)

> C. Calculate the average tip percentage for each combination of sex and smoker.

In [None]:
# Average tip ratio for every sex / smoker combination, laid out as a pivot
# table: one row per sex, one column per smoker value, rounded to 4 places.
tips.select(
    "sex",
    "smoker",
    (col("tip") / col("total_bill")).alias("tip_percentage"),
).groupBy("sex").pivot("smoker").agg(
    round(mean("tip_percentage"), 4)
).show()

In [None]:
# Alternate syntax with backslashes: each trailing "\" continues the method
# chain onto the next line, so no surrounding parentheses are needed.
# Computes the mean tip ratio per sex (rows) and smoker value (columns),
# rounded to 4 decimal places.
tips.withColumn("tip_percentage", col('tip') / col('total_bill'))\
    .groupby("sex")\
    .pivot("smoker")\
    .agg(round(mean("tip_percentage"), 4))\
    .show()

4. Use the seattle weather dataset referenced in the lesson to answer the questions below.  

    > A. Convert the temperatures to Fahrenheit.  
    > B. Which month has the most rain, on average?  
    > C. Which year was the windiest?  
    > D. What is the most frequent type of weather in January?  
    > E. What is the average high and low temperature on sunny days in July in 2013 and 2014?  
    > F. What percentage of days were rainy in Q3 of 2015?  
    > G. For each year, find what percentage of days it rained (had non-zero precipitation).  

In [None]:
from vega_datasets import data

# Fetch the Seattle weather data and convert it to a Spark dataframe in one
# step, then preview the first 4 rows.
weather = spark.createDataFrame(data.seattle_weather())
weather.show(n=4)

> A. Convert temperatures from Celsius to Fahrenheit:  (0°C × 9/5) + 32 = 32°F

In [None]:
# pandas equivalent -- df.temp_max = df.temp_max * 9 / 5 + 32
#
# Overwrite both temperature columns in place using F = C * 9 / 5 + 32.
# NOTE: this reassigns `weather`, so running the cell a second time applies
# the conversion again to values that were already converted.
weather = (
    weather
    .withColumn("temp_max", col("temp_max") * 9 / 5 + 32)
    .withColumn("temp_min", col("temp_min") * 9 / 5 + 32)
)

In [None]:
# Preview the first 4 rows to check the converted temperature columns.
weather.show(4)

> B. Which month has the most rain, on average? 

In [None]:
# Two-stage aggregation:
#   1. total precipitation for every (month, year) pair
#   2. average of those monthly totals across years, per calendar month
# The month with the largest average comes first after sorting.
row = (
    weather
    .groupBy(month("date").alias("month"), year("date").alias("year"))
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .orderBy(desc("avg_monthly_rain"))
    .first()
)
row

> C. Which year is the windiest? 

In [None]:
# Rank years by their *average* daily wind rather than the raw total.
# A plain sum favours years that simply have more observations (leap years,
# or a year with missing days), so it can crown the wrong year. The total is
# still reported alongside the average for reference.
(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(
        sum("wind").alias("total_winds"),
        mean("wind").alias("avg_wind"),
    )
    .sort(col("avg_wind").desc())
    .head(5)
)

> D. What is the most frequent type of weather in January? 

In [None]:
# Keep only January rows, count how often each weather type occurs, and list
# the types from most to least frequent.
(
    weather
    .where(month("date") == 1)
    .groupBy("weather")
    .count()
    .orderBy(desc("count"))
    .show()
)

> E. What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [None]:
# Restrict to sunny July days in 2013 and 2014, then average the daily high
# and low temperatures across both years combined.
(
    weather
    .where(
        (month("date") == 7)
        & year("date").isin(2013, 2014)
        & (col("weather") == "sun")
    )
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show()
)

> F. What percentage of days were rainy in Q3 of 2015?

In [None]:
# in pandas -- (df.weather == "rain").mean()
# A day counts as rainy when its weather label is "rain". Encoding that as
# 1/0 and taking the mean gives the rainy share as a fraction between 0 and 1.
(
    weather
    .where((year("date") == 2015) & (quarter("date") == 3))
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

> G.  For each year, find what percentage of days it rained (had non-zero precipitation).

In [None]:
# Here a day counts as rainy when precipitation > 0. Flag each day as 1/0 and
# average the flag per year; the result is a fraction between 0 and 1.
(
    weather
    .select(
        when(col("precipitation") > 0, 1).otherwise(0).alias("did_rain"),
        year("date").alias("year"),
    )
    .groupBy("year")
    .agg(mean("did_rain"))
    .show()
)