In [12]:
import pydataset
from vega_datasets import data
import pyspark
import pandas as pd

from pyspark.sql.functions import *

In [10]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

1. Create a spark data frame that contains your favorite programming languages.

    - The name of the column should be `language`
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe

In [13]:
df = spark.createDataFrame(pd.DataFrame({
    'language': ['smalltalk', 'fortran', 'english', 'visual basic', 'coffeescript', 'cobol', 'excel???', 'power point']
}))

In [15]:
df.printSchema()

root
 |-- language: string (nullable = true)



In [16]:
print('dataframe shape: ', df.count(), 'x', len(df.columns))

dataframe shape:  8 x 1


In [17]:
df.show(5)

+------------+
|    language|
+------------+
|   smalltalk|
|     fortran|
|     english|
|visual basic|
|coffeescript|
+------------+
only showing top 5 rows



2. Load the `mpg` dataset as a spark dataframe.

    1. Create 1 column of output that contains a message like the one below:

            The 1999 audi a4 has a 4 cylinder engine.

        For each vehicle.

    1. Transform the `trans` column so that it only contains either `manual` or `auto`.

In [21]:
mpg = spark.createDataFrame(pydataset.data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [24]:
mpg.select(concat(
    lit('The '),
    col('year'),
    lit(' '),
    col('manufacturer'),
    lit(' '),
    col('model'),
    lit(' has a '),
    col('cyl'),
    lit(' cylinder engine'),
).alias('vehicle_cylinder_description')).show(truncate=False)

+-------------------------------------------------------------+
|vehicle_cylinder_description                                 |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 2008 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylind

In [29]:
mpg.select(
    regexp_extract('trans', r'^(\w+)\(', 1).alias('trans'),
    when(mpg.trans.like('auto%'), 'auto').otherwise('manual'),
).show()

+------+----------------------------------------------------+
| trans|CASE WHEN trans LIKE auto% THEN auto ELSE manual END|
+------+----------------------------------------------------+
|  auto|                                                auto|
|manual|                                              manual|
|manual|                                              manual|
|  auto|                                                auto|
|  auto|                                                auto|
|manual|                                              manual|
|  auto|                                                auto|
|manual|                                              manual|
|  auto|                                                auto|
|manual|                                              manual|
|  auto|                                                auto|
|  auto|                                                auto|
|manual|                                              manual|
|  auto|

3. Load the `tips` dataset as a spark dataframe.

    1. What percentage of observations are smokers?
    1. Create a column that contains the tip percentage
    1. Calculate the average tip percentage for each combination of sex and smoker.

In [31]:
tips = spark.createDataFrame(pydataset.data('tips'))
tips.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [33]:
total_n = tips.count()

tips.groupBy('smoker').count().select('*', (col('count') / total_n).alias('percent')).show()

+------+-----+-------------------+
|smoker|count|            percent|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



In [43]:
tips.select(
    when(col('smoker') == 'Yes', 1).otherwise(0).alias('is_smoker')
).agg(mean('is_smoker')).show()

+-------------------+
|     avg(is_smoker)|
+-------------------+
|0.38114754098360654|
+-------------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    - Convert the temperatures to farenheight.
    - Which month has the most rain, on average?
    - Which year was the windiest?
    - What is the most frequent type of weather in January?
    - What is the average high and low tempurature on sunny days in July in 2013 and 2014?
    - What percentage of days were rainy in q3 of 2015?
    - For each year, find what percentage of days it rained (had non-zero precipitation).

In [44]:
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(4)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 4 rows



In [48]:
weather = (weather
 .withColumn('temp_max', col('temp_max') * 9 / 5 + 32)
 .withColumn('temp_min', col('temp_min') * 9 / 5 + 32))

In [52]:
(weather
 .withColumn('month', month('date'))
 .groupBy('month')
 .agg(mean('precipitation').alias('avg_monthly_rain'))
 .sort('month')
 .show())

+-----+-------------------+
|month|   avg_monthly_rain|
+-----+-------------------+
|    1| 3.7580645161290316|
|    2|  3.734513274336283|
|    3|  4.888709677419355|
|    4|  3.128333333333333|
|    5| 1.6733870967741935|
|    6| 1.1075000000000002|
|    7|0.38870967741935486|
|    8| 1.3201612903225806|
|    9| 1.9624999999999997|
|   10|  4.059677419354839|
|   11|  5.354166666666667|
|   12|  5.021774193548389|
+-----+-------------------+



In [53]:
(weather
 .withColumn('year', year('date'))
 .groupBy('year')
 .agg(sum('wind'))
 .show())

+----+------------------+
|year|         sum(wind)|
+----+------------------+
|2015|1153.3000000000002|
|2013|1100.8000000000006|
|2014|1236.5000000000007|
|2012|            1244.7|
+----+------------------+



In [54]:
# What is the most frequent type of weather in January?
(weather
 .withColumn('month', month('date'))
 .filter(expr("month == 1"))
 .groupBy('weather')
 .count()
 .show())

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|drizzle|   10|
|   rain|   35|
|    sun|   33|
|   snow|    8|
+-------+-----+



In [59]:
# What percentage of days were rainy in q3 of 2015?
(weather
 .withColumn('quarter', quarter('date'))
 .withColumn('year', year('date'))
 .filter(expr("year = 2015"))
 .filter(expr("quarter = 3"))
 .select(when(col('precipitation') > 0, 1).otherwise(0).alias('was_rainy'))
 .agg(mean('was_rainy'))
 .show())

+-------------------+
|     avg(was_rainy)|
+-------------------+
|0.18478260869565216|
+-------------------+

