In [17]:
import pyspark

import numpy as np
import pandas as pd
import pydataset

from pyspark.sql.functions import *

In [2]:
## make my spark session

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
spark

#### Create a spark data frame that contains your favorite programming languages.

* The name of the column should be language
* View the schema of the dataframe
* Output the shape of the dataframe
* Show the first 5 records in the dataframe

In [4]:
df = pd.DataFrame({'language': ['c++', 'c-sharp', 'python', 'go', 
      'java', 'javascript']}) ## making my pandas df

In [5]:
df.head() ## quality assurance check

Unnamed: 0,language
0,c++
1,c-sharp
2,python
3,go
4,java


In [6]:
df = spark.createDataFrame(df) ## make my spark dataframe

In [7]:
df.printSchema ## looking at the schema

<bound method DataFrame.printSchema of DataFrame[language: string]>

In [8]:
## looking at the shape

print(f' The amount of rows: {(df.count())}, \
The amount of columns: {len(df.columns)}')

 The amount of rows: 6, The amount of columns: 1


In [9]:
df.show(5) ## looking at the first five results

+--------+
|language|
+--------+
|     c++|
| c-sharp|
|  python|
|      go|
|    java|
+--------+
only showing top 5 rows



#### Load the mpg dataset as a spark dataframe.

 - Create 1 column of output that contains a message like the one below:

     - The 1999 audi a4 has a 4 cylinder engine.
     - For each vehicle.
     - Transform the trans column so that it only contains either manual or auto.

In [10]:
## loading the mpg pydataset into spark

mpg = pydataset.data('mpg')
df = spark.createDataFrame(mpg)
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [11]:
summary = concat(lit('The '), df.year, lit(' '), df.manufacturer, 
                 lit(' '), df.model, lit(' has a '), df.cyl,
                lit(' cylinder engine'))

df = df.select('*', summary.alias('summary'))

df.show() ## looking at question 2a

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+--------------------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|             summary|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+--------------------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|The 2008 audi a4 ...|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|The 2008 audi a4 ...|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  3.1|2008|  6|  auto(

In [12]:
df.select('summary').show(truncate = False) ## looking at our summary column

+-------------------------------------------------------------+
|summary                                                      |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 2008 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylind

In [13]:
df.select('trans').show(5) ## looking at our transmission column

+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
|  auto(av)|
|  auto(l5)|
+----------+
only showing top 5 rows



In [14]:
df.trans = df.select(df.trans, 
                     when((df.trans.contains('auto')), 'auto')
                     .otherwise('manual'))

In [15]:
df.select('trans').show()

+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
|  auto(av)|
|  auto(l5)|
|manual(m5)|
|  auto(av)|
|manual(m5)|
|  auto(l5)|
|manual(m6)|
|  auto(s6)|
|  auto(l5)|
|manual(m5)|
|  auto(s6)|
|manual(m6)|
|  auto(l5)|
|  auto(s6)|
|  auto(s6)|
|  auto(l4)|
|  auto(l4)|
+----------+
only showing top 20 rows



#### Load the tips dataset as a spark dataframe.

* What percentage of observations are smokers?
* Create a column that contains the tip percentage
* Calculate the average tip percentage for each combination of sex and smoker.

In [18]:
tips = pydataset.data('tips')
tips = spark.createDataFrame(tips)
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [None]:
## percentage of smokers

tips.where(tips.smoker == "Yes").count() / tips.select('smoker').count()

In [None]:
## making tip percentage column

col = tips.tip / tips.total_bill
col

In [None]:
## adding to dataframe 

tips = tips.select('*', col.alias('tip_pct'))

tips.select('tip_pct').show()

In [None]:
tips.show(5) ## checking it out

In [19]:
## recreating dataframe

tips = pydataset.data('tips')
tips = spark.createDataFrame(tips)
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [20]:
tips.groupBy("smoker").count().show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [21]:
tips.groupBy("smoker").count().withColumn(
    "percent",
    concat(round((col("count") / tips.count() * 100), 0).cast("int"), lit("%")),
).show()

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|    62%|
|   Yes|   93|    38%|
+------+-----+-------+



In [23]:
tips.withColumn("tip_percentage", col('tip') / col('total_bill')).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

In [24]:
(
    tips.withColumn("tip_percentage", col('tip') / col('total_bill'))
    .groupby("sex")
    .pivot("smoker")
    .agg(round(mean("tip_percentage"), 4))
    .show()
)

+------+------+------+
|   sex|    No|   Yes|
+------+------+------+
|Female|0.1569|0.1822|
|  Male|0.1607|0.1528|
+------+------+------+



#### Use the seattle weather dataset referenced in the lesson to answer the questions below.

* Convert the temperatures to fahrenheit.
* Which month has the most rain, on average?
* Which year was the windiest?
* What is the most frequent type of weather in January?
* What is the average high and low temperature on sunny days in July in 2013 and 2014?
* What percentage of days were rainy in q3 of 2015?
* For each year, find what percentage of days it rained (had non-zero precipitation).

In [25]:
from vega_datasets import data

weather = data.seattle_weather()
weather = spark.createDataFrame(weather)
weather.show(4)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 4 rows



making the temps $F^o$

In [26]:
weather = weather.withColumn(
    "temp_max", (col("temp_max") * 9 / 5 + 32)
).withColumn("temp_min", (col("temp_min") * 9 / 5 + 32))

In [27]:
weather.show(3) ## quality assurance check

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 3 rows



In [28]:
## looking at which month has the most rain on average

row = (
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("month", "year")
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first()
)
row

Row(month=11, avg_monthly_rain=160.625)

In [30]:
## looking at which year is the windiest


(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_winds"))
    .sort(col("total_winds").desc())
    .head(5)
)

[Row(year=2012, total_winds=1244.7),
 Row(year=2014, total_winds=1236.5000000000007),
 Row(year=2015, total_winds=1153.3000000000002),
 Row(year=2013, total_winds=1100.8000000000006)]

In [31]:
## looking at the most frequent weather type in January

(
    weather.withColumn("month", month("date"))
    .filter(col("month") == 1)
    .groupBy("weather")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [32]:
## looking at the average high and low temps on a sunny 
## days in July 2013 and 2014


(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show()
)

+-----------------+-----------------+
|average_high_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



In [33]:
## looking at the percentage of rainy days in the 3rd quarter of 
## 2015

(    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



In [None]:
## looking at days in each year that had anything > 0 precipitation


# measure a rainy day by precipitation > 0
(
    weather.withColumn("year", year("date"))
    .select(when(col("precipitation") > 0, 1).otherwise(0).alias("rain"), "year")
    .groupby("year")
    .agg(mean("rain"))
    .show()
)