In [1]:
import pandas as pd
import numpy as np
import pyspark
from pydataset import data
from pyspark.sql.functions import *
from pyspark.sql.types import *


## 1. Create a spark data frame that contains your favorite programming languages.
> - The name of the column should be `language`
> - View the schema of the dataframe
> - Output the shape of the dataframe
> - Show the first 5 records in the dataframe

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Create the dataframe
df = pd.DataFrame([{'language': 'Python'},
                   {'language': 'Java'},
                   {'language': 'JavaScript'},
                   {'language': 'R'},
                   {'language': 'HTML'}])

# Convert dataframe to spark 
df = spark.createDataFrame(df)

# View the schema of the dataframe
df.printSchema()

# Output the shape of the dataframe
print(len(df.columns), df.count())

# Show the first 5 records
df.show()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/08 10:48:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- language: string (nullable = true)



                                                                                

1 5
+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|         R|
|      HTML|
+----------+



## 2. Load the mpg dataset as a spark dataframe.

> a. Create 1 column of output that contains a message like the one below:
    * ```The 1999 audi a4 has a 4 cylinder engine.```
    
    For each vehicle

In [3]:
# Load the mpg dataset as a sparkdataframe
mpg = spark.createDataFrame(data("mpg"))
# Examine the dataframe
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [4]:
# Build the message by concat the year, man, model, and cyl and assign alias
mpg.select(
    concat(
        lit("The "),  # The literal string
        col("year"),
        lit(" "),
        col("manufacturer"),
        lit(" "),
        col("model"),
        lit(" has a "),
        col("cyl"),
        lit(" cylinder engine."),
    ).alias("vehicle_cylinder_desc")
).show(truncate=False)

+--------------------------------------------------------------+
|vehicle_cylinder_desc                                         |
+--------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 2008 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 6 cylinder engine.             |
|The 1999 audi a4 quattro

> b. Transform the trans column so that it only contains either manual or auto.

In [5]:
df1 = mpg.withColumn('transmission', split(mpg['trans'], "\(").getItem(0))
df1.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|transmission|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|        auto|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|      manual|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|      manual|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|        auto|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|        auto|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+------------+
only showing top 5 rows



## 3. Load the tips dataset as a spark dataframe.
>    a. What percentage of observations are smokers?

>    b. Create a column that contains the tip percentage

>    c. Calculate the average tip percentage for each combination of sex and smoker.

In [6]:
# Load tips data
tips = spark.createDataFrame(data('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### a. What percentage of observations are smokers?

In [7]:
# group smokers together than concat the percentage to the integer
tips.groupby('smoker').count().withColumn(
    'percent',
    concat(round((col('count')/tips.count() * 100), 0).cast('int'), lit('%')),
          )
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### b. Create a column that contains the tip percentage

In [8]:
# Calculate the tip over the total bill to get percentage
df = tips.withColumn('tip_percent',
                concat(round(col('tip') / col('total_bill') * 100, 2)))
df.show()

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percent|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|       5.94|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|      16.05|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|      16.66|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|      13.98|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|      14.68|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|      18.62|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|      22.81|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|      11.61|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|      13.03|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|      21.85|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|      16.65|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|      14.18|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|      10.18|
|     18.43| 3.0|  Male|    No|Sun|Dinne

### c. Calculate the average tip percentage for each combination of sex and smoker.

In [9]:
df.groupby('smoker').pivot('sex').agg({'tip_percent': 'mean'}).show()

+------+------------------+------------------+
|smoker|            Female|              Male|
+------+------------------+------------------+
|    No| 15.69111111111111| 16.06659793814433|
|   Yes|18.214545454545455|15.276666666666667|
+------+------------------+------------------+



##  Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [10]:
from vega_datasets import data
# Import and create the spark dataframe
weather = spark.createDataFrame(data.seattle_weather())
weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



### A. Convert the temperatures to farenheight.


In [16]:
weather = weather.withColumn('temp_max', expr('round(temp_max * (9/5) + 32, 2)')).withColumn('temp_min', expr('round(temp_min * (9/5) + 32, 2)'))

weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



### B. Which month has the most rain, on average?

In [20]:

row = (
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("month", "year")
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first()
)
print(row.month)
print(row.avg_monthly_rain)

11
160.625


### C. Which year was the windiest?

In [30]:
max_wind = weather.withColumn(
    "year", year("date")).groupBy("year").agg(sum("wind").alias("total_winds")).sort(col("total_winds").desc()).collect()[0]
    
max_wind

Row(year=2012, total_winds=1244.7)

### D. What is the most frequent type of weather in January?

In [32]:
weather_type = (weather.withColumn("month", month("date"))
    .filter(col("month") == 1) # This is the month of jav
    .groupBy("weather") # Group by the type of weather
    .count() # Count the frequency
    .sort(col("count").desc()) # Sort the frequencies
    .collect()[0]) # this selects the first row
    
weather_type

Row(weather='fog', count=38)

### E. What is the average high and low tempurature on sunny days in July in 2013 and 2014?

In [35]:
avg_h_l = (weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show())

+-----------------+-----------------+
|average_high_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



### F. What percentage of days were rainy in q3 of 2015?

In [37]:
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



### G. For each year, find what percentage of days it rained (had non-zero precipitation).

In [38]:
(
    weather.withColumn("year", year("date"))
    .select(when(col("precipitation") > 0, 1).otherwise(0).alias("rain"), "year")
    .groupby("year")
    .agg(mean("rain"))
    .show()
)

+----+-------------------+
|year|          avg(rain)|
+----+-------------------+
|2012|0.48360655737704916|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2015|0.39452054794520547|
+----+-------------------+

