In [2]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import functions as F

# 1. Create a spark data frame that contains your favorite programming languages.
- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [4]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/19 09:39:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
lang = spark.createDataFrame(
    pd.DataFrame(
        {
            "language": ['python', 'sql', 'c#', 'vba', 'java']
        }
    )
)

In [9]:
# View the schema of the dataframe
lang.schema

StructType([StructField('language', StringType(), True)])

In [11]:
# Output the shape of the dataframe
lang.count(), len(lang.columns)

(5, 1)

In [12]:
# Show the first 5 records in the dataframe
lang.show(5)

+--------+
|language|
+--------+
|  python|
|     sql|
|      c#|
|     vba|
|    java|
+--------+



# 2. Load the mpg dataset as a spark dataframe.
- Create 1 column of output that contains a message like the one below:

 - The 1999 audi a4 has a 4 cylinder engine.
- For each vehicle.
- Transform the trans column so that it only contains either manual or auto.

In [13]:
from pydataset import data

In [14]:
mpg = data('mpg')

In [15]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [16]:
mpg = spark.createDataFrame(mpg)

In [17]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [35]:
# The 1999 audi a4 has a 4 cylinder engine.
mpg.select(F.concat(
    F.lit('The '),
    F.col('year'), 
    F.lit(' '),
    F.col('manufacturer'), 
    F.lit(' '),
    F.col('model'),
    F.lit(' has a '),
    F.col('cyl'),
    F.lit(' cylinder engine')
).alias('message')).show(5, truncate=False)

+----------------------------------------+
|message                                 |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



In [41]:
# Transform the trans column so that it only contains either manual or auto.
mpg = mpg.withColumn('trans', 
    F.regexp_extract('trans', '^([a-z]+?)\(', 1)
)
mpg.show(10)

+------------+----------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual|  4| 20| 28|  p|compact|
+------------+----------+-----+----+---+------+---+---+---+---+-

# 3. Load the tips dataset as a spark dataframe.
- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [42]:
tips = data('tips')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


In [43]:
tips = spark.createDataFrame(tips)

In [52]:
# What percentage of observations are smokers?
(tips.filter(tips.smoker == 'Yes').count() / tips.count()) * 100

38.114754098360656

In [57]:
# Create a column that contains the tip percentage
tips = tips.withColumn('tip_perc',
       F.round((tips.tip / tips.total_bill) * 100,2)
               )
tips.show(5)

+----------+----+------+------+---+------+----+--------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_perc|
+----------+----+------+------+---+------+----+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    5.94|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   16.05|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   16.66|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   13.98|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   14.68|
+----------+----+------+------+---+------+----+--------+
only showing top 5 rows



In [65]:
# Calculate the average tip percentage for each combination of sex and smoker.
tips.groupby(tips.sex, tips.smoker).avg('tip_perc').show(5)

+------+------+------------------+
|   sex|smoker|     avg(tip_perc)|
+------+------+------------------+
|  Male|    No| 16.06659793814433|
|Female|    No| 15.69111111111111|
|  Male|   Yes|15.276666666666667|
|Female|   Yes|18.214545454545455|
+------+------+------------------+



# 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.
- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [83]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [80]:
(12.8 * 9 /5) + 32, (5.0 * 9 /5) + 32

(55.04, 41.0)

### Convert the temperatures to fahrenheit.

In [86]:
# (deg_c * 9 / 5) + 32 = deg_f
weather = weather.withColumn('temp_max',
            (F.round((weather.temp_max * 9 / 5),1) + 32)
                   )
weather = weather.withColumn('temp_min',
            (F.round((weather.temp_min * 9 / 5),1) + 32)
                  )
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    55.0|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|    51.1|    37.0| 4.5|   rain|
|2012-01-03|          0.8|    53.1|    45.0| 2.3|   rain|
|2012-01-04|         20.3|    54.0|    42.1| 4.7|   rain|
|2012-01-05|          1.3|    48.0|    37.0| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [90]:

weather = weather.withColumn('month',
                  F.month(F.col('date'))
                  )
weather.show(3)

+----------+-------------+--------+--------+----+-------+-----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|
+----------+-------------+--------+--------+----+-------+-----+
|2012-01-01|          0.0|    55.0|    41.0| 4.7|drizzle|    1|
|2012-01-02|         10.9|    51.1|    37.0| 4.5|   rain|    1|
|2012-01-03|          0.8|    53.1|    45.0| 2.3|   rain|    1|
+----------+-------------+--------+--------+----+-------+-----+
only showing top 3 rows



### Which month has the most rain, on average?

In [231]:
weather.\
    filter((weather.weather == 'rain') | (weather.weather == 'drizzle')).\
    groupby('month').\
    agg(F.round(
        F.avg('precipitation'), 2).\
        alias('avg_rain')
        ).\
    sort(F.col('avg_rain').desc()).\
    show(3)

+-----+--------+
|month|avg_rain|
+-----+--------+
|   10|    8.06|
|   11|    7.52|
|    1|    5.14|
+-----+--------+
only showing top 3 rows



#### October

## Which year was the windiest?

In [218]:
weather.withColumn('year', F.year(F.col('date'))).groupby('year').\
    avg('wind').sort(F.col('avg(wind)').desc()).show(4)


[Stage 364:>                                                        (0 + 8) / 8]

+----+------------------+
|year|         avg(wind)|
+----+------------------+
|2012| 3.400819672131148|
|2014| 3.387671232876714|
|2015| 3.159726027397261|
|2013|3.0158904109589058|
+----+------------------+



                                                                                

### 2012

## What is the most frequent type of weather in January?

In [140]:
weather.filter(weather.month == 1).groupby('weather').count().\
    sort(F.col('count').desc()).show()

[Stage 185:>                                                        (0 + 8) / 8]

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



                                                                                

### Fog

## What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [168]:
weather.withColumn('year', F.year(F.col('date'))).\
    filter((F.col('year') == 2013) | (F.col('year') == 2014)).\
    filter(F.col('month') == 7).filter(F.col('weather') == 'sun').\
    agg(F.round(F.avg(F.col('temp_min')), 1).alias('avg_min_temp'), 
        F.round(F.avg(F.col('temp_max')), 1).alias('avg_max_temp')).show()

+------------+------------+
|avg_min_temp|avg_max_temp|
+------------+------------+
|        57.5|        80.3|
+------------+------------+



## What percentage of days were rainy in q3 of 2015?

In [201]:
round(((weather.withColumn('quarter', F.quarter(F.col('date'))).\
    withColumn('year', F.year(F.col('date'))).\
    filter(F.col('quarter') == 3).\
    filter(F.col('year') == 2015).\
    filter(F.col('weather') == 'rain').\
    count()
) / 
(weather.withColumn('quarter', F.quarter(F.col('date'))).\
    withColumn('year', F.year(F.col('date'))).\
    filter(F.col('quarter') == 3).\
    filter(F.col('year') == 2015).\
    count()
)) * 100, 2)

2.17

### 2.17%

## For each year, find what percentage of days it rained (had non-zero precipitation).

In [217]:
weather.withColumn('year', F.year(F.col('date'))).\
    filter(F.col('precipitation') != 0).\
    groupby(F.col('year')).\
    agg(
        F.round(F.count('year') / 365 * 100, 2).\
        alias('precip_days_perc')
        ).\
    show()

+----+----------------+
|year|precip_days_perc|
+----+----------------+
|2012|           48.49|
|2013|           41.64|
|2014|            41.1|
|2015|           39.45|
+----+----------------+

