In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [39]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [4]:
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [5]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [6]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [7]:
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [8]:
mpg.hwy

Column<b'hwy'>

In [9]:
mpg.select(mpg.hwy, mpg.cty, mpg.model)

DataFrame[hwy: bigint, cty: bigint, model: string]

In [10]:
mpg.select(mpg.hwy, mpg.cty, mpg.model).show(10)

+---+---+----------+
|hwy|cty|     model|
+---+---+----------+
| 29| 18|        a4|
| 29| 21|        a4|
| 31| 20|        a4|
| 30| 21|        a4|
| 26| 16|        a4|
| 26| 18|        a4|
| 27| 18|        a4|
| 26| 18|a4 quattro|
| 25| 16|a4 quattro|
| 28| 20|a4 quattro|
+---+---+----------+
only showing top 10 rows



In [11]:
mpg.hwy + 1

Column<b'(hwy + 1)'>

In [12]:
mpg.select(mpg.hwy, mpg.hwy + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



In [13]:
mpg.select(mpg.hwy.alias("highway_mileage")).show(5)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



# Exercises

### 1. Create a spark data frame that contains your favorite programming languages.
- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [14]:
import pandas as pd
import numpy as np
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [15]:
# Making the data frame
data = [['python'],['SQL'],['Spark'],['HTML']]

df = pd.DataFrame(data, columns = ['language'])

df

Unnamed: 0,language
0,python
1,SQL
2,Spark
3,HTML


In [16]:
# converting the dataframe to Spark
sp_df = spark.createDataFrame(df)
sp_df

DataFrame[language: string]

In [36]:
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [37]:
print(sp_df.count(), " x ", len(sp_df.columns))

4  x  1


In [17]:
sp_df.show()

+--------+
|language|
+--------+
|  python|
|     SQL|
|   Spark|
|    HTML|
+--------+



### 2. Load the mpg dataset as a spark dataframe.

- A. Create 1 column of output that contains a message like the one below:

The 1999 audi a4 has a 4 cylinder engine.

In [18]:
from pydataset import data

mpg_pd = data("mpg")
mpg_pd.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [19]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import round, concat, sum, min, max, count, avg, mean, when
from pyspark.sql.functions import lit

In [21]:
mpg.select(concat(mpg.cyl, lit(" cylinders")).alias("cylinders")).show(3)
mpg.select

+-----------+
|  cylinders|
+-----------+
|4 cylinders|
|4 cylinders|
|4 cylinders|
+-----------+
only showing top 3 rows



<bound method DataFrame.select of DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]>

In [22]:
# The 1999 audi a4 has a 4 cylinder engine.
#mpg.select(concat(mpg.cyl, lit(" cylinders")).alias("cylinders")).show(3)

mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, lit(" "),
                  mpg.model, lit(" has a "), mpg.cyl,
                  lit(" engine")).alias("cylinders")).show()

+--------------------+
|           cylinders|
+--------------------+
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a6 ...|
|The 2008 audi a6 ...|
|The 2008 audi a6 ...|
|The 2008 chevrole...|
|The 2008 chevrole...|
+--------------------+
only showing top 20 rows



### B. Transform the trans column so that it only contains either manual or auto.

In [23]:
mpg.select(mpg.trans, when(mpg.trans.contains("auto"), "auto").otherwise("manual").alias("Trans")).show(5)

+----------+------+
|     trans| Trans|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
+----------+------+
only showing top 5 rows



### 3. Load the tips dataset as a spark dataframe.

In [24]:
tips_pd = data("tips")
tips_pd.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
tips = spark.createDataFrame(data("tips"))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### A. What percentage of observations are smokers?

In [26]:
tips.filter(tips.smoker == 'Yes').count() / tips.count() * 100

38.114754098360656

### B. Create a column that contains the tip percentage

In [27]:
tips = tips.select(tips.total_bill,
                   tips.tip,
                   tips.sex,
                   tips.smoker,
                   tips.day,
                   tips.time,
                   tips.size,(tips.total_bill / tips.tip).alias('tip_percent'))
tips.show(5)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|       tip_percent|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 16.82178217821782|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 6.228915662650603|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 6.002857142857144|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|7.1540785498489425|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 6.811634349030471|
+----------+----+------+------+---+------+----+------------------+
only showing top 5 rows



### C. Calculate the average tip percentage for each combination of sex and smoker.

In [28]:
tips.groupBy(tips.smoker, tips.sex).agg(round(avg(tips.tip/tips.total_bill * 100), 2))

DataFrame[smoker: string, sex: string, round(avg(((tip / total_bill) * 100)), 2): double]

In [29]:
tips.groupBy(tips.smoker, tips.sex).agg(round(avg(tips.tip/tips.total_bill * 100), 2)).show()

+------+------+-----------------------------------------+
|smoker|   sex|round(avg(((tip / total_bill) * 100)), 2)|
+------+------+-----------------------------------------+
|    No|Female|                                    15.69|
|    No|  Male|                                    16.07|
|   Yes|  Male|                                    15.28|
|   Yes|Female|                                    18.22|
+------+------+-----------------------------------------+



### 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to farenheight.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [30]:
from vega_datasets import data
weather_pd = data("seattle_weather")
weather_pd.head(5)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [31]:
#weather_pd = weather_pd.assign(date=lambda df: df.date.astype(str))

In [32]:
weather = spark.createDataFrame(weather_pd)
weather.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|     6.1|    -1.7| 1.9|    

### Convert the temperatures to farenheight.

In [33]:
weather = weather.withColumn("temp_max", round(weather.temp_max*9/5 + 32, 2))
weather.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|   39.92|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|   44.96|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    50.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|   48.92|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|   42.98|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|   42.98|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|   42.98|    -1.7| 1.9|    

In [34]:
weather = weather.withColumn("temp_min", round(weather.temp_min*9/5 + 32, 2))
weather.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|   39.92|   35.96| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|   44.96|   37.04| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    50.0|   37.04| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|   48.92|    41.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|   42.98|   33.08| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|   42.98|   30.02| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|   42.98|   28.94| 1.9|    

### Which month has the most rain, on average?

In [40]:
### Must import sql functions

(
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("month", "year")
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first() # shows only first row
)

Row(month=11, avg_monthly_rain=160.625)

### Which year was the windiest?

In [41]:
(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_winds"))
    .sort(col("total_winds").desc())
    .first()
)

Row(year=2012, total_winds=1244.7000000000003)

### What is the most frequent type of weather in january?

In [42]:
(
    weather.withColumn("month", month("date"))
    .filter(col("month") == 1)
    .groupBy("weather")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



### What is the average high and low tempurature on sunny days in July in 2013 and 2014?

In [43]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show()
)

+-----------------+-----------------+
|average_high_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



### What percentage of days were rainy in q3 of 2015?

In [44]:
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



### For each year, find what percentage of days it rained (had non-zero precipitation).

In [45]:
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("precipitation") > 0, 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+-------------------+
|          avg(rain)|
+-------------------+
|0.18478260869565216|
+-------------------+

