# Create a spark data frame that contains your favorite programming languages.

In [209]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql.functions import min, max, sum, count, mean, avg
from vega_datasets import data
from pyspark.sql.functions import month, year, quarter, round

In [8]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/30 13:45:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [15]:
the_dict = {'Language': ['Python', 'Java', 'Javascript', 'HTML', 'Spark']}
pandas_dataframe = pd.DataFrame(the_dict)
df = spark.createDataFrame(pandas_dataframe)
df.show()

+----------+
|  Language|
+----------+
|    Python|
|      Java|
|Javascript|
|      HTML|
|     Spark|
+----------+



# View the schema of the dataframe

In [12]:
df.printSchema()

root
 |-- Languages: string (nullable = true)



# Output the shape of the dataframe

In [14]:
df.count(), len(df.columns)

(4, 1)

Show the first 5 records in the dataframe

In [19]:
df.show(5)

+----------+
|  Language|
+----------+
|    Python|
|      Java|
|Javascript|
|      HTML|
|     Spark|
+----------+



# Load the mpg dataset as a spark dataframe.

In [139]:
df = spark.createDataFrame(data('mpg'))
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



# Create 1 column of output that contains a message like the one below:

The 1999 audi a4 has a 4 cylinder engine.

In [64]:
year = df.collect()[2]['year']

In [65]:
audi = df.collect()[2]['manufacturer']

In [70]:
cyl = df.collect()[2]['cyl']

In [72]:
print('The', year, audi, 'has a',  cyl, 'cylinder engine')

The 2008 audi has a 4 cylinder engine


# Transform the trans column so that it only contains either manual or auto.

In [141]:
df = df.withColumn(
    'trans',
    when(df.trans.contains('manual'), 'maunual').when(df.trans.contains('auto'), 'auto').alias('trans')
    
)

In [142]:
df.show()

+------------+------------------+-----+----+---+-------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|  trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+-------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|   auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|maunual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|maunual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|   auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|   auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|maunual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|   auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|maunual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|   auto|  4| 16| 25|  p|compact|
|        audi|        a4 qua

# Load the tips dataset as a spark dataframe.

In [143]:
df = spark.createDataFrame(data('tips'))
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



# What percentage of observations are smokers?

In [156]:
no = df[df.smoker == 'No'].count()
no

151

In [157]:
yes = df[df.smoker == 'Yes'].count()
yes

93

In [162]:
round((yes * 100) / (no + yes), 2)

38.11

# Create a column that contains the tip percentage

In [193]:
df =df.withColumn(
    'tip percentage',
    (col('tip') / (col('total_bill') + col('tip')))
)

# Calculate the average tip percentage for each combination of sex and smoker.

In [194]:
df.groupBy(df.smoker, df.sex).agg(mean(df['tip percentage'])).show()

[Stage 105:>                                                        (0 + 8) / 8]

+------+------+-------------------+
|smoker|   sex|avg(tip percentage)|
+------+------+-------------------+
|    No|Female|0.13478283982479486|
|    No|  Male| 0.1373304244367829|
|   Yes|  Male|0.12833233983883358|
|   Yes|Female| 0.1512197715202941|
+------+------+-------------------+



                                                                                

# Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [290]:
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



# Convert the temperatures to fahrenheit

In [216]:
weather = weather.withColumn(
    'temp_max',
    ((col('temp_max') * 9/5) + 32)
)

In [215]:
weather = weather.withColumn(
    'temp_min',
    ((col('temp_min') * 9/5) + 32)
)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|   37.04| 4.5|   rain|
|2012-01-03|          0.8|    11.7|   44.96| 2.3|   rain|
|2012-01-04|         20.3|    12.2|   42.08| 4.7|   rain|
|2012-01-05|          1.3|     8.9|   37.04| 6.1|   rain|
|2012-01-06|          2.5|     4.4|   35.96| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



# Which month has the most rain, on average?

# november

In [228]:
(
    weather.withColumn('themonth', month('date'))
    .groupby('themonth')
    .agg(round(mean('precipitation'), 2).alias('mean_rain'))
    .sort('mean_rain')
).show()

[Stage 142:>                                                        (0 + 8) / 8]

+--------+---------+
|themonth|mean_rain|
+--------+---------+
|       7|     0.39|
|       6|     1.11|
|       8|     1.32|
|       5|     1.67|
|       9|     1.96|
|       4|     3.13|
|       2|     3.73|
|       1|     3.76|
|      10|     4.06|
|       3|     4.89|
|      12|     5.02|
|      11|     5.35|
+--------+---------+



                                                                                

# Which year was the windiest?

# 2012

In [229]:
(
    weather.withColumn('theyear', year('date'))
    .groupby('theyear')
    .agg(round(mean('wind'), 2).alias('mean_wind'))
    .sort('mean_wind')
).show()

+-------+---------+
|theyear|mean_wind|
+-------+---------+
|   2013|     3.02|
|   2015|     3.16|
|   2014|     3.39|
|   2012|      3.4|
+-------+---------+



# What is the most frequent type of weather in January?

# sun

In [293]:
weather = weather.withColumn('themonth', month('date'))

In [247]:
weather.show(5)

+----------+-------------+--------+--------+----+-------+--------+
|      date|precipitation|temp_max|temp_min|wind|weather|themonth|
+----------+-------------+--------+--------+----+-------+--------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|       1|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|       1|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|       1|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|       1|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|       1|
+----------+-------------+--------+--------+----+-------+--------+
only showing top 5 rows



In [256]:
(
    weather[weather.themonth == 1].withColumn('theyear', month('date'))
    .groupby('weather')
    .agg(count('weather').alias('count of'))
    .sort('weather')
).show()

+-------+--------+
|weather|count of|
+-------+--------+
|drizzle|      10|
|    fog|      38|
|   rain|      35|
|   snow|       8|
|    sun|      33|
+-------+--------+



# What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [292]:
weather = weather.withColumn('theyear', year('date'))

In [271]:
answer = weather[(weather.theyear == 2013) | (weather.theyear == 2014)]

In [278]:
answer = answer[(answer.weather == 'sun') & (answer.themonth == 7)]

In [280]:
answer.show(3)

+----------+-------------+--------+--------+----+-------+--------+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|themonth|theyear|
+----------+-------------+--------+--------+----+-------+--------+-------+
|2013-07-01|          0.0|    31.7|    18.3| 2.3|    sun|       7|   2013|
|2013-07-02|          0.0|    28.3|    15.6| 3.0|    sun|       7|   2013|
|2013-07-03|          0.0|    26.1|    16.7| 3.2|    sun|       7|   2013|
+----------+-------------+--------+--------+----+-------+--------+-------+
only showing top 3 rows



In [281]:
(
    answer
    .groupby('theyear')
    .agg(round(mean('temp_min'), 2).alias('temp_min_mean'))
    .sort('temp_min_mean')
).show()

+-------+-------------+
|theyear|temp_min_mean|
+-------+-------------+
|   2013|        13.98|
|   2014|         14.4|
+-------+-------------+



In [282]:
(
    answer
    .groupby('theyear')
    .agg(round(mean('temp_max'), 2).alias('temp_max_mean'))
    .sort('temp_max_mean')
).show()

+-------+-------------+
|theyear|temp_max_mean|
+-------+-------------+
|   2013|        26.59|
|   2014|        27.09|
+-------+-------------+



# What percentage of days were rainy in q3 of 2015?

In [318]:
((72 / 488) * 100)

14.754098360655737

In [308]:
the_answer = weather[(weather.themonth == 12) | (weather.themonth == 11) | (weather.themonth == 10) | (weather.themonth == 9) ]

In [310]:
the_answer.show(3)

+----------+-------------+--------+--------+----+-------+-------+--------+
|      date|precipitation|temp_max|temp_min|wind|weather|theyear|themonth|
+----------+-------------+--------+--------+----+-------+-------+--------+
|2012-09-01|          0.0|    21.7|    10.6| 2.1|    sun|   2012|       9|
|2012-09-02|          0.0|    21.1|    10.0| 2.0|    sun|   2012|       9|
|2012-09-03|          0.0|    22.8|    12.8| 3.3|    sun|   2012|       9|
+----------+-------------+--------+--------+----+-------+-------+--------+
only showing top 3 rows



In [312]:
(
    the_answer
    .groupby('weather')
    .agg(count('weather').alias('count'))
    .sort('count')
).show()

+-------+-----+
|weather|count|
+-------+-----+
|   snow|    5|
|drizzle|   14|
|   rain|   72|
|    sun|  198|
|    fog|  199|
+-------+-----+



# For each year, find what percentage of days it rained (had non-zero precipitation).

In [301]:
weather.show(3)

+----------+-------------+--------+--------+----+-------+-------+--------+
|      date|precipitation|temp_max|temp_min|wind|weather|theyear|themonth|
+----------+-------------+--------+--------+----+-------+-------+--------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|   2012|       1|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|   2012|       1|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|   2012|       1|
+----------+-------------+--------+--------+----+-------+-------+--------+
only showing top 3 rows



In [304]:
(
    weather[weather.precipitation != 0]
    .groupby('theyear')
    .agg(round(mean('precipitation'), 2).alias('precipitation'))
    .sort('precipitation')
).show()

+-------+-------------+
|theyear|precipitation|
+-------+-------------+
|   2013|         5.45|
|   2012|         6.93|
|   2015|         7.91|
|   2014|         8.22|
+-------+-------------+

