In [1]:
import pyspark

import numpy as np
import pandas as pd
import pydataset

from pyspark.sql.functions import col, expr
from pyspark.sql.functions import lit
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean
from pyspark.sql.functions import when, sum

In [2]:
## make my spark session

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
spark

#### Create a spark data frame that contains your favorite programming languages.

* The name of the column should be language
* View the schema of the dataframe
* Output the shape of the dataframe
* Show the first 5 records in the dataframe

In [4]:
df = pd.DataFrame({'language': ['c++', 'c-sharp', 'python', 'go', 
      'java', 'javascript']}) ## making my pandas df

In [5]:
df.head() ## quality assurance check

Unnamed: 0,language
0,c++
1,c-sharp
2,python
3,go
4,java


In [6]:
df = spark.createDataFrame(df) ## make my spark dataframe

In [7]:
df.printSchema ## looking at the schema

<bound method DataFrame.printSchema of DataFrame[language: string]>

In [8]:
## looking at the shape

print(f' The amount of rows: {(df.count())}, \
The amount of columns: {len(df.columns)}')

 The amount of rows: 6, The amount of columns: 1


In [9]:
df.show(5) ## looking at the first five results

+--------+
|language|
+--------+
|     c++|
| c-sharp|
|  python|
|      go|
|    java|
+--------+
only showing top 5 rows



#### Load the mpg dataset as a spark dataframe.

 - Create 1 column of output that contains a message like the one below:

     - The 1999 audi a4 has a 4 cylinder engine.
     - For each vehicle.
     - Transform the trans column so that it only contains either manual or auto.

In [10]:
## loading the mpg pydataset into spark

mpg = pydataset.data('mpg')
df = spark.createDataFrame(mpg)
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [11]:
summary = concat(lit('The '), df.year, lit(' '), df.manufacturer, 
                 lit(' '), df.model, lit(' has a '), df.cyl,
                lit(' cylinder engine'))

df = df.select('*', summary.alias('summary'))

df.show() ## looking at question 2a

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+--------------------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|             summary|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+--------------------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|The 2008 audi a4 ...|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|The 2008 audi a4 ...|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|The 1999 audi a4 ...|
|        audi|                a4|  3.1|2008|  6|  auto(

In [12]:
df.select('summary').show(truncate = False) ## looking at our summary column

+-------------------------------------------------------------+
|summary                                                      |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 2008 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylind

In [13]:
df.select('trans').show(5) ## looking at our transmission column

+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
|  auto(av)|
|  auto(l5)|
+----------+
only showing top 5 rows



In [14]:
df.trans = df.select(df.trans, 
                     when((df.trans.contains('auto')), 'auto')
                     .otherwise('manual'))

In [15]:
df.select('trans').show()

+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
|  auto(av)|
|  auto(l5)|
|manual(m5)|
|  auto(av)|
|manual(m5)|
|  auto(l5)|
|manual(m6)|
|  auto(s6)|
|  auto(l5)|
|manual(m5)|
|  auto(s6)|
|manual(m6)|
|  auto(l5)|
|  auto(s6)|
|  auto(s6)|
|  auto(l4)|
|  auto(l4)|
+----------+
only showing top 20 rows



#### Load the tips dataset as a spark dataframe.

* What percentage of observations are smokers?
* Create a column that contains the tip percentage
* Calculate the average tip percentage for each combination of sex and smoker.

In [17]:
tips = pydataset.data('tips')
tips = spark.createDataFrame(tips)
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [21]:
## percentage of smokers

tips.where(tips.smoker == "Yes").count() / tips.select('smoker').count()

0.38114754098360654

In [23]:
col = tips.tip / tips.total_bill
col

Column<'(tip / total_bill)'>

In [24]:
tips = tips.select('*', col.alias('tip_pct'))

tips.select('tip_pct').show()

+-------------------+
|            tip_pct|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
|0.18623962040332148|
|0.22805017103762829|
|0.11607142857142858|
|0.13031914893617022|
| 0.2185385656292287|
| 0.1665043816942551|
|0.14180374361883155|
|0.10181582360570687|
|0.16277807921866522|
|0.20364126770060686|
|0.18164967562557924|
| 0.1616650532429816|
|0.22774708410067526|
|0.20624631703005306|
|0.16222760290556903|
+-------------------+
only showing top 20 rows

