# Spark API

In [2]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
spark

For demonstration, we'll create a spark dataframe from a pandas dataframe.

In [4]:
import numpy as np
import pandas as pd
import pydataset

In [5]:
tips = pydataset.data('tips')
df = spark.createDataFrame(tips)
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [6]:
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

## DataFrame Basics

In [9]:
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [13]:
df.head(5)

[Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)]

In [15]:
df.head(5)[0].time

'Dinner'

In [10]:
# Don't do this!
# just use .show to view df contents
df2 = df.show(10)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 10 rows



In [12]:
df2.show()

AttributeError: 'NoneType' object has no attribute 'show'

### Selecting Columns

In [17]:
df.select('total_bill', 'tip', 'size', 'day').show(5)

+----------+----+----+---+
|total_bill| tip|size|day|
+----------+----+----+---+
|     16.99|1.01|   2|Sun|
|     10.34|1.66|   3|Sun|
|     21.01| 3.5|   3|Sun|
|     23.68|3.31|   2|Sun|
|     24.59|3.61|   4|Sun|
+----------+----+----+---+
only showing top 5 rows



In [18]:
df.select('*')

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [19]:
df.select(df.tip / df.total_bill).show(5)

+-------------------+
| (tip / total_bill)|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
+-------------------+
only showing top 5 rows



In [20]:
col = df.tip / df.total_bill
col

Column<'(tip / total_bill)'>

In [25]:
df.select('*', col.alias('tip_pct')).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [21]:
df.select('*', col.alias('tip_pct')).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [27]:
df_with_tip_pct = df.select('*', col.alias('tip_pct'))

In [28]:
df_with_tip_pct.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



To add columns to a spark dataframe, we need to create a new variable.

### Selecting w/ Built In Functions

In [30]:
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [32]:
mean(df.tip)

Column<'avg(tip)'>

In [34]:
df.select(mean(df.tip), sum(df.total_bill)).show()

+------------------+-----------------+
|          avg(tip)|  sum(total_bill)|
+------------------+-----------------+
|2.9982786885245907|4827.769999999999|
+------------------+-----------------+



In [36]:
df.select(concat('day', ' ', 'time')).show(5)

AnalysisException: cannot resolve '` `' given input columns: [day, sex, size, smoker, time, tip, total_bill];
'Project [concat(day#4, ' , time#5) AS concat(day,  , time)#446]
+- LogicalRDD [total_bill#0, tip#1, sex#2, smoker#3, day#4, time#5, size#6L], false


In [35]:
df.select(concat('day', lit(' '), 'time')).show(5)

+--------------------+
|concat(day,  , time)|
+--------------------+
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
+--------------------+
only showing top 5 rows



In [37]:
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [40]:
df.size.cast('string')

Column<'CAST(size AS STRING)'>

In [43]:
df.select(df.size.cast('string')).count()

244

In [44]:
df = df.select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct')
)

In [45]:
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint, tip_pct: double]

### When / Otherwise

In [48]:
when(df.tip_pct > .2, 'good tip')

Column<'CASE WHEN (tip_pct > 0.2) THEN good tip END'>

In [52]:
df.select(
    'tip_pct',
    when(df.tip_pct > .2, 'good tip').otherwise('not good tip').alias('tip_description')
).show(25)

+-------------------+---------------+
|            tip_pct|tip_description|
+-------------------+---------------+
|0.05944673337257211|   not good tip|
|0.16054158607350097|   not good tip|
|0.16658733936220846|   not good tip|
| 0.1397804054054054|   not good tip|
|0.14680764538430255|   not good tip|
|0.18623962040332148|   not good tip|
|0.22805017103762829|       good tip|
|0.11607142857142858|   not good tip|
|0.13031914893617022|   not good tip|
| 0.2185385656292287|       good tip|
| 0.1665043816942551|   not good tip|
|0.14180374361883155|   not good tip|
|0.10181582360570687|   not good tip|
|0.16277807921866522|   not good tip|
|0.20364126770060686|       good tip|
|0.18164967562557924|   not good tip|
| 0.1616650532429816|   not good tip|
|0.22774708410067526|       good tip|
|0.20624631703005306|       good tip|
|0.16222760290556903|   not good tip|
|0.22767857142857142|       good tip|
|0.13553474618038444|   not good tip|
|0.14140773620798985|   not good tip|
|0.192288178

In [53]:
df.select(
    'tip_pct',
    (when(df.tip_pct > .2, 'good tip')
     .otherwise('not good tip')
     .alias('tip_desc'))
).show(25)

+-------------------+------------+
|            tip_pct|    tip_desc|
+-------------------+------------+
|0.05944673337257211|not good tip|
|0.16054158607350097|not good tip|
|0.16658733936220846|not good tip|
| 0.1397804054054054|not good tip|
|0.14680764538430255|not good tip|
|0.18623962040332148|not good tip|
|0.22805017103762829|    good tip|
|0.11607142857142858|not good tip|
|0.13031914893617022|not good tip|
| 0.2185385656292287|    good tip|
| 0.1665043816942551|not good tip|
|0.14180374361883155|not good tip|
|0.10181582360570687|not good tip|
|0.16277807921866522|not good tip|
|0.20364126770060686|    good tip|
|0.18164967562557924|not good tip|
| 0.1616650532429816|not good tip|
|0.22774708410067526|    good tip|
|0.20624631703005306|    good tip|
|0.16222760290556903|not good tip|
|0.22767857142857142|    good tip|
|0.13553474618038444|not good tip|
|0.14140773620798985|not good tip|
|0.19228817858954844|not good tip|
|0.16044399596367306|not good tip|
+-------------------

### Regex

In [59]:
df.select(
    'time',
    regexp_extract('time', r'^.{3}', 1).alias('first_letter'),
    regexp_replace('time', r'[aeiou]', 'X')
).show(5)

Py4JJavaError: An error occurred while calling o358.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 38.0 failed 1 times, most recent failure: Lost task 0.0 in stage 38.0 (TID 59) (192.168.1.52 executor driver): java.lang.IllegalArgumentException: Regex group count is 0, but the specified group index is 1
	at org.apache.spark.sql.catalyst.expressions.RegExpExtractBase$.checkGroupIndex(regexpExpressions.scala:626)
	at org.apache.spark.sql.catalyst.expressions.RegExpExtractBase.checkGroupIndex(regexpExpressions.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at jdk.internal.reflect.GeneratedMethodAccessor55.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IllegalArgumentException: Regex group count is 0, but the specified group index is 1
	at org.apache.spark.sql.catalyst.expressions.RegExpExtractBase$.checkGroupIndex(regexpExpressions.scala:626)
	at org.apache.spark.sql.catalyst.expressions.RegExpExtractBase.checkGroupIndex(regexpExpressions.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


## Transforming Rows

In [60]:
df.show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

### Sorting

In [61]:
df.orderBy(df.total_bill).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|            tip_pct|
+----------+----+------+------+----+------+----+-------------------+
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|0.32573289902280134|
|      5.75| 1.0|Female|   Yes| Fri|Dinner|   2|0.17391304347826086|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|0.13793103448275862|
|      7.25|5.15|  Male|   Yes| Sun|Dinner|   2|  0.710344827586207|
|      7.51| 2.0|  Male|    No|Thur| Lunch|   2| 0.2663115845539281|
|      7.56|1.44|  Male|    No|Thur| Lunch|   2|0.19047619047619047|
|      7.74|1.44|  Male|   Yes| Sat|Dinner|   2|0.18604651162790697|
|      8.35| 1.5|Female|    No|Thur| Lunch|   2|0.17964071856287425|
|      8.51|1.25|Female|    No|Thur| Lunch|   2|0.14688601645123384|
|      8.52|1.48|  Male|    No|Thur| Lunch|   2|0.17370892018779344|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|0.22377622377622378|
|      8.77| 2.0|  Male|    No| Su

In [62]:
df.sort(df.day, df.size).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|      8.58|1.92|  Male|   Yes|Fri| Lunch|   1|0.22377622377622378|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|0.22026431718061676|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2| 0.1427891480247501|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2| 0.1203852327447833|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|0.14662756598240467|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|0.19505851755526657|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|0.14285714285714285|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|0.17391304347826086|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|0.19821605550049554|
|     12.16| 2.2|  Male|   Yes|Fri| Lunch|   2|0.18092105263157895|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2| 0.2593144560357675|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|0

In [64]:
from pyspark.sql.functions import asc, desc, col

In [65]:
df.sort(df.day, asc('time'), desc('size')).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     40.17|4.73|  Male|   Yes|Fri|Dinner|   4| 0.1177495643515061|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|0.15562472209871056|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|0.17391304347826086|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|0.22026431718061676|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|0.19505851755526657|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|0.14285714285714285|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|0.14662756598240467|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|0.10355540214014498|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|0.12468827930174564|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2| 0.1427891480247501|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|0.26348039215686275|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2| 

In [67]:
col('size')

Column<'size'>

In [68]:
col('size').desc()

Column<'size DESC NULLS LAST'>

In [69]:
df.sort(col('size').desc(), col('time')).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|            tip_pct|
+----------+----+------+------+----+------+----+-------------------+
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|0.10379904504878555|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|0.19533527696793004|
|      29.8| 4.2|Female|    No|Thur| Lunch|   6|0.14093959731543623|
|     27.05| 5.0|Female|    No|Thur| Lunch|   6|0.18484288354898337|
|     29.85|5.14|Female|    No| Sun|Dinner|   5| 0.1721943048576214|
|     28.15| 3.0|  Male|   Yes| Sat|Dinner|   5|0.10657193605683837|
|     20.69| 5.0|  Male|    No| Sun|Dinner|   5| 0.2416626389560174|
|     30.46| 2.0|  Male|   Yes| Sun|Dinner|   5|0.06565988181221273|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|0.12138868657441128|
|     16.49| 2.0|  Male|    No| Sun|Dinner|   4|0.12128562765312312|
|      25.0|3.75|Female|    No| Sun|Dinner|   4|               0.15|
|     18.29|3.76|  Male|   Yes| Sa

### Filtering

In [72]:
df.tip < 4

Column<'(tip < 4)'>

In [71]:
# pandas: df[df.tip < 4]
df.where(df.tip < 4).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|0.10181582360570687|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|0

In [77]:
df.where((df.tip < 2) | (df.smoker == 'Yes')).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|0.10181582360570687|
|     10.33|1.67|Female|    No|Sun|Dinner|   3| 0.1616650532429816|
|      9.55|1.45|  Male|    No|Sat|Dinner|   2| 0.1518324607329843|
|      9.68|1.32|  Male|    No|Sun|Dinner|   2|0.13636363636363638|
|      9.94|1.56|  Male|    No|Sun|Dinner|   2|0.15694164989939638|
|     38.01| 3.0|  Male|   Yes|Sat|Dinner|   4|0.07892659826361484|
|     26.41| 1.5|Female|    No|Sat|Dinner|   2|0.05679666792881484|
|     11.24|1.76|  Male|   Yes|Sat|Dinner|   2|0

Sidebar: spark performance

A *shuffle* requires looking at data from multiple partitions, potentially re-organizing the data partitions.

In [79]:
mask = df.tip < 4
df.where(mask).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [80]:
df.filter((df.time == "Dinner") & (df.tip <= 2)).sort('tip').show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|      7.25| 1.0|Female|    No|Sat|Dinner|   1|0.13793103448275862|
|      12.6| 1.0|  Male|   Yes|Sat|Dinner|   2|0.07936507936507936|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|0.17391304347826086|
|      3.07| 1.0|Female|   Yes|Sat|Dinner|   1|0.32573289902280134|
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|      12.9| 1.1|Female|   Yes|Sat|Dinner|   2|0.08527131782945736|
|     32.83|1.17|  Male|   Yes|Sat|Dinner|   2|0.03563813585135547|
|     10.51|1.25|  Male|    No|Sat|Dinner|   2|0.11893434823977164|
|     10.07|1.25|  Male|    No|Sat|Dinner|   2|0.12413108242303872|
|      9.68|1.32|  Male|    No|Sun|Dinner|   2|0.13636363636363638|
|      7.74|1.44|  Male|   Yes|Sat|Dinner|   2|0.18604651162790697|
|      9.55|1.45|  Male|    No|Sat|Dinner|   2| 

Chaining `.where` (or `.filter`) will implicitly AND the conditions together.

In [81]:
df.where(df.smoker == "Yes").where(df.day == "Sat").show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     38.01| 3.0|  Male|   Yes|Sat|Dinner|   4|0.07892659826361484|
|     11.24|1.76|  Male|   Yes|Sat|Dinner|   2|0.15658362989323843|
|     20.29|3.21|  Male|   Yes|Sat|Dinner|   2| 0.1582060128141942|
|     13.81| 2.0|  Male|   Yes|Sat|Dinner|   2| 0.1448225923244026|
|     11.02|1.98|  Male|   Yes|Sat|Dinner|   2|0.17967332123411978|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|0.20557681793329688|
|      3.07| 1.0|Female|   Yes|Sat|Dinner|   1|0.32573289902280134|
|     15.01|2.09|  Male|   Yes|Sat|Dinner|   2|0.13924050632911392|
|     26.86|3.14|Female|   Yes|Sat|Dinner|   2|0.11690245718540582|
|     25.28| 5.0|Female|   Yes|Sat|Dinner|   2|0.19778481012658228|
|     17.92|3.08|  Male|   Yes|Sat|Dinner|   2|           0.171875|
|      44.3| 2.5|Female|   Yes|Sat|Dinner|   3|0

## Aggregating

In [85]:
from pyspark.sql.functions import mean, min, max, count

In [83]:
df.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [89]:
df.groupBy('time').agg(count('*').alias('n_obs')).show()

+------+-----+
|  time|n_obs|
+------+-----+
| Lunch|   68|
|Dinner|  176|
+------+-----+



In [90]:
df.groupBy('time').agg(mean('tip')).show()

+------+------------------+
|  time|          avg(tip)|
+------+------------------+
| Lunch|2.7280882352941176|
|Dinner| 3.102670454545455|
+------+------------------+



In [91]:
df.groupBy('time').agg(min('tip'), mean('tip'), max('tip')).show()

+------+--------+------------------+--------+
|  time|min(tip)|          avg(tip)|max(tip)|
+------+--------+------------------+--------+
| Lunch|    1.25|2.7280882352941176|     6.7|
|Dinner|     1.0| 3.102670454545455|    10.0|
+------+--------+------------------+--------+



In [92]:
df.groupBy('time').agg(mean('tip').alias('avg_tip')).show()

+------+------------------+
|  time|           avg_tip|
+------+------------------+
| Lunch|2.7280882352941176|
|Dinner| 3.102670454545455|
+------+------------------+



In [94]:
df.groupBy('time', 'day').agg(count('*').alias('n'), mean('total_bill')).show()

+------+----+---+------------------+
|  time| day|  n|   avg(total_bill)|
+------+----+---+------------------+
| Lunch|Thur| 61|17.664754098360653|
|Dinner|Thur|  1|             18.78|
| Lunch| Fri|  7|12.845714285714285|
|Dinner| Fri| 12| 19.66333333333333|
|Dinner| Sun| 76|21.409999999999997|
|Dinner| Sat| 87|20.441379310344825|
+------+----+---+------------------+



In [97]:
df.crosstab('time', 'day').show()

+--------+---+---+---+----+
|time_day|Fri|Sat|Sun|Thur|
+--------+---+---+---+----+
|   Lunch|  7|  0|  0|  61|
|  Dinner| 12| 87| 76|   1|
+--------+---+---+---+----+



In [100]:
df.groupBy('time', 'day').agg(sum('total_bill')).sort('time', 'day').show()

+------+----+------------------+
|  time| day|   sum(total_bill)|
+------+----+------------------+
|Dinner| Fri|235.95999999999998|
|Dinner| Sat|1778.3999999999996|
|Dinner| Sun|1627.1599999999999|
|Dinner|Thur|             18.78|
| Lunch| Fri|             89.92|
| Lunch|Thur|           1077.55|
+------+----+------------------+



In [101]:
df.groupBy('time').pivot('day').agg(sum('total_bill')).show()

+------+------------------+------------------+------------------+-------+
|  time|               Fri|               Sat|               Sun|   Thur|
+------+------------------+------------------+------------------+-------+
| Lunch|             89.92|              null|              null|1077.55|
|Dinner|235.95999999999998|1778.3999999999996|1627.1599999999999|  18.78|
+------+------------------+------------------+------------------+-------+



In [102]:
df.groupBy('time').pivot('day').agg(mean('total_bill')).show()

+------+------------------+------------------+------------------+------------------+
|  time|               Fri|               Sat|               Sun|              Thur|
+------+------------------+------------------+------------------+------------------+
| Lunch|12.845714285714285|              null|              null|17.664754098360653|
|Dinner| 19.66333333333333|20.441379310344825|21.409999999999997|             18.78|
+------+------------------+------------------+------------------+------------------+



`.crosstab` is just for counts, for other methods of summarizing groups, use `.groupBy` (maybe in combination with `.pivot`) + `.agg`.

## Additional Features

### Spark SQL

In [103]:
df.createOrReplaceTempView('tips')

In [106]:
spark.sql('''
SELECT *
FROM tips
''').show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [107]:
# find the tip, total_bill, and day with the highest overall sales for that day
spark.sql('''
SELECT tip, total_bill, day
FROM tips
WHERE day = (
    SELECT day
    FROM tips
    GROUP BY day
    ORDER BY sum(total_bill) DESC
    LIMIT 1
)    
''').show()

+----+----------+---+
| tip|total_bill|day|
+----+----------+---+
|3.35|     20.65|Sat|
|4.08|     17.92|Sat|
|2.75|     20.29|Sat|
|2.23|     15.77|Sat|
|7.58|     39.42|Sat|
|3.18|     19.82|Sat|
|2.34|     17.81|Sat|
| 2.0|     13.37|Sat|
| 2.0|     12.69|Sat|
| 4.3|      21.7|Sat|
| 3.0|     19.65|Sat|
|1.45|      9.55|Sat|
| 2.5|     18.35|Sat|
| 3.0|     15.06|Sat|
|2.45|     20.69|Sat|
|3.27|     17.78|Sat|
| 3.6|     24.06|Sat|
| 2.0|     16.31|Sat|
|3.07|     16.93|Sat|
|2.31|     18.69|Sat|
+----+----------+---+
only showing top 20 rows



### More Spark Dataframe Manipulation

In [None]:
df.where(
    df.time == 'Dinner'
).select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct'),
).explain()

In [None]:
df.select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct'),
).where(
    df.time == 'Dinner'
).explain()

### Mixing in SQL Expressions

In [109]:
from pyspark.sql.functions import expr

Expr lets us mix in parts of SQL into our dataframes

In [116]:
df.select(
    'time',
    expr('CASE WHEN time = "Dinner" THEN "DINNER TIME!" ELSE "lunch..." END'),
).show(100)

+------+-------------------------------------------------------------+
|  time|CASE WHEN (time = Dinner) THEN DINNER TIME! ELSE lunch... END|
+------+-------------------------------------------------------------+
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinne

In [115]:
spark.sql('''
SELECT
    time,
    CASE WHEN time = "Dinner" THEN "DINNER TIME!" ELSE "lunch..." END
FROM tips
''').show(100)

+------+-------------------------------------------------------------+
|  time|CASE WHEN (time = Dinner) THEN DINNER TIME! ELSE lunch... END|
+------+-------------------------------------------------------------+
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinner|                                                 DINNER TIME!|
|Dinne

In [110]:
df.select(
    '*',
    expr('tip / total_bill as tip_pct')
).where(
    expr('day = "Sun" AND time = "Dinner"')
).show()

+----------+----+------+------+---+------+----+-------------------+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|0.11607142857142858|
|     15.04|1.96|  Male|    No|S

## Joins

In [117]:
df1 = spark.createDataFrame(pd.DataFrame({
    'id': np.arange(100) + 1,
    'x': np.random.randn(100).round(3),
    'group_id': np.random.choice(range(1, 7), 100),
}))

df2 = spark.createDataFrame(pd.DataFrame({
    'id': range(1, 7),
    'group': list('abcdef')
}))
df1.show(5)
df2.show()

+---+------+--------+
| id|     x|group_id|
+---+------+--------+
|  1| 0.093|       5|
|  2| 1.191|       3|
|  3| -0.43|       5|
|  4| 0.908|       2|
|  5|-0.852|       4|
+---+------+--------+
only showing top 5 rows

+---+-----+
| id|group|
+---+-----+
|  1|    a|
|  2|    b|
|  3|    c|
|  4|    d|
|  5|    e|
|  6|    f|
+---+-----+



In [121]:
df1.join(df2, on=df1.group_id == df2.id).show()

+---+------+--------+---+-----+
| id|     x|group_id| id|group|
+---+------+--------+---+-----+
|  7| -0.51|       6|  6|    f|
| 13| 0.588|       6|  6|    f|
| 18| 0.044|       6|  6|    f|
| 37| 0.048|       6|  6|    f|
| 39| 1.034|       6|  6|    f|
| 41|-1.033|       6|  6|    f|
| 42|-0.726|       6|  6|    f|
| 56|-2.016|       6|  6|    f|
| 59| 0.616|       6|  6|    f|
| 68|-0.524|       6|  6|    f|
| 70|-2.272|       6|  6|    f|
| 71|-1.762|       6|  6|    f|
| 72| 2.044|       6|  6|    f|
| 77|-0.517|       6|  6|    f|
| 84|-0.519|       6|  6|    f|
| 91| 0.199|       6|  6|    f|
| 95| 1.107|       6|  6|    f|
| 97| 0.952|       6|  6|    f|
| 99| 2.073|       6|  6|    f|
|  1| 0.093|       5|  5|    e|
+---+------+--------+---+-----+
only showing top 20 rows



In [122]:
df_merged = df1.join(df2, df1.group_id == df2.id)
df_merged.show(5)

+---+-----+--------+---+-----+
| id|    x|group_id| id|group|
+---+-----+--------+---+-----+
|  7|-0.51|       6|  6|    f|
| 13|0.588|       6|  6|    f|
| 18|0.044|       6|  6|    f|
| 37|0.048|       6|  6|    f|
| 39|1.034|       6|  6|    f|
+---+-----+--------+---+-----+
only showing top 5 rows



In [124]:
df1.join(df2.withColumnRenamed('id', 'group_id'), 'group_id').show(5)

+--------+---+-----+-----+
|group_id| id|    x|group|
+--------+---+-----+-----+
|       6|  7|-0.51|    f|
|       6| 13|0.588|    f|
|       6| 18|0.044|    f|
|       6| 37|0.048|    f|
|       6| 39|1.034|    f|
+--------+---+-----+-----+
only showing top 5 rows



## `.explain`

In [127]:
df = spark.createDataFrame(pydataset.data('tips'))

In [132]:
df.filter(df.time == 'Lunch').filter(df.size > 1).select('*', expr('tip / total_bill as tip_pct')).explain()

== Physical Plan ==
*(1) Project [total_bill#2535, tip#2536, sex#2537, smoker#2538, day#2539, time#2540, size#2541L, (tip#2536 / total_bill#2535) AS tip_pct#2549]
+- *(1) Filter (((isnotnull(time#2540) AND isnotnull(size#2541L)) AND (time#2540 = Lunch)) AND (size#2541L > 1))
   +- *(1) Scan ExistingRDD[total_bill#2535,tip#2536,sex#2537,smoker#2538,day#2539,time#2540,size#2541L]




In [133]:
df.filter(df.smoker == 'Yes').select('smoker', 'total_bill', expr('tip / total_bill as tip_pct')).explain()

== Physical Plan ==
*(1) Project [smoker#2538, total_bill#2535, (tip#2536 / total_bill#2535) AS tip_pct#2558]
+- *(1) Filter (isnotnull(smoker#2538) AND (smoker#2538 = Yes))
   +- *(1) Scan ExistingRDD[total_bill#2535,tip#2536,sex#2537,smoker#2538,day#2539,time#2540,size#2541L]




In [134]:
df.select('smoker', 'total_bill', expr('tip / total_bill as tip_pct')).filter(df.smoker == 'Yes').explain()

== Physical Plan ==
*(1) Project [smoker#2538, total_bill#2535, (tip#2536 / total_bill#2535) AS tip_pct#2562]
+- *(1) Filter (isnotnull(smoker#2538) AND (smoker#2538 = Yes))
   +- *(1) Scan ExistingRDD[total_bill#2535,tip#2536,sex#2537,smoker#2538,day#2539,time#2540,size#2541L]




In [137]:
df.select('*', expr('tip / total_bill as tip_pct')).toPandas()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204
