In [1]:
import pyspark

# Reuse the SparkSession that is already running if there is one;
# otherwise the builder starts a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
import pandas as pd
import pydataset

In [6]:
# Fetch the 'tips' sample dataset from pydataset and convert it into a Spark
# DataFrame. The trailing `df` makes the notebook print the DataFrame's schema.
tips = pydataset.data('tips')
df = spark.createDataFrame(tips)
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [9]:
# Don't do this!
# .show() only prints the rows and returns None, so df2 ends up holding None
# rather than a DataFrame. Just call .show() on its own to view df's contents.
df2 = df.show(10)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 10 rows



In [11]:
# Confirm what the assignment above captured: .show() returned None, not a DataFrame.
type(df2)

NoneType

In [15]:
# Unlike .show(), .head(n) returns the first n rows to Python as a list of Row objects.
df.head(5)

[Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)]

In [18]:
# Keep only four of the columns, selected by name, and print the result.
df.select(
    'total_bill',
    'tip',
    'size',
    'day',
).show()

+----------+----+----+---+
|total_bill| tip|size|day|
+----------+----+----+---+
|     16.99|1.01|   2|Sun|
|     10.34|1.66|   3|Sun|
|     21.01| 3.5|   3|Sun|
|     23.68|3.31|   2|Sun|
|     24.59|3.61|   4|Sun|
|     25.29|4.71|   4|Sun|
|      8.77| 2.0|   2|Sun|
|     26.88|3.12|   4|Sun|
|     15.04|1.96|   2|Sun|
|     14.78|3.23|   2|Sun|
|     10.27|1.71|   2|Sun|
|     35.26| 5.0|   4|Sun|
|     15.42|1.57|   2|Sun|
|     18.43| 3.0|   4|Sun|
|     14.83|3.02|   2|Sun|
|     21.58|3.92|   2|Sun|
|     10.33|1.67|   3|Sun|
|     16.29|3.71|   3|Sun|
|     16.97| 3.5|   3|Sun|
|     20.65|3.35|   3|Sat|
+----------+----+----+---+
only showing top 20 rows



In [19]:
# '*' selects every column, so the result has the same schema as df.
df.select('*')

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [25]:
# Arithmetic between two columns builds a new Column expression that can be
# selected directly; without an alias the column is named after the expression.
df.select(df.tip / df.total_bill).show(5)

+-------------------+
| (tip / total_bill)|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
+-------------------+
only showing top 5 rows



In [26]:
# A Column expression can be stored in a variable and reused in later selects.
# Nothing is computed here: `col` only describes the tip / total_bill calculation.
col = df.tip / df.total_bill
col

Column<b'(tip / total_bill)'>

In [29]:
# Keep every existing column and append the ratio, renamed to tip_pct via .alias().
df.select('*', col.alias('tip_pct')).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [31]:
# select() returns a new DataFrame and leaves df itself unchanged, so the
# result is bound to its own name in order to keep it.
df_with_tip_pct = df.select('*', col.alias('tip_pct'))

In [33]:
# Print the first five rows of the DataFrame that carries the extra tip_pct column.
df_with_tip_pct.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [52]:
# NOTE: importing `sum` here rebinds the name `sum` in the notebook namespace,
# hiding Python's built-in sum() in every cell that runs after this one.
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [35]:
# `mean` and `sum` are the pyspark.sql.functions versions, not Python built-ins:
# each aggregates a whole column, so the result is a single row.
df.select(mean(df.tip), sum(df.total_bill)).show()

+------------------+-----------------+
|          avg(tip)|  sum(total_bill)|
+------------------+-----------------+
|2.9982786885245907|4827.769999999999|
+------------------+-----------------+



In [39]:
# concat() joins columns into one string; lit(' ') wraps the literal space so it
# is treated as a constant value rather than looked up as a column name.
df.select(concat('day', lit(' '), 'time')).show(5)

+--------------------+
|concat(day,  , time)|
+--------------------+
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
+--------------------+
only showing top 5 rows



In [45]:
# Casting a string column to int does not raise: values that cannot be parsed
# as integers (e.g. 'Dinner') come back as null.
df.select(df.time.cast('int')).show(5)

+----+
|time|
+----+
|null|
|null|
|null|
|null|
|null|
+----+
only showing top 5 rows



In [50]:
# Regex helpers applied to the `time` column:
#   - regexp_extract returns capture group 1, here the first word character;
#   - regexp_replace swaps every lowercase vowel for 'X'.
df.select(
    'time',
    regexp_extract('time', r'(\w).*', 1).alias('first_letter'),
    regexp_replace('time', r'[aeiou]', 'X')
).show(5)

+------+------------+--------------------------------+
|  time|first_letter|regexp_replace(time, [aeiou], X)|
+------+------------+--------------------------------+
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
+------+------------+--------------------------------+
only showing top 5 rows



In [53]:
# Add the tip-to-bill ratio to df as a `tip_pct` column.
# withColumn replaces an existing column of the same name, so re-running this
# cell leaves a single tip_pct column instead of appending a duplicate (which
# would make the later `df.tip_pct` reference ambiguous).
df = df.withColumn('tip_pct', df.tip / df.total_bill)

In [57]:
# Label each row by its tip percentage: strictly above 20% is a 'good tip',
# anything else is 'not good tip'.
df.select(
    df['tip_pct'],
    when(df['tip_pct'] > 0.2, 'good tip').otherwise('not good tip').alias('tip_desc'),
).show(25)

+-------------------+------------+
|            tip_pct|    tip_desc|
+-------------------+------------+
|0.05944673337257211|not good tip|
|0.16054158607350097|not good tip|
|0.16658733936220846|not good tip|
| 0.1397804054054054|not good tip|
|0.14680764538430255|not good tip|
|0.18623962040332148|not good tip|
|0.22805017103762829|    good tip|
|0.11607142857142858|not good tip|
|0.13031914893617022|not good tip|
| 0.2185385656292287|    good tip|
| 0.1665043816942551|not good tip|
|0.14180374361883155|not good tip|
|0.10181582360570687|not good tip|
|0.16277807921866522|not good tip|
|0.20364126770060686|    good tip|
|0.18164967562557924|not good tip|
| 0.1616650532429816|not good tip|
|0.22774708410067526|    good tip|
|0.20624631703005306|    good tip|
|0.16222760290556903|not good tip|
|0.22767857142857142|    good tip|
|0.13553474618038444|not good tip|
|0.14140773620798985|not good tip|
|0.19228817858954844|not good tip|
|0.16044399596367306|not good tip|
+-------------------