In [1]:
import pandas as pd
import numpy as np
import pydataset
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [2]:
np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
df = spark.createDataFrame(pandas_dataframe)

In [4]:
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [5]:
df[['group', 'n']]

DataFrame[group: string, n: bigint]

In [6]:
df['group']

Column<b'group'>

In [7]:
df.select(df.group).show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [8]:
n_incremented = df.n + 1

In [9]:
n_incremented

Column<b'(n + 1)'>

In [10]:
df.select(n_incremented).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [11]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [12]:
pd.read_csv('alksdhbfljashbdfjhasd')

FileNotFoundError: [Errno 2] File b'alksdhbfljashbdfjhasd' does not exist: b'alksdhbfljashbdfjhasd'

In [None]:
df.show(5)

In [None]:
df.describe().select('n', 'summary').show()

In [None]:
df.select('n', 'summary').describe().show()

In [None]:
mpg = spark.createDataFrame(pydataset.data('mpg'))
mpg.show()

In [None]:
mpg.hwy

In [None]:
mpg.select(mpg.model, 'manufacturer', mpg.hwy.alias('highway_mileage')).show()

In [None]:
mpg.show(5)

In [None]:
avg_mileage_column = ((mpg.cty + mpg.hwy) / 2).alias('avg_mileage')
avg_mileage_column

In [None]:
mpg.select('*', avg_mileage_column).show()

In [None]:
mpg.printSchema()

In [None]:
from pyspark.sql.functions import col

col

In [None]:
col('abc')

In [None]:
col('hwy') * 2

In [None]:
just_hwy_and_cty = mpg.select('hwy', 'cty').show(5)

just_hwy_and_cty.select('hwy').show()

In [None]:
mpg.select(col('hwy').alias('highway_mileage')).show(5)

In [None]:
from pyspark.sql.functions import expr

In [None]:
mpg.select(expr('hwy AS highway_mileage')).show(5)

In [None]:
mpg.select(expr('(hwy + cty) / 2 AS average_mileage')).show(5)

In [None]:
mpg.select(expr('AVG(hwy)')).show()

In [None]:
mpg.createOrReplaceTempView('mpg')

In [None]:
mpg2 = spark.sql('''
SELECT
    cty AS city,
    hwy AS highway,
    (cty + hwy) / 2 AS avg_mileage
FROM mpg
WHERE class = "compact"
''')

In [None]:
mpg2

In [None]:
mpg2.select('city', 'highway')

In [None]:
mpg.dtypes

In [None]:
mpg.printSchema()

In [None]:
mpg.show(5)

In [None]:
mpg.select('*', mpg.cyl.cast('string').alias('cyl_string')).show(4)

In [None]:
spark.sql('SELECT *, CAST(cyl AS STRING) AS cyl_string FROM mpg').show(4)

In [None]:
mpg.select(mpg.manufacturer.cast('double')).show()

In [None]:
min([1, 2, 3])

In [None]:
from pyspark.sql.functions import min, max

In [None]:
min([1, 2, 3])

In [None]:
mpg.select(min(mpg.cyl).alias('min_cyl'), max('hwy')).show()

In [None]:
spark.sql('SELECT min(cyl), max(hwy) FROM mpg').show()

In [15]:
textdf = spark.createDataFrame(
    pd.DataFrame(
        {
            "address": [
                "600 Navarro St ste 600, San Antonio, TX 78205",
                "3130 Broadway St, San Antonio, TX 78209",
                "303 Pearl Pkwy, San Antonio, TX 78215",
                "1255 SW Loop 410, San Antonio, TX 78227",
            ]
        }
    )
)

textdf.show(truncate=False)

+---------------------------------------------+
|address                                      |
+---------------------------------------------+
|600 Navarro St ste 600, San Antonio, TX 78205|
|3130 Broadway St, San Antonio, TX 78209      |
|303 Pearl Pkwy, San Antonio, TX 78215        |
|1255 SW Loop 410, San Antonio, TX 78227      |
+---------------------------------------------+



In [None]:
# - extract house/unit no
# - extract street

In [16]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [17]:
regexp_extract?

In [20]:
regexp_extract('address', r'^(\d+)', 1)

Column<b'regexp_extract(address, ^(\\d+), 1)'>

In [33]:
textdf.select(
    'address',
    regexp_extract('address', r'^(\d+)', 1).alias('unit_no'),
    regexp_extract('address', r'^(\d+)\s+(.*?),', 2).alias('street'),
    regexp_replace(
        regexp_replace('address', r'^\d+\s+.*?,\s+', ''),
        r'\s+\d+(-\d+)?$',
        ''
    ),
    regexp_extract('address', r'(\d+(-\d+)?)$', 1).alias('zip'),
).show(truncate=True, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------
 address                                                                     | 600 Navarro St st... 
 unit_no                                                                     | 600                  
 street                                                                      | Navarro St ste 600   
 regexp_replace(regexp_replace(address, ^\d+\s+.*?,\s+, ), \s+\d+(-\d+)?$, ) | San Antonio, TX      
 zip                                                                         | 78205                
-RECORD 1-------------------------------------------------------------------------------------------
 address                                                                     | 3130 Broadway St,... 
 unit_no                                                                     | 3130                 
 street                                                                      | Broadway St 

In [34]:
import pyspark

# type `pyspark.sql.functions.` and hit tab

In [39]:
from pydataset import data

mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [42]:
four_cylinder_filter = mpg.cyl == 4

In [43]:
mpg.filter(four_cylinder_filter).show(7)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
only showing top 7 rows



In [52]:
from pyspark.sql.functions import col

In [54]:
mpg.where(col('class') == 'minivan').filter(four_cylinder_filter).show(5)

+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|       dodge|caravan 2wd|  2.4|1999|  4|auto(l3)|  f| 18| 24|  r|minivan|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+



In [56]:
mpg.where((col('class') == 'minivan') | four_cylinder_filter).show()

+------------+-----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|         a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|         a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|         a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|         a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi| a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi| a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi| a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
|        audi| a4 quattro|  2.0|2008|  4|  auto(s6)|  4| 19| 27|  p|compact|
|   chevrolet|     malibu|  2.4|1999|  4|  auto(l4)|  f| 19| 27|  r|midsize|
|   chevrolet|     malibu|  2.4|2008|  4|  auto(l4)|  f| 22| 30|  r|midsize|

In [48]:
mpg.createOrReplaceTempView('mpg')

In [50]:
spark.sql('''
SELECT DISTINCT class
FROM mpg
''').show()

+----------+
|     class|
+----------+
|subcompact|
|   compact|
|   minivan|
|       suv|
|   midsize|
|    pickup|
|   2seater|
+----------+



In [59]:
from pyspark.sql.functions import when

In [61]:
mpg.select(
    'cty',
    when(col('cty') >= 20, 'good gas mileage').otherwise('bad gas mileage')
).show()

+---+--------------------------------------------------------------------+
|cty|CASE WHEN (cty >= 20) THEN good gas mileage ELSE bad gas mileage END|
+---+--------------------------------------------------------------------+
| 18|                                                     bad gas mileage|
| 21|                                                    good gas mileage|
| 20|                                                    good gas mileage|
| 21|                                                    good gas mileage|
| 16|                                                     bad gas mileage|
| 18|                                                     bad gas mileage|
| 18|                                                     bad gas mileage|
| 18|                                                     bad gas mileage|
| 16|                                                     bad gas mileage|
| 20|                                                    good gas mileage|
| 19|                    

In [68]:
from pyspark.sql.functions import expr

In [71]:
mpg.select(
    mpg.displ,
    (
        when(expr('displ < 2'), "small")
        .when(mpg.displ < 3, "medium")
        .otherwise("large")
        .alias("engine_size")
    ),
).show()

+-----+-----------+
|displ|engine_size|
+-----+-----------+
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  3.1|      large|
|  2.8|     medium|
|  3.1|      large|
|  4.2|      large|
|  5.3|      large|
|  5.3|      large|
+-----+-----------+
only showing top 20 rows



In [78]:
from pyspark.sql.functions import desc, asc

In [76]:
# sort by hwy, asc, desc, method + fn
# mpg.sort('hwy').show()
mpg.orderBy('hwy').show(5)

+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|              model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|        jeep| grand cherokee 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge|        durango 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|  dakota pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
only showing top 5 rows



In [80]:
mpg.orderBy(asc('hwy')).show(5)

+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|              model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|        jeep| grand cherokee 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge|        durango 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|  dakota pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
only showing top 5 rows



In [83]:
mpg.hwy.desc_nulls_first()

Column<b'hwy DESC NULLS FIRST'>

In [81]:
mpg.sort(mpg.hwy.desc()).show()

+------------+------------+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|       model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+------------+-----+----+---+----------+---+---+---+---+----------+
|  volkswagen|       jetta|  1.9|1999|  4|manual(m5)|  f| 33| 44|  d|   compact|
|  volkswagen|  new beetle|  1.9|1999|  4|manual(m5)|  f| 35| 44|  d|subcompact|
|  volkswagen|  new beetle|  1.9|1999|  4|  auto(l4)|  f| 29| 41|  d|subcompact|
|      toyota|     corolla|  1.8|2008|  4|manual(m5)|  f| 28| 37|  r|   compact|
|       honda|       civic|  1.8|2008|  4|  auto(l5)|  f| 25| 36|  r|subcompact|
|       honda|       civic|  1.8|2008|  4|  auto(l5)|  f| 24| 36|  c|subcompact|
|      toyota|     corolla|  1.8|1999|  4|manual(m5)|  f| 26| 35|  r|   compact|
|      toyota|     corolla|  1.8|2008|  4|  auto(l4)|  f| 26| 35|  r|   compact|
|       honda|       civic|  1.8|2008|  4|manual(m5)|  f| 26| 34|  r|subcompact|
|      toyota|     corolla| 

In [84]:
mpg.sort(desc("class"), mpg.cyl.asc(), col("hwy").desc()).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 20| 27|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 20| 26|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 19| 25|  p|  suv|
|      subaru|      forester awd|  2.5|1999|  4|manual(m5)|  4| 18| 25|  r|  suv|
|      subaru|      forester awd|  2.5|1999|  4|  auto(l4)|  4| 18| 24|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 18| 23|  p|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|  auto(l4)|  4| 16| 20|  r|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|manual(m5)|  4| 15| 20|  r|  suv|
|        jeep|grand cherokee 4wd|  3.0|2008|  6|  auto(l5)|  4| 17| 22|  d|  suv|
|        jeep|gr

In [None]:
# group by cyl, class, agg hwy, cty, rollup, pivot

In [90]:
mpg.groupBy('cyl').mean().show()

+---+------------------+------------------+--------+------------------+-----------------+
|cyl|        avg(displ)|         avg(year)|avg(cyl)|          avg(cty)|         avg(hwy)|
+---+------------------+------------------+--------+------------------+-----------------+
|  6| 3.408860759493671| 2002.873417721519|     6.0| 16.21518987341772|22.82278481012658|
|  5|               2.5|            2008.0|     5.0|              20.5|            28.75|
|  8|5.1328571428571435|2004.5285714285715|     8.0|12.571428571428571|17.62857142857143|
|  4| 2.145679012345679|            2003.0|     4.0|21.012345679012345|28.80246913580247|
+---+------------------+------------------+--------+------------------+-----------------+



In [93]:
mpg.groupBy('cyl', 'class').mean().show()

+---+----------+------------------+------------------+--------+------------------+------------------+
|cyl|     class|        avg(displ)|         avg(year)|avg(cyl)|          avg(cty)|          avg(hwy)|
+---+----------+------------------+------------------+--------+------------------+------------------+
|  5|   compact|               2.5|            2008.0|     5.0|              21.0|              29.0|
|  5|subcompact|               2.5|            2008.0|     5.0|              20.0|              28.5|
|  6|subcompact| 3.385714285714286|2005.4285714285713|     6.0|              17.0|24.714285714285715|
|  6|    pickup|3.8400000000000007|            2002.6|     6.0|              14.5|              17.9|
|  4|subcompact|1.9333333333333333|2001.5714285714287|     4.0|22.857142857142858| 30.80952380952381|
|  8|       suv| 5.155263157894736|2004.4473684210527|     8.0|12.131578947368421|16.789473684210527|
|  8|    pickup| 4.965000000000001|            2004.4|     8.0|              11.8|

In [102]:
from pyspark.sql.functions import mean, max, min, count

In [101]:
mpg.groupBy('class').count().show()

+----------+-----+
|     class|count|
+----------+-----+
|subcompact|   35|
|   compact|   47|
|   minivan|   11|
|       suv|   62|
|   midsize|   41|
|    pickup|   33|
|   2seater|    5|
+----------+-----+



In [105]:
spark.sql('SELECT class, COUNT(*) FROM mpg GROUP BY class').show()

+----------+--------+
|     class|count(1)|
+----------+--------+
|subcompact|      35|
|   compact|      47|
|   minivan|      11|
|       suv|      62|
|   midsize|      41|
|    pickup|      33|
|   2seater|       5|
+----------+--------+



In [103]:
mpg.groupBy('class').agg(count('*')).show()

+----------+--------+
|     class|count(1)|
+----------+--------+
|subcompact|      35|
|   compact|      47|
|   minivan|      11|
|       suv|      62|
|   midsize|      41|
|    pickup|      33|
|   2seater|       5|
+----------+--------+



In [99]:
mpg.groupBy('cyl', 'class').agg(
    mean('hwy').alias('avg_highway_mileage'),
    mean('cty'),
    max('displ'),
    min('displ')
).show()

+---+----------+-------------------+------------------+----------+----------+
|cyl|     class|avg_highway_mileage|          avg(cty)|max(displ)|min(displ)|
+---+----------+-------------------+------------------+----------+----------+
|  5|   compact|               29.0|              21.0|       2.5|       2.5|
|  5|subcompact|               28.5|              20.0|       2.5|       2.5|
|  6|subcompact| 24.714285714285715|              17.0|       4.0|       2.7|
|  6|    pickup|               17.9|              14.5|       4.2|       3.4|
|  4|subcompact|  30.80952380952381|22.857142857142858|       2.5|       1.6|
|  8|       suv| 16.789473684210527|12.131578947368421|       6.5|       4.0|
|  8|    pickup|               15.8|              11.8|       5.9|       4.6|
|  8|   midsize|               24.0|              16.0|       5.3|       4.2|
|  4|   midsize|            29.1875|              20.5|       2.5|       1.8|
|  8|   2seater|               24.8|              15.4|       7.

In [109]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [107]:
# groupBy(rows).pivot(columns).agg
mpg.groupBy('class').pivot('cyl').count().show()

+----------+----+----+----+----+
|     class|   4|   5|   6|   8|
+----------+----+----+----+----+
|subcompact|  21|   2|   7|   5|
|   compact|  32|   2|  13|null|
|   minivan|   1|null|  10|null|
|       suv|   8|null|  16|  38|
|   midsize|  16|null|  23|   2|
|    pickup|   3|null|  10|  20|
|   2seater|null|null|null|   5|
+----------+----+----+----+----+



In [108]:
mpg.groupBy('class').pivot('cyl').agg(mean('hwy')).show()

+----------+------------------+----+------------------+------------------+
|     class|                 4|   5|                 6|                 8|
+----------+------------------+----+------------------+------------------+
|subcompact| 30.80952380952381|28.5|24.714285714285715|              21.6|
|   compact|          29.46875|29.0|25.307692307692307|              null|
|   minivan|              24.0|null|              22.2|              null|
|       suv|             23.75|null|              18.5|16.789473684210527|
|   midsize|           29.1875|null| 26.26086956521739|              24.0|
|    pickup|20.666666666666668|null|              17.9|              15.8|
|   2seater|              null|null|              null|              24.8|
+----------+------------------+----+------------------+------------------+



In [114]:
mpg.filter(col('cty') > 20).select('cyl', 'hwy').sort('hwy').show()

+---+---+
|cyl|hwy|
+---+---+
|  4| 26|
|  4| 27|
|  4| 27|
|  4| 29|
|  5| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  5| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
+---+---+
only showing top 20 rows



In [116]:
mpg.sort('hwy').select('cyl', 'hwy').filter(col('cty') > 20).show()

+---+---+
|cyl|hwy|
+---+---+
|  4| 26|
|  4| 27|
|  4| 27|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  5| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  5| 29|
|  4| 29|
|  4| 29|
|  4| 29|
|  4| 29|
+---+---+
only showing top 20 rows



In [113]:
mpg.groupby('manufacturer').pivot('drv').count().sort('manufacturer').show()

+------------+----+----+----+
|manufacturer|   4|   f|   r|
+------------+----+----+----+
|        audi|  11|   7|null|
|   chevrolet|   4|   5|  10|
|       dodge|  26|  11|null|
|        ford|  13|null|  12|
|       honda|null|   9|null|
|     hyundai|null|  14|null|
|        jeep|   8|null|null|
|  land rover|   4|null|null|
|     lincoln|null|null|   3|
|     mercury|   4|null|null|
|      nissan|   4|   9|null|
|     pontiac|null|   5|null|
|      subaru|  14|null|null|
|      toyota|  15|  19|null|
|  volkswagen|null|  27|null|
+------------+----+----+----+



In [120]:
mpg.rollup('cyl', 'class').agg(mean('hwy')).sort('cyl', 'class').show()

+----+----------+------------------+
| cyl|     class|          avg(hwy)|
+----+----------+------------------+
|null|      null| 23.44017094017094|
|   4|      null| 28.80246913580247|
|   4|   compact|          29.46875|
|   4|   midsize|           29.1875|
|   4|   minivan|              24.0|
|   4|    pickup|20.666666666666668|
|   4|subcompact| 30.80952380952381|
|   4|       suv|             23.75|
|   5|      null|             28.75|
|   5|   compact|              29.0|
|   5|subcompact|              28.5|
|   6|      null| 22.82278481012658|
|   6|   compact|25.307692307692307|
|   6|   midsize| 26.26086956521739|
|   6|   minivan|              22.2|
|   6|    pickup|              17.9|
|   6|subcompact|24.714285714285715|
|   6|       suv|              18.5|
|   8|      null| 17.62857142857143|
|   8|   2seater|              24.8|
+----+----------+------------------+
only showing top 20 rows

