## Lesson

In [245]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
import pandas as pd
from pydataset import data
from pyspark.sql.functions import min,max,avg,sum,concat, lit, round
from pyspark.sql.functions import when

In [21]:
mpg = spark.createDataFrame(data('mpg'))
mpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [23]:
# can reference columns in different ways
mpg.select('manufacturer', mpg.cty, mpg.hwy.alias('hwy_mileage')).show(5)

+------------+---+-----------+
|manufacturer|cty|hwy_mileage|
+------------+---+-----------+
|        audi| 18|         29|
|        audi| 21|         29|
|        audi| 20|         31|
|        audi| 21|         30|
|        audi| 16|         26|
+------------+---+-----------+
only showing top 5 rows



In [20]:
#show first 5
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



## Spark API Mini Exercises


Copy the code below to create a pandas dataframe with 20 rows and 3 columns:

In [82]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

### I. Spark Dataframe Basics:

1) Use the starter code above to create a pandas dataframe.

In [83]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


2) Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [84]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

3) Show the first 3 rows of the dataframe.

In [85]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



4) Show the first 7 rows of the dataframe.

In [86]:
df.select('*').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



5) View a summary of the data using .describe.

In [87]:
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



6) Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [88]:
n_bool = df.select('n','abool')
n_bool.show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



7) Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [89]:
group_bool = df.drop('n')
group_bool.show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



8) Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [90]:
group_bool_new = df.select('group',df.abool.alias('a_boolean_value'))
group_bool_new.show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



9) Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [91]:
group_n = df.select(df.group, df.n.alias('a_numeric_value'))
group_n.show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



### II. Column Manipulation:

1) Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

In [51]:
df = spark.createDataFrame(pandas_dataframe)

2) Use .select to add 4 to the n column. Show the results.

In [55]:
plus_4 = df.n + 4
df.select(plus_4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



3) Subtract 5 from the n column and view the results.

In [57]:
minus_5 = df.n - 5
df.select(minus_5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



4) Multiply the n column by 2. View the results along with the original numbers.

In [94]:
times_2 = df.n * 2
df.select(df.n, times_2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



5) Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [95]:
n2 = df.n * (-1)
df = df.select('*', n2.alias('n2'))
df.show(4)

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|
|   0.753766378659703|    x|false|  -0.753766378659703|
|-0.04450307833805...|    z|false|0.044503078338053455|
| 0.45181233874578974|    y|false|-0.45181233874578974|
+--------------------+-----+-----+--------------------+
only showing top 4 rows



6) Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [96]:
n3 = df.n ** 2
df = df.select('*', n3.alias('n3'))
df.show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



7) What happens when you run the code below?

> `df.group + df.abool`

In [72]:
df.group+df.abool

Column<b'(group + abool)'>

8) What happens when you run the code below? What is the difference between this and the previous code sample?

> `df.select(df.group + df.abool)`

In [73]:
df.select(df.group + df.abool)

AnalysisException: "cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;;\n'Project [(cast(group#740 as double) + abool#741) AS (group + abool)#857]\n+- Project [n#739, group#740, abool#741, n2#817, POWER(n#739, cast(2 as double)) AS n3#835]\n   +- Project [n#739, group#740, abool#741, (n#739 * cast(-1 as double)) AS n2#817]\n      +- LogicalRDD [n#739, group#740, abool#741], false\n"

9) Try adding various other columns together. What are the results of combining the different data types?

In [97]:
df.select(df.n * df.n3).show(5)

+--------------------+
|            (n * n3)|
+--------------------+
| -0.3615385853969069|
|  0.4282627350350894|
|-8.81394139018882...|
|  0.0922304359126587|
|   2.433690646133313|
+--------------------+
only showing top 5 rows



### III. Spark SQL

1) Use the starter code above to re-create a spark dataframe.

In [98]:
df = spark.createDataFrame(pandas_dataframe)

2) Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.

In [101]:
df.createOrReplaceTempView('my_df')

3) Write a query that shows all of the columns from your dataframe.

In [103]:
spark.sql('''
SELECT *
FROM my_df
''').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



4) Write a query that shows just the n and abool columns from the dataframe.

In [104]:
spark.sql('''
SELECT n, abool
FROM my_df
''').show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



5) Write a query that shows just the n and group columns. Rename the group column to g.

In [107]:
spark.sql('''
    SELECT n, group as g
    FROM my_df
''').show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



6) Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.

In [109]:
spark.sql('''
    SELECT n, (n/2) AS n2, (n-1) AS n3
    from my_df
''').show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
+--------------------+--------------------+--------------------+
only showing top 7 rows



7) What happens if you make a SQL syntax error in your query?

In [117]:
#lets see
spark.sql('''
SELECT n * group
FROM my_df
''').show(7)

+---------------------------+
|(n * CAST(group AS DOUBLE))|
+---------------------------+
|                       null|
|                       null|
|                       null|
|                       null|
|                       null|
|                       null|
|                       null|
+---------------------------+
only showing top 7 rows



### IV. Type casting

1) Use the starter code above to re-create a spark dataframe.

In [118]:
df = spark.createDataFrame(pandas_dataframe)

2) Use .printSchema to view the datatypes in your dataframe.

In [120]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



3) Use .dtypes to view the datatypes in your dataframe.

In [122]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

4) What is the difference between the two code samples below?

>`df.abool.cast('int')`

>`df.select(df.abool.cast('int')).show()`

the first defines a transformation but does not actually apply any action

5) Use .select and .cast to convert the abool column to an integer type. View the results.

In [127]:
df.select(df.abool.cast('int')).show(7)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 7 rows



6) Convert the group column to a integer data type and view the results. What happens?

In [129]:
int_group = df.group.cast('int')
int_group

Column<b'CAST(group AS INT)'>

In [130]:
df.select(int_group).show(7)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 7 rows



7) Convert the n column to a integer data type and view the results. What happens?

In [131]:
int_n = df.n.cast('int')
int_n

Column<b'CAST(n AS INT)'>

In [133]:
df.select(int_n).show(7)

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
|  0|
|  1|
+---+
only showing top 7 rows



8) Convert the abool column to a string data type and view the results. What happens?

In [136]:
str_abool = df.abool.cast('string')
str_abool

Column<b'CAST(abool AS STRING)'>

In [137]:
df.select(str_abool).show(7)

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
|false|
|false|
+-----+
only showing top 7 rows



### V. Built-in Functions

1) Use the starter code above to re-create a spark dataframe.

In [138]:
df = spark.createDataFrame(pandas_dataframe)

2) Import the necessary functions from pyspark.sql.functions

In [156]:
from pyspark.sql.functions import min,max,avg,sum,concat, lit, round

3) Find the highest n value.

In [141]:
df.select(max(df.n)).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



4) Find the lowest n value.

In [142]:
df.select(min('n')).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



5) Find the average n value.

In [143]:
df.select(avg('n')).show()

+-------------------+
|             avg(n)|
+-------------------+
|0.36640264498852165|
+-------------------+



6) Use concat to change the group column to say, e.g. "Group: x" or "Group: y"

In [158]:
df.select(concat(lit('Group: '), 'group')).show(7)

+----------------------+
|concat(Group: , group)|
+----------------------+
|              Group: z|
|              Group: x|
|              Group: z|
|              Group: y|
|              Group: z|
|              Group: y|
|              Group: z|
+----------------------+
only showing top 7 rows



7) Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [247]:
new_col = concat(df.group, lit(': '), round(df.n, 3)).alias('group_n')
df.select(new_col).show(7)

+---------+
|  group_n|
+---------+
|z: -0.712|
| x: 0.754|
|z: -0.045|
| y: 0.452|
| z: 1.345|
| y: 0.532|
|  z: 1.35|
+---------+
only showing top 7 rows



### VI. Filter / Where

1) Use the starter code above to re-create a spark dataframe.

In [169]:
df = spark.createDataFrame(pandas_dataframe)

2) Use .filter or .where to select just the rows where the group is y and view the results.

In [176]:
df.filter(df.group == 'y').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
+--------------------+-----+-----+
only showing top 7 rows



3) Select just the columns where the abool column is false and view the results.

In [175]:
df.filter(~ df.abool).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



4) Find the columns where the group column is not y.

In [181]:
df.filter(~ (df.group == 'y')).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+
only showing top 7 rows



5) Find the columns where n is positive.

In [183]:
df.filter(df.n > 0).show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
| 0.8612113741693206|    x|false|
| 1.4786857374358966|    z| true|
+-------------------+-----+-----+
only showing top 7 rows



6) Find the columns where abool is true and the group column is z.

In [185]:
df.filter(df.abool & (df.group == 'z')).show(7)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



7) Find the columns where abool is true or the group column is z.

In [187]:
df.filter(df.abool | (df.group == 'z')).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
+--------------------+-----+-----+
only showing top 7 rows



8) Find the columns where abool is false and n is less than 1

In [188]:
df.filter(~df.abool & (df.n < 1)).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
+--------------------+-----+-----+
only showing top 7 rows



9) Find the columns where abool is false or n is less than 1

In [192]:
df.filter(~df.abool | (df.n < 1)).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



### VII. When / Otherwise

1) Use the starter code above to re-create a spark dataframe.

In [193]:
df = spark.createDataFrame(pandas_dataframe)

2) Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.

In [209]:
from pyspark.sql.functions import when

df.select(df.abool, when(df.abool, 'It is True').otherwise('It is False').alias('when')).show(7)

+-----+-----------+
|abool|       when|
+-----+-----------+
|false|It is False|
|false|It is False|
|false|It is False|
|false|It is False|
|false|It is False|
|false|It is False|
|false|It is False|
+-----+-----------+
only showing top 7 rows



3) Create a column that contains 0 if n is less than 0, otherwise, the original n value.

In [214]:
df.select(df.n, when(df.n < 0, 0).otherwise(df.n).alias('no_negs')).show()

+--------------------+-------------------+
|                   n|            no_negs|
+--------------------+-------------------+
|  -0.712390662050588|                0.0|
|   0.753766378659703|  0.753766378659703|
|-0.04450307833805...|                0.0|
| 0.45181233874578974|0.45181233874578974|
|  1.3451017084510097| 1.3451017084510097|
|  0.5323378882945463| 0.5323378882945463|
|  1.3501878997225267| 1.3501878997225267|
|  0.8612113741693206| 0.8612113741693206|
|  1.4786857374358966| 1.4786857374358966|
| -1.0453771305385342|                0.0|
| -0.7889890249515489|                0.0|
|  -1.261605945319069|                0.0|
|  0.5628467852810314| 0.5628467852810314|
|-0.24332625188556253|                0.0|
|  0.9137407048596775| 0.9137407048596775|
| 0.31735092273633597|0.31735092273633597|
| 0.12730328020698067|0.12730328020698067|
|  2.1503829673811126| 2.1503829673811126|
|  0.6062886568962988| 0.6062886568962988|
|-0.02677164998644...|                0.0|
+----------

### VIII. Sorting

1) Use the starter code above to re-create a spark dataframe.

In [215]:
df = spark.createDataFrame(pandas_dataframe)

2) Sort by the n value.

In [222]:
df.sort('n').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+
only showing top 7 rows



3) Sort by the group value, both ascending and descending.

In [227]:
df.sort('group').show(7), df.sort('group', ascending = False).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.02677164998644...|    x| true|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
|  0.6062886568962988|    x|false|
| 0.31735092273633597|    x|false|
|  0.5628467852810314|    y| true|
+--------------------+-----+-----+
only showing top 7 rows

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.04450307833805...|    z|false|
|  1.4786857374358966|    z| true|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  -0.712390662050588|    z|false|
| 0.12730328020698067|    z|false|
|-0.24332625188556253|    y| true|
+--------------------+-----+-----+
only showing top 7 rows



(None, None)

4) Sort by the group value first, then, within each group, sort by n value.

In [232]:
df.sort('group','n').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
+--------------------+-----+-----+
only showing top 7 rows



5) Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

In [237]:
df.sort('abool','group','n').show(7), df.sort('abool','n','group').show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-0.7889890249515489|    x|false|
|0.31735092273633597|    x|false|
| 0.6062886568962988|    x|false|
|  0.753766378659703|    x|false|
| 0.8612113741693206|    x|false|
| -1.261605945319069|    y|false|
|0.45181233874578974|    y|false|
+-------------------+-----+-----+
only showing top 7 rows

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 7 rows



(None, None)

### IX. Aggregating

1) What is the average n value for each group in the group column?

In [239]:
df.groupBy('group').agg(avg('n')).show()

+-----+-------------------+
|group|             avg(n)|
+-----+-------------------+
|    x|0.28714277625394485|
|    z|  0.590730814237962|
|    y| 0.2576014196023739|
+-----+-------------------+



2) What is the maximum n value for each group in the group column?

In [244]:
df.groupBy('group').agg(max('n')).show()

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    x|0.8612113741693206|
|    z|1.4786857374358966|
|    y|2.1503829673811126|
+-----+------------------+



3) What is the minimum n value by abool?

In [251]:
df.groupBy('abool').agg(min('n')).show()

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
| true|-1.0453771305385342|
|false| -1.261605945319069|
+-----+-------------------+



4) What is the average n value for each unique combination of the group and abool column?

In [254]:
df.groupBy('abool','group').agg(avg('n')).show()

+-----+-----+--------------------+
|abool|group|              avg(n)|
+-----+-----+--------------------+
| true|    z|  1.4786857374358966|
| true|    x|-0.02677164998644...|
|false|    z| 0.41313982959837514|
|false|    y| 0.15907124664523611|
| true|    y| 0.35613159255951177|
|false|    x|   0.349925661502022|
+-----+-----+--------------------+



<hr></hr>

**Which car is the coolest?**

In [271]:
pd_df = data('mpg')
mpg = spark.createDataFrame(pd_df)
mpg.filter(mpg['class'] == 'minivan').show()

+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|       dodge|caravan 2wd|  2.4|1999|  4|auto(l3)|  f| 18| 24|  r|minivan|
|       dodge|caravan 2wd|  3.0|1999|  6|auto(l4)|  f| 17| 24|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|2008|  6|auto(l4)|  f| 17| 24|  r|minivan|
|       dodge|caravan 2wd|  3.3|2008|  6|auto(l4)|  f| 17| 24|  r|minivan|
|       dodge|caravan 2wd|  3.3|2008|  6|auto(l4)|  f| 11| 17|  e|minivan|
|       dodge|caravan 2wd|  3.8|1999|  6|auto(l4)|  f| 15| 22|  r|minivan|
|       dodge|caravan 2wd|  3.8|1999|  6|auto(l4)|  f| 15| 21|  r|minivan|
|       dodge|caravan 2wd|  3.8|2008|  6|auto(l6)|  f| 16| 23|  r|minivan|
|       dodge|caravan 2wd

Column<b'class'>