In [90]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql.functions import sum, mean, concat, lit, asc, desc, col, when, filter

# Seed NumPy's global RNG so the sample data is the same on every run.
np.random.seed(123)

# Draw the columns in a fixed order (n, group, abool): every draw advances the
# shared RNG state, so the order of these three lines determines the values.
row_count = 20
n_values = np.random.randn(row_count)
group_labels = np.random.choice(["x", "y", "z"], size=row_count)
bool_flags = np.random.choice([True, False], size=row_count)

pandas_dataframe = pd.DataFrame(
    {"n": n_values, "group": group_labels, "abool": bool_flags}
)

In [2]:
# Get the current SparkSession from the builder, creating one if needed.

spark_builder = pyspark.sql.SparkSession.builder
spark = spark_builder.getOrCreate()

In [3]:
# Convert the pandas frame to a Spark DataFrame; all later work uses `df`.
df = spark.createDataFrame(data=pandas_dataframe)


Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.
- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.
- Show the first 3 rows of the dataframe.
- Show the first 7 rows of the dataframe.
- View a summary of the data using .describe.
- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.
- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.
- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [4]:
# Preview the first 3 rows.
df.show(n=3)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.0856306033005612|    y|false|
| 0.9973454465835858|    x| true|
|0.28297849805199204|    x| true|
+-------------------+-----+-----+
only showing top 3 rows



In [5]:
# Preview the first 7 rows.
df.show(n=7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.0856306033005612|    y|false|
| 0.9973454465835858|    x| true|
|0.28297849805199204|    x| true|
| -1.506294713918092|    x|false|
|-0.5786002519685364|    z|false|
|  1.651436537097151|    y|false|
| -2.426679243393074|    y|false|
+-------------------+-----+-----+
only showing top 7 rows



In [6]:
# Summary statistics (count, mean, stddev, min, max); the boolean abool
# column does not appear in the summary.
summary_stats = df.describe()
summary_stats.show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.11441773195529023| null|
| stddev|  1.257452681756012| null|
|    min| -2.426679243393074|    x|
|    max| 2.2059300827254558|    z|
+-------+-------------------+-----+



In [7]:
# Keep only the n and abool columns, referenced by name.
df.select('n', 'abool').show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
|-1.0856306033005612|false|
| 0.9973454465835858| true|
|0.28297849805199204| true|
| -1.506294713918092|false|
|-0.5786002519685364|false|
+-------------------+-----+
only showing top 5 rows



In [8]:
# Keep only the group and abool columns, referenced by name.
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    y|false|
|    x| true|
|    x| true|
|    x|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [9]:
# Select group as-is and abool under the new name a_boolean_value.
renamed_abool = col('abool').alias('a_boolean_value')
df.select('group', renamed_abool).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    y|          false|
|    x|           true|
|    x|           true|
+-----+---------------+
only showing top 3 rows



In [10]:
# Select group as-is and n under the new name a_numeric_value.
renamed_n = df['n'].alias('a_numeric_value')
df.select(df['group'], renamed_n).show(6)

+-----+-------------------+
|group|    a_numeric_value|
+-----+-------------------+
|    y|-1.0856306033005612|
|    x| 0.9973454465835858|
|    x|0.28297849805199204|
|    x| -1.506294713918092|
|    z|-0.5786002519685364|
|    y|  1.651436537097151|
+-----+-------------------+
only showing top 6 rows



Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a variable named df.

- Use .select to add 4 to the n column. Show the results.

- Subtract 5 from the n column and view the results.

- Multiply the n column by 2. View the results along with the original numbers.

- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

- What happens when you run the code below?

- df.group + df.abool
- What happens when you run the code below? What is the difference between this and the previous code sample?

- df.select(df.group + df.abool)
- Try adding various other columns together. What are the results of combining the different data types?

In [11]:
# Arithmetic on a Column builds a new Column expression; select evaluates it.
df.select(col('n') + 4).show()

+------------------+
|           (n + 4)|
+------------------+
| 2.914369396699439|
| 4.997345446583585|
|4.2829784980519925|
| 2.493705286081908|
|3.4213997480314635|
| 5.651436537097151|
|1.5733207566069258|
|3.5710873711438227|
| 5.265936258705534|
|3.1332595977348983|
|3.3211138483779457|
| 3.905291031063109|
| 5.491389626124288|
| 3.361098003315349|
|3.5560180403539343|
|3.5656487243814827|
| 6.205930082725455|
| 6.186786088973786|
| 5.004053897878877|
| 4.386186399174856|
+------------------+



In [12]:
# Subtract 5 from every n value.
df.select(df['n'] - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -6.085630603300562|
| -4.002654553416415|
|-4.7170215019480075|
| -6.506294713918092|
|-5.5786002519685365|
| -3.348563462902849|
| -7.426679243393075|
| -5.428912628856177|
|-3.7340637412944657|
| -5.866740402265101|
| -5.678886151622054|
| -5.094708968936891|
| -3.508610373875712|
| -5.638901996684651|
| -5.443981959646066|
| -5.434351275618518|
|-2.7940699172745442|
|-2.8132139110262133|
| -3.995946102121123|
| -4.613813600825144|
+-------------------+



In [13]:
# Show the original n next to its doubled value.
doubled_n = df['n'] * 2
df.select('n', doubled_n).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
| -1.0856306033005612| -2.1712612066011223|
|  0.9973454465835858|  1.9946908931671716|
| 0.28297849805199204|  0.5659569961039841|
|  -1.506294713918092|  -3.012589427836184|
| -0.5786002519685364| -1.1572005039370727|
|   1.651436537097151|   3.302873074194302|
|  -2.426679243393074|  -4.853358486786148|
|-0.42891262885617726| -0.8578252577123545|
|   1.265936258705534|   2.531872517411068|
| -0.8667404022651017| -1.7334808045302035|
| -0.6788861516220543| -1.3577723032441087|
|-0.09470896893689112|-0.18941793787378225|
|  1.4913896261242878|  2.9827792522485757|
|  -0.638901996684651|  -1.277803993369302|
|-0.44398195964606546| -0.8879639192921309|
|-0.43435127561851733| -0.8687025512370347|
|  2.2059300827254558|  4.4118601654509115|
|  2.1867860889737867|   4.373572177947573|
|   1.004053897878877|   2.008107795757754|
|   0.386186399174856|   0.77237

In [14]:
n2 = (df.n * -1).alias('n2')

df = df.select('*', n2)

df.show(5)

+-------------------+-----+-----+--------------------+
|                  n|group|abool|                  n2|
+-------------------+-----+-----+--------------------+
|-1.0856306033005612|    y|false|  1.0856306033005612|
| 0.9973454465835858|    x| true| -0.9973454465835858|
|0.28297849805199204|    x| true|-0.28297849805199204|
| -1.506294713918092|    x|false|   1.506294713918092|
|-0.5786002519685364|    z|false|  0.5786002519685364|
+-------------------+-----+-----+--------------------+
only showing top 5 rows



In [15]:
# n3 is n squared.
n3 = (df.n ** 2).alias('n3')

# withColumn replaces an existing n3 instead of appending a second column with
# the same name, so re-running this cell leaves the schema unchanged.
df = df.withColumn('n3', n3)

df.show(5)

+-------------------+-----+-----+--------------------+-------------------+
|                  n|group|abool|                  n2|                 n3|
+-------------------+-----+-----+--------------------+-------------------+
|-1.0856306033005612|    y|false|  1.0856306033005612| 1.1785938068227404|
| 0.9973454465835858|    x| true| -0.9973454465835858| 0.9946979398210122|
|0.28297849805199204|    x| true|-0.28297849805199204|0.08007683035976126|
| -1.506294713918092|    x|false|   1.506294713918092| 2.2689237651775866|
|-0.5786002519685364|    z|false|  0.5786002519685364| 0.3347782515780538|
+-------------------+-----+-----+--------------------+-------------------+
only showing top 5 rows



In [16]:
# Adding two columns outside of a DataFrame method only yields a Column
# object (see the repr below); no row values are shown.
df.group + df.abool

Column<'(group + abool)'>

-----------

## Type casting

- Use the starter code above to re-create a spark dataframe.

- Use .printSchema to view the datatypes in your dataframe.

- Use .dtypes to view the datatypes in your dataframe.

- What is the difference between the two code samples below?

    - df.abool.cast('int')
    - df.select(df.abool.cast('int')).show()
    - Use .select and .cast to convert the abool column to an integer type. View the results.


- Convert the group column to an integer data type and view the results. What happens?

- Convert the n column to an integer data type and view the results. What happens?

- Convert the abool column to a string data type and view the results. What happens?

------

In [18]:
# Print the schema as a tree: one line per column with its type and nullability.
# NOTE(review): df does not appear to have been re-created for this section,
# so the n2 and n3 columns added earlier are still listed.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)



In [21]:
# The same type information as a list of (column name, type name) tuples.
df.dtypes

[('n', 'double'),
 ('group', 'string'),
 ('abool', 'boolean'),
 ('n2', 'double'),
 ('n3', 'double')]

In [29]:
# The group values are letters, which cannot be read as integers, so the cast
# yields null for every row shown (no error is raised).
df.select(df.group.cast("int")).show(5)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



In [31]:
# Casting the double column to int drops the fractional part (truncates toward
# zero rather than rounding): -1.08 -> -1, 0.99 -> 0, -0.57 -> 0.
df.select(df.n.cast("int")).show(5)

+---+
|  n|
+---+
| -1|
|  0|
|  0|
| -1|
|  0|
+---+
only showing top 5 rows



In [33]:
df.abool.cast('int').show(5)

TypeError: 'Column' object is not callable

In [34]:
# Passing the cast expression to select puts it in a DataFrame, which can be
# shown: false becomes 0 and true becomes 1.
abool_as_int = df['abool'].cast('int')
df.select(abool_as_int).show(5)

+-----+
|abool|
+-----+
|    0|
|    1|
|    1|
|    0|
|    0|
+-----+
only showing top 5 rows



---------

## Built-in Functions

- Use the starter code above to re-create a spark dataframe.
- Import the necessary functions from pyspark.sql.functions
- Find the highest n value.
- Find the lowest n value.
- Find the average n value.
- Use concat to change the group column to say, e.g. "Group: x" or "Group: y"
- Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

-------

In [40]:
df.sort(desc(df.n)).show(5)

+------------------+-----+-----+-------------------+------------------+
|                 n|group|abool|                 n2|                n3|
+------------------+-----+-----+-------------------+------------------+
|2.2059300827254558|    z| true|-2.2059300827254558| 4.866127529873136|
|2.1867860889737867|    z|false|-2.1867860889737867|  4.78203339892927|
| 1.651436537097151|    y|false| -1.651436537097151|  2.72724263605943|
|1.4913896261242878|    x|false|-1.4913896261242878| 2.224243016911143|
| 1.265936258705534|    z| true| -1.265936258705534|1.6025946111053648|
+------------------+-----+-----+-------------------+------------------+
only showing top 5 rows



In [41]:
df.sort(asc(df.n)).show(5)

+-------------------+-----+-----+------------------+-------------------+
|                  n|group|abool|                n2|                 n3|
+-------------------+-----+-----+------------------+-------------------+
| -2.426679243393074|    y|false| 2.426679243393074|  5.888772150314783|
| -1.506294713918092|    x|false| 1.506294713918092| 2.2689237651775866|
|-1.0856306033005612|    y|false|1.0856306033005612| 1.1785938068227404|
|-0.8667404022651017|    z| true|0.8667404022651017| 0.7512389249186704|
|-0.6788861516220543|    y| true|0.6788861516220543|0.46088640686420296|
+-------------------+-----+-----+------------------+-------------------+
only showing top 5 rows



In [44]:
# Average n value via the mean aggregate.
average_n = mean(df.n)
df.select(average_n).show()

+-------------------+
|             avg(n)|
+-------------------+
|0.11441773195529023|
+-------------------+



In [73]:
df.select(df.group.alias('Group: x or Group: y')).show()

+--------------------+
|Group: x or Group: y|
+--------------------+
|                   y|
|                   x|
|                   x|
|                   x|
|                   z|
|                   y|
|                   y|
|                   z|
|                   z|
|                   z|
|                   y|
|                   x|
|                   x|
|                   z|
|                   y|
|                   x|
|                   z|
|                   z|
|                   z|
|                   z|
+--------------------+



In [74]:
df.select(concat(df.group, df.n)).show(5)

+--------------------+
|    concat(group, n)|
+--------------------+
|y-1.0856306033005612|
| x0.9973454465835858|
|x0.28297849805199204|
| x-1.506294713918092|
|z-0.5786002519685364|
+--------------------+
only showing top 5 rows



----------

## When / Otherwise

- Use the starter code above to re-create a spark dataframe.
- Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false" when abool is false.
- Create a column that contains 0 if n is less than 0, otherwise, the original n value.

--------

In [79]:
# abool is already a boolean column, so it is used directly as the condition
# instead of being compared with the string "true".
abool_new = when(df.abool, 'It is true').otherwise('It is false').alias('abool_new')
df.select(df.abool, abool_new).show(5)

+-----+-----------+
|abool|  abool_new|
+-----+-----------+
|false|It is false|
| true| It is true|
| true| It is true|
|false|It is false|
|false|It is false|
+-----+-----------+
only showing top 5 rows



In [81]:
df.select(df.n, when(df.n < 0, '0').otherwise(df.n).alias('new_n')).show(5)

+-------------------+-------------------+
|                  n|              new_n|
+-------------------+-------------------+
|-1.0856306033005612|                  0|
| 0.9973454465835858| 0.9973454465835858|
|0.28297849805199204|0.28297849805199204|
| -1.506294713918092|                  0|
|-0.5786002519685364|                  0|
+-------------------+-------------------+
only showing top 5 rows



-----------

## Filter / Where

- Use the starter code above to re-create a spark dataframe.
- Use .filter or .where to select just the rows where the group is y and view the results.
- Select just the rows where the abool column is false and view the results.
- Find the rows where the group column is not y.
- Find the rows where n is positive.
- Find the rows where abool is true and the group column is z.
- Find the rows where abool is true or the group column is z.
- Find the rows where abool is false and n is less than 1.
- Find the rows where abool is false or n is less than 1.

-------

In [82]:
# Keep only the rows whose group is y.
is_group_y = col('group') == 'y'
df.filter(is_group_y).show(5)

+--------------------+-----+-----+-------------------+-------------------+
|                   n|group|abool|                 n2|                 n3|
+--------------------+-----+-----+-------------------+-------------------+
| -1.0856306033005612|    y|false| 1.0856306033005612| 1.1785938068227404|
|   1.651436537097151|    y|false| -1.651436537097151|   2.72724263605943|
|  -2.426679243393074|    y|false|  2.426679243393074|  5.888772150314783|
| -0.6788861516220543|    y| true| 0.6788861516220543|0.46088640686420296|
|-0.44398195964606546|    y| true|0.44398195964606546| 0.1971199804911605|
+--------------------+-----+-----+-------------------+-------------------+



In [83]:
# abool is a boolean column, so negate it directly rather than comparing it
# with the string 'false'.
df.where(~df.abool).show(5)

+-------------------+-----+-----+------------------+------------------+
|                  n|group|abool|                n2|                n3|
+-------------------+-----+-----+------------------+------------------+
|-1.0856306033005612|    y|false|1.0856306033005612|1.1785938068227404|
| -1.506294713918092|    x|false| 1.506294713918092|2.2689237651775866|
|-0.5786002519685364|    z|false|0.5786002519685364|0.3347782515780538|
|  1.651436537097151|    y|false|-1.651436537097151|  2.72724263605943|
| -2.426679243393074|    y|false| 2.426679243393074| 5.888772150314783|
+-------------------+-----+-----+------------------+------------------+
only showing top 5 rows



In [84]:
# Keep every row whose group is something other than y.
not_group_y = col('group') != 'y'
df.filter(not_group_y).show(5)

+--------------------+-----+-----+--------------------+-------------------+
|                   n|group|abool|                  n2|                 n3|
+--------------------+-----+-----+--------------------+-------------------+
|  0.9973454465835858|    x| true| -0.9973454465835858| 0.9946979398210122|
| 0.28297849805199204|    x| true|-0.28297849805199204|0.08007683035976126|
|  -1.506294713918092|    x|false|   1.506294713918092| 2.2689237651775866|
| -0.5786002519685364|    z|false|  0.5786002519685364| 0.3347782515780538|
|-0.42891262885617726|    z| true| 0.42891262885617726|0.18396604319231685|
+--------------------+-----+-----+--------------------+-------------------+
only showing top 5 rows



In [85]:
# "Positive" excludes zero, so the comparison is strict.
df.where(df.n > 0).show(4)

+-------------------+-----+-----+--------------------+-------------------+
|                  n|group|abool|                  n2|                 n3|
+-------------------+-----+-----+--------------------+-------------------+
| 0.9973454465835858|    x| true| -0.9973454465835858| 0.9946979398210122|
|0.28297849805199204|    x| true|-0.28297849805199204|0.08007683035976126|
|  1.651436537097151|    y|false|  -1.651436537097151|   2.72724263605943|
|  1.265936258705534|    z| true|  -1.265936258705534| 1.6025946111053648|
+-------------------+-----+-----+--------------------+-------------------+
only showing top 4 rows



In [86]:
# Rows where abool is true AND group is z. abool is a boolean column, so it is
# used as a condition directly instead of being compared with 'true'. The
# comparison is parenthesized because & binds tighter than ==.
df.where(df.abool & (df.group == 'z')).show(5)

+--------------------+-----+-----+-------------------+-------------------+
|                   n|group|abool|                 n2|                 n3|
+--------------------+-----+-----+-------------------+-------------------+
|-0.42891262885617726|    z| true|0.42891262885617726|0.18396604319231685|
|   1.265936258705534|    z| true| -1.265936258705534| 1.6025946111053648|
| -0.8667404022651017|    z| true| 0.8667404022651017| 0.7512389249186704|
|  -0.638901996684651|    z| true|  0.638901996684651|0.40819576136763375|
|  2.2059300827254558|    z| true|-2.2059300827254558|  4.866127529873136|
+--------------------+-----+-----+-------------------+-------------------+
only showing top 5 rows



In [94]:
# Rows where abool is true OR group is z. abool is a boolean column, so it is
# used as a condition directly instead of being compared with 'true'. The
# comparison is parenthesized because | binds tighter than ==.
df.filter(df.abool | (df.group == 'z')).show(5)

+--------------------+-----+-----+--------------------+-------------------+
|                   n|group|abool|                  n2|                 n3|
+--------------------+-----+-----+--------------------+-------------------+
|  0.9973454465835858|    x| true| -0.9973454465835858| 0.9946979398210122|
| 0.28297849805199204|    x| true|-0.28297849805199204|0.08007683035976126|
| -0.5786002519685364|    z|false|  0.5786002519685364| 0.3347782515780538|
|-0.42891262885617726|    z| true| 0.42891262885617726|0.18396604319231685|
|   1.265936258705534|    z| true|  -1.265936258705534| 1.6025946111053648|
+--------------------+-----+-----+--------------------+-------------------+
only showing top 5 rows



In [95]:
# Rows where abool is false AND n is below 1. The boolean column is negated
# directly instead of being compared with the string 'false'.
df.where((~df.abool) & (df.n < 1)).show(5)

+--------------------+-----+-----+-------------------+--------------------+
|                   n|group|abool|                 n2|                  n3|
+--------------------+-----+-----+-------------------+--------------------+
| -1.0856306033005612|    y|false| 1.0856306033005612|  1.1785938068227404|
|  -1.506294713918092|    x|false|  1.506294713918092|  2.2689237651775866|
| -0.5786002519685364|    z|false| 0.5786002519685364|  0.3347782515780538|
|  -2.426679243393074|    y|false|  2.426679243393074|   5.888772150314783|
|-0.09470896893689112|    x|false|0.09470896893689112|0.008969788797089009|
+--------------------+-----+-----+-------------------+--------------------+
only showing top 5 rows



In [96]:
# Rows where abool is false OR n is below 1. The boolean column is negated
# directly instead of being compared with the string 'false'.
df.filter((~df.abool) | (df.n < 1)).show(5)

+-------------------+-----+-----+--------------------+-------------------+
|                  n|group|abool|                  n2|                 n3|
+-------------------+-----+-----+--------------------+-------------------+
|-1.0856306033005612|    y|false|  1.0856306033005612| 1.1785938068227404|
| 0.9973454465835858|    x| true| -0.9973454465835858| 0.9946979398210122|
|0.28297849805199204|    x| true|-0.28297849805199204|0.08007683035976126|
| -1.506294713918092|    x|false|   1.506294713918092| 2.2689237651775866|
|-0.5786002519685364|    z|false|  0.5786002519685364| 0.3347782515780538|
+-------------------+-----+-----+--------------------+-------------------+
only showing top 5 rows



--------

## Sorting

- Use the starter code above to re-create a spark dataframe.
- Sort by the n value.
- Sort by the group value, both ascending and descending.
- Sort by the group value first, then, within each group, sort by n value.
- Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

-------

In [98]:
# Order rows by n (ascending is the default).
df.orderBy('n').show(5)

+-------------------+-----+-----+------------------+-------------------+
|                  n|group|abool|                n2|                 n3|
+-------------------+-----+-----+------------------+-------------------+
| -2.426679243393074|    y|false| 2.426679243393074|  5.888772150314783|
| -1.506294713918092|    x|false| 1.506294713918092| 2.2689237651775866|
|-1.0856306033005612|    y|false|1.0856306033005612| 1.1785938068227404|
|-0.8667404022651017|    z| true|0.8667404022651017| 0.7512389249186704|
|-0.6788861516220543|    y| true|0.6788861516220543|0.46088640686420296|
+-------------------+-----+-----+------------------+-------------------+
only showing top 5 rows



In [99]:
# Order rows by group, ascending.
group_ascending = df.group.asc()
df.orderBy(group_ascending).show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|-0.43435127561851733|    x| true| 0.43435127561851733| 0.18866103063143322|
|  0.9973454465835858|    x| true| -0.9973454465835858|  0.9946979398210122|
|-0.09470896893689112|    x|false| 0.09470896893689112|0.008969788797089009|
|  -1.506294713918092|    x|false|   1.506294713918092|  2.2689237651775866|
| 0.28297849805199204|    x| true|-0.28297849805199204| 0.08007683035976126|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



In [100]:
# Order rows by group, descending.
group_descending = df.group.desc()
df.orderBy(group_descending).show(5)

+-------------------+-----+-----+-------------------+-------------------+
|                  n|group|abool|                 n2|                 n3|
+-------------------+-----+-----+-------------------+-------------------+
|  1.004053897878877|    z| true| -1.004053897878877| 1.0081242298457667|
| 2.1867860889737867|    z|false|-2.1867860889737867|   4.78203339892927|
|-0.5786002519685364|    z|false| 0.5786002519685364| 0.3347782515780538|
| -0.638901996684651|    z| true|  0.638901996684651|0.40819576136763375|
| 2.2059300827254558|    z| true|-2.2059300827254558|  4.866127529873136|
+-------------------+-----+-----+-------------------+-------------------+
only showing top 5 rows



In [101]:
# Order by group first, then by n within each group.
df.orderBy('group', 'n').show(10)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -1.506294713918092|    x|false|   1.506294713918092|  2.2689237651775866|
|-0.43435127561851733|    x| true| 0.43435127561851733| 0.18866103063143322|
|-0.09470896893689112|    x|false| 0.09470896893689112|0.008969788797089009|
| 0.28297849805199204|    x| true|-0.28297849805199204| 0.08007683035976126|
|  0.9973454465835858|    x| true| -0.9973454465835858|  0.9946979398210122|
|  1.4913896261242878|    x|false| -1.4913896261242878|   2.224243016911143|
|  -2.426679243393074|    y|false|   2.426679243393074|   5.888772150314783|
| -1.0856306033005612|    y|false|  1.0856306033005612|  1.1785938068227404|
| -0.6788861516220543|    y| true|  0.6788861516220543| 0.46088640686420296|
|-0.44398195964606546|    y| true| 0.44398195964606546|  0.1971199804911605|

In [102]:
# Sort keys in priority order: abool, then group, then n.
sort_keys = ['abool', 'group', 'n']
df.orderBy(sort_keys).show(10)

+--------------------+-----+-----+-------------------+--------------------+
|                   n|group|abool|                 n2|                  n3|
+--------------------+-----+-----+-------------------+--------------------+
|  -1.506294713918092|    x|false|  1.506294713918092|  2.2689237651775866|
|-0.09470896893689112|    x|false|0.09470896893689112|0.008969788797089009|
|  1.4913896261242878|    x|false|-1.4913896261242878|   2.224243016911143|
|  -2.426679243393074|    y|false|  2.426679243393074|   5.888772150314783|
| -1.0856306033005612|    y|false| 1.0856306033005612|  1.1785938068227404|
|   1.651436537097151|    y|false| -1.651436537097151|    2.72724263605943|
| -0.5786002519685364|    z|false| 0.5786002519685364|  0.3347782515780538|
|   0.386186399174856|    z|false| -0.386186399174856| 0.14913993490764121|
|  2.1867860889737867|    z|false|-2.1867860889737867|    4.78203339892927|
|-0.43435127561851733|    x| true|0.43435127561851733| 0.18866103063143322|
+-----------

In [103]:
# Same three keys in a different priority order: group, then abool, then n.
sort_keys = ['group', 'abool', 'n']
df.orderBy(sort_keys).show(10)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -1.506294713918092|    x|false|   1.506294713918092|  2.2689237651775866|
|-0.09470896893689112|    x|false| 0.09470896893689112|0.008969788797089009|
|  1.4913896261242878|    x|false| -1.4913896261242878|   2.224243016911143|
|-0.43435127561851733|    x| true| 0.43435127561851733| 0.18866103063143322|
| 0.28297849805199204|    x| true|-0.28297849805199204| 0.08007683035976126|
|  0.9973454465835858|    x| true| -0.9973454465835858|  0.9946979398210122|
|  -2.426679243393074|    y|false|   2.426679243393074|   5.888772150314783|
| -1.0856306033005612|    y|false|  1.0856306033005612|  1.1785938068227404|
|   1.651436537097151|    y|false|  -1.651436537097151|    2.72724263605943|
| -0.6788861516220543|    y| true|  0.6788861516220543| 0.46088640686420296|

Yes, the order matters: the first column listed is the primary sort key, and each later column only orders the rows that tie on all of the columns before it.