In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

import pandas as pd
import numpy as np

# 1. Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.

In [16]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

- Convert the pandas dataframe to a spark dataframe. From this pointforward, do all of your work with the spark dataframe, not the pandas dataframe.


In [17]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

- Show the first 3 rows of the dataframe.


In [18]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|0.059580016413558756|    y| true|
|-0.14438333575280743|    y| true|
| 0.18820665047642599|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



- Show the first 7 rows of the dataframe.


In [19]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|0.059580016413558756|    y| true|
|-0.14438333575280743|    y| true|
| 0.18820665047642599|    z|false|
| -0.5387728122665513|    x| true|
| -0.6283104660402473|    z| true|
|  -1.952541184161492|    z|false|
| -0.2768465919168419|    x| true|
+--------------------+-----+-----+
only showing top 7 rows



- View a summary of the data using `.describe`.


In [20]:
df.describe().show()

+-------+--------------------+-----+
|summary|                   n|group|
+-------+--------------------+-----+
|  count|                  20|   20|
|   mean|-0.19246628406661304| null|
| stddev|  0.7614096200861946| null|
|    min|  -1.952541184161492|    x|
|    max|  0.9483800040154895|    z|
+-------+--------------------+-----+



- Use `.select` to create a new dataframe with just the `n` and `abool` columns. View the first 5 rows of this dataframe.


In [10]:
df.select('n', 'abool').show()

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -1.748121098440469| true|
|  1.0489007446291436| true|
|   0.811213950663023|false|
|-0.44691940582022804|false|
|  -0.788282387030002|false|
|  1.1628909587577554| true|
| -0.1517425396227504| true|
|-0.18643580066904522| true|
| -1.5655970614940897| true|
| 0.24636122484049086|false|
|-0.03791876197241527| true|
| -0.1715863514186417|false|
| -0.7898457720264014| true|
|  1.2958479228264501|false|
|  1.0317108674974154|false|
|  1.4122890969159267|false|
|  1.2762895942230839| true|
|-0.24769852665924638| true|
|  0.5381738010235696| true|
|  1.1380773290966901|false|
+--------------------+-----+



- Use `.select` to create a new dataframe with just the `group` and `abool`columns. View the first 5 rows of this dataframe.


In [11]:
df.select('group', 'abool').show()

+-----+-----+
|group|abool|
+-----+-----+
|    x| true|
|    z| true|
|    y|false|
|    x|false|
|    x|false|
|    y| true|
|    z| true|
|    y| true|
|    z| true|
|    z|false|
|    y| true|
|    z|false|
|    y| true|
|    x|false|
|    y|false|
|    y|false|
|    y| true|
|    x| true|
|    z| true|
|    x|false|
+-----+-----+



- Use `.select` to create a new dataframe with the `group` `abool` column renamed to `a_boolean_value`. Show the first 3 rows ofthis dataframe.

In [12]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    x|           true|
|    z|           true|
|    y|          false|
+-----+---------------+
only showing top 3 rows



- Use `.select` to create a new dataframe with the `group` column and the `n` column renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.

In [13]:
df.select(df.group, df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    x|  -1.748121098440469|
|    z|  1.0489007446291436|
|    y|   0.811213950663023|
|    x|-0.44691940582022804|
|    x|  -0.788282387030002|
|    y|  1.1628909587577554|
+-----+--------------------+
only showing top 6 rows



# 2. Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the
       spark dataframe in a varaible named `df`



In [15]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

- Use `.select` to add 4 to the `n` column. Show the results.


In [21]:
df.select(df.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
| 4.059580016413559|
|3.8556166642471927|
| 4.188206650476426|
| 3.461227187733449|
| 3.371689533959753|
| 2.047458815838508|
|3.7231534080831583|
| 3.937423292049018|
|3.0672328406908793|
|  3.69941749752288|
| 4.517416392596189|
|3.2773769716385415|
|4.9483800040154895|
| 4.547046012793122|
|3.7076076490084384|
|2.3899428114706813|
| 4.849316118563628|
|3.5029498898563394|
|4.6079108764117445|
| 4.391721685298742|
+------------------+



- Subtract 5 from the `n` column and view the results.


In [22]:
df.select(df.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -4.940419983586441|
| -5.144383335752807|
| -4.811793349523574|
| -5.538772812266552|
| -5.628310466040247|
|-6.9525411841614915|
| -5.276846591916842|
| -5.062576707950981|
| -5.932767159309121|
|  -5.30058250247712|
| -4.482583607403811|
|-5.7226230283614585|
|-4.0516199959845105|
| -4.452953987206878|
| -5.292392350991562|
| -6.610057188529319|
| -4.150683881436372|
| -5.497050110143661|
|-4.3920891235882555|
| -4.608278314701258|
+-------------------+



- Multiply the `n` column by 2. View the results along with the original numbers.


In [23]:
df.select(df.n * 2).show()

+--------------------+
|             (n * 2)|
+--------------------+
| 0.11916003282711751|
|-0.28876667150561486|
| 0.37641330095285197|
| -1.0775456245331025|
| -1.2566209320804946|
|  -3.905082368322984|
| -0.5536931838336838|
| -0.1251534159019633|
| -1.8655343186182418|
| -0.6011650049542393|
|  1.0348327851923784|
| -1.4452460567229166|
|   1.896760008030979|
|  1.0940920255862447|
| -0.5847847019831229|
| -3.2201143770586373|
|   1.698632237127256|
| -0.9941002202873216|
|  1.2158217528234887|
|  0.7834433705974827|
+--------------------+



- Add a new column named `n2` that is the `n` value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original `n` value as well as `n2`.


In [26]:
n2 = (df.n * -1).alias('n2')
df = df.select('*', n2)
df.show()

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|0.059580016413558756|    y| true|-0.05958001641355...|
|-0.14438333575280743|    y| true| 0.14438333575280743|
| 0.18820665047642599|    z|false|-0.18820665047642599|
| -0.5387728122665513|    x| true|  0.5387728122665513|
| -0.6283104660402473|    z| true|  0.6283104660402473|
|  -1.952541184161492|    z|false|   1.952541184161492|
| -0.2768465919168419|    x| true|  0.2768465919168419|
|-0.06257670795098165|    x| true| 0.06257670795098165|
| -0.9327671593091209|    x|false|  0.9327671593091209|
|-0.30058250247711965|    y| true| 0.30058250247711965|
|  0.5174163925961892|    x| true| -0.5174163925961892|
| -0.7226230283614583|    z|false|  0.7226230283614583|
|  0.9483800040154895|    y| true| -0.9483800040154895|
|  0.5470460127931224|    x| true| -0.5470460127931224|
|-0.29239235099156147|    x|false| 0.29239235099

- Add a new column named `n3` that is the n value squared. Show the first 5 rows of your dataframe. You should see both `n`, `n2`, and `n3`.


In [27]:
n3 = (df.n ** 2).alias('n3')
df = df.select('*', n3)
df.show()

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|0.059580016413558756|    y| true|-0.05958001641355...|0.003549778355839931|
|-0.14438333575280743|    y| true| 0.14438333575280743| 0.02084654764310792|
| 0.18820665047642599|    z|false|-0.18820665047642599| 0.03542174328355558|
| -0.5387728122665513|    x| true|  0.5387728122665513|  0.2902761432376085|
| -0.6283104660402473|    z| true|  0.6283104660402473|  0.3947740417357128|
|  -1.952541184161492|    z|false|   1.952541184161492|  3.8124170758467613|
| -0.2768465919168419|    x| true|  0.2768465919168419|  0.0766440354559704|
|-0.06257670795098165|    x| true| 0.06257670795098165| 0.00391584437798245|
| -0.9327671593091209|    x|false|  0.9327671593091209|   0.870054573485607|
|-0.30058250247711965|    y| true| 0.30058250247711965| 0.09034984079540764|

- What happens when you run the code below?
        ```python
        df.group + df.abool
        ```


In [28]:
df.group + df.abool

Column<'(group + abool)'>

- What happens when you run the code below? What is the difference between this and the previous code sample?
        ```python
        df.select(df.group + df.abool)
        ```


In [29]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#312 as double) + abool#313) AS (group + abool)#574]
+- Project [n#311, group#312, abool#313, n2#525, POWER(n#311, cast(2 as double)) AS n3#547]
   +- Project [n#311, group#312, abool#313, (n#311 * cast(-1 as double)) AS n2#525]
      +- LogicalRDD [n#311, group#312, abool#313], false


- Try adding various other columns together. What are the results of combining the different data types?