In [11]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql.functions import col, expr

spark = pyspark.sql.SparkSession.builder.getOrCreate()

def get_spark_df(seed=None):
    """Build a small random Spark DataFrame for the exercises.

    A 20-row pandas DataFrame is generated and passed to
    ``spark.createDataFrame``. Its columns are:

    - ``n``: floats drawn with ``randn``
    - ``group``: one of ``"x"``, ``"y"``, ``"z"``, chosen at random
    - ``abool``: ``True`` or ``False``, chosen at random

    Parameters
    ----------
    seed : int, optional
        Seed for the random draws. With the default ``None`` the function
        uses NumPy's global random state, so each call yields different
        rows. Pass an integer to get the same rows on every call, which
        keeps the notebook reproducible after a kernel restart.

    Returns
    -------
    The object returned by ``spark.createDataFrame`` for the generated
    pandas DataFrame.
    """
    # A private RandomState keeps seeding local to this call instead of
    # reseeding NumPy's global generator for the rest of the notebook.
    rng = np.random if seed is None else np.random.RandomState(seed)
    pandas_dataframe = pd.DataFrame(
        {
            "n": rng.randn(20),
            "group": rng.choice(list("xyz"), 20),
            "abool": rng.choice([True, False], 20),
        }
    )
    return spark.createDataFrame(pandas_dataframe)

## Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.

- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [3]:
df = get_spark_df()

- Show the first 3 rows of the dataframe.

In [20]:
df.show(3, vertical=True)

-RECORD 0---------------------
 n     | -0.0381157591819841  
 group | z                    
 abool | true                 
-RECORD 1---------------------
 n     | -0.09059919224393369 
 group | x                    
 abool | false                
-RECORD 2---------------------
 n     | 1.6348501665759412   
 group | y                    
 abool | false                
only showing top 3 rows



- Show the first 7 rows of the dataframe.

In [5]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.33316050960226473|    y|false|
| 0.10176239119505157|    x|false|
| -1.1975667202718223|    x|false|
|0.025058939326273226|    y| true|
|0.031037124872786977|    z| true|
|  0.3302575638035759|    z| true|
|  1.2156196750619137|    z| true|
+--------------------+-----+-----+
only showing top 7 rows



- View a summary of the data using .describe.

In [6]:
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.13485210537268788| null|
| stddev| 1.1100719050164813| null|
|    min|  -2.05666537539405|    x|
|    max| 2.7205038596158073|    z|
+-------+-------------------+-----+



- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [21]:
# Alternative that avoids .select: dropping group leaves only n and abool.
df.drop('group').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
| -0.0381157591819841| true|
|-0.09059919224393369|false|
|  1.6348501665759412|false|
| 0.41116963298861514| true|
|  0.6300858706255195| true|
+--------------------+-----+
only showing top 5 rows



In [7]:
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
| 0.33316050960226473|false|
| 0.10176239119505157|false|
| -1.1975667202718223|false|
|0.025058939326273226| true|
|0.031037124872786977| true|
+--------------------+-----+
only showing top 5 rows



- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [9]:
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    y|false|
|    x|false|
|    x|false|
|    y| true|
|    z| true|
+-----+-----+
only showing top 5 rows



- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [13]:
# Attribute-style column reference; this DataFrame is built but never displayed (no .show()).
df.select('group', df.abool.alias('a_boolean_value'))
# same as the col() spelling below, which is the one whose rows are shown
df.select('group', col('abool').alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    y|          false|
|    x|          false|
|    x|          false|
+-----+---------------+
only showing top 3 rows



- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [16]:
# Rename n inside .select; n is listed first so the columns come out as (a_numeric_value, group).
df.select(col('n').alias('a_numeric_value'), 'group').show(6)

+--------------------+-----+
|     a_numeric_value|group|
+--------------------+-----+
| 0.33316050960226473|    y|
| 0.10176239119505157|    x|
| -1.1975667202718223|    x|
|0.025058939326273226|    y|
|0.031037124872786977|    z|
|  0.3302575638035759|    z|
+--------------------+-----+
only showing top 6 rows



## Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a variable named df.

In [17]:
df = get_spark_df()

- Use .select to add 4 to the n column. Show the results.

In [19]:
df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.9618842408180157|
|3.9094008077560662|
| 5.634850166575941|
| 4.411169632988615|
|4.6300858706255195|
+------------------+
only showing top 5 rows



- Subtract 5 from the n column and view the results.

In [24]:
df.select(col('n') - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.038115759181984|
| -5.090599192243934|
| -3.365149833424059|
| -4.588830367011385|
|-4.3699141293744805|
+-------------------+
only showing top 5 rows



- Multiply the n column by 2. View the results along with the original numbers.

In [25]:
df.select('n', col('n') * 2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
| -0.0381157591819841| -0.0762315183639682|
|-0.09059919224393369|-0.18119838448786738|
|  1.6348501665759412|  3.2697003331518824|
| 0.41116963298861514|  0.8223392659772303|
|  0.6300858706255195|   1.260171741251039|
+--------------------+--------------------+
only showing top 5 rows



- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [29]:
# Keep the aliased Column in a variable: a later cell selects n2 again alongside n3.
n2 = (col('n') * -1).alias('n2')
# Built but never displayed (no .show()).
df.select('n', n2)
# same as
# NOTE(review): the exercise asks for the first 4 rows, but this displays 5.
df.select('n', (col('n') * -1).alias('n2')).show(5)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
| -0.0381157591819841|  0.0381157591819841|
|-0.09059919224393369| 0.09059919224393369|
|  1.6348501665759412| -1.6348501665759412|
| 0.41116963298861514|-0.41116963298861514|
|  0.6300858706255195| -0.6300858706255195|
+--------------------+--------------------+
only showing top 5 rows



- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see n, n2, and n3.

In [30]:
n3 = (col('n') ** 2).alias('n3')

In [31]:
df.select('n', n2, n3).show(5)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
| -0.0381157591819841|  0.0381157591819841|0.001452811098019...|
|-0.09059919224393369| 0.09059919224393369|0.008208213635253253|
|  1.6348501665759412| -1.6348501665759412|  2.6727350671533827|
| 0.41116963298861514|-0.41116963298861514| 0.16906046709199246|
|  0.6300858706255195| -0.6300858706255195| 0.39700820436191886|
+--------------------+--------------------+--------------------+
only showing top 5 rows



- What happens when you run the code below?

In [32]:
# Adding two Columns only builds a Column expression object; no error is raised at this point.
df.group + df.abool

Column<b'(group + abool)'>

- What happens when you run the code below? What is the difference between this and the previous code sample?

In [33]:
# Passing the expression to .select makes Spark resolve it against df's columns, so the
# type mismatch surfaces here as an AnalysisException: group is cast to double, which
# cannot be added to the boolean abool. The exception is the point of this cell.
df.select(df.group + df.abool)

AnalysisException: "cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;;\n'Project [(cast(group#202 as double) + abool#203) AS (group + abool)#304]\n+- LogicalRDD [n#201, group#202, abool#203], false\n"

- Try adding various other columns together. What are the results of combining the different data types?