In [1]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql.functions import col, expr

# Reuse the active SparkSession if there is one; otherwise create it.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

def get_spark_df(seed=None):
    """Build a small random demo dataset and return it as a Spark dataframe.

    The data has 20 rows and three columns:

    - ``n``: floats drawn with ``randn`` (standard normal)
    - ``group``: one of the letters ``"x"``, ``"y"`` or ``"z"``
    - ``abool``: ``True`` or ``False``

    Parameters
    ----------
    seed : int, optional
        When given, the values are drawn from a private generator seeded with
        this value, so repeated calls return identical rows. When omitted
        (the default), numpy's global random state is used and every call
        produces different values.

    Returns
    -------
    pyspark.sql.DataFrame
        The pandas dataframe converted with ``spark.createDataFrame``.
    """
    # A private RandomState keeps a seeded call from disturbing numpy's
    # global random state; without a seed, fall back to the global functions.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_rows = 20
    pandas_dataframe = pd.DataFrame(
        {
            "n": rng.randn(n_rows),
            "group": rng.choice(list("xyz"), n_rows),
            "abool": rng.choice([True, False], n_rows),
        }
    )
    return spark.createDataFrame(pandas_dataframe)

## Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.

- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [2]:
# Build the Spark dataframe used by the rest of this section.
df = get_spark_df()

- Show the first 3 rows of the dataframe.

In [3]:
# Print the first 3 rows; vertical=True lists each record's fields one per line.
df.show(3, vertical=True)

-RECORD 0--------------------
 n     | 0.5211744212342075  
 group | z                   
 abool | false               
-RECORD 1--------------------
 n     | -1.0823124338092072 
 group | x                   
 abool | false               
-RECORD 2--------------------
 n     | 0.6433891104244117  
 group | y                   
 abool | true                
only showing top 3 rows



- Show the first 7 rows of the dataframe.

In [4]:
# Print the first 7 rows in the default tabular layout.
df.show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 0.5211744212342075|    z|false|
|-1.0823124338092072|    x|false|
| 0.6433891104244117|    y| true|
| 0.2786352745545001|    y|false|
| -0.641664448288473|    y|false|
|-0.8338823755513373|    x|false|
|-0.1339879766690345|    x| true|
+-------------------+-----+-----+
only showing top 7 rows



- View a summary of the data using .describe.

In [5]:
# Summary statistics (count, mean, stddev, min, max). The boolean column
# does not appear in the output; only n and group are summarized.
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.32994652224161247| null|
| stddev| 0.8975009823340346| null|
|    min|-1.5635945814766492|    x|
|    max|  2.180133093310002|    z|
+-------+-------------------+-----+



- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [6]:
# Alternative to .select: dropping group leaves just the n and abool columns.
df.drop('group').show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
| 0.5211744212342075|false|
|-1.0823124338092072|false|
| 0.6433891104244117| true|
| 0.2786352745545001|false|
| -0.641664448288473|false|
+-------------------+-----+
only showing top 5 rows



In [7]:
# Select columns by attribute access on the dataframe (df.n, df.abool).
df.select(df.n, df.abool).show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
| 0.5211744212342075|false|
|-1.0823124338092072|false|
| 0.6433891104244117| true|
| 0.2786352745545001|false|
| -0.641664448288473|false|
+-------------------+-----+
only showing top 5 rows



- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [8]:
# Select columns by name, passed as plain strings.
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    y| true|
|    y|false|
|    y|false|
+-----+-----+
only showing top 5 rows



- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [9]:
# Rename abool within the selection via .alias, referencing the column as df.abool.
# This statement only builds a dataframe; nothing is displayed because .show() is not called.
df.select('group', df.abool.alias('a_boolean_value'))
# Same selection, referencing the column with col('abool') instead; prints the first 3 rows.
df.select('group', col('abool').alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    y|           true|
+-----+---------------+
only showing top 3 rows



- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [10]:
# Drop abool, then rename n to a_numeric_value with withColumnRenamed.
# NOTE(review): the exercise text asks for .select; this cell uses
# drop + withColumnRenamed instead, so the renamed column comes before group.
df.drop('abool').withColumnRenamed('n', 'a_numeric_value').show(6)

+-------------------+-----+
|    a_numeric_value|group|
+-------------------+-----+
| 0.5211744212342075|    z|
|-1.0823124338092072|    x|
| 0.6433891104244117|    y|
| 0.2786352745545001|    y|
| -0.641664448288473|    y|
|-0.8338823755513373|    x|
+-------------------+-----+
only showing top 6 rows



## Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a variable named df

In [11]:
# Re-create the dataframe; the values are random, so they differ from the previous section.
df = get_spark_df()

- Use .select to add 4 to the n column. Show the results.

In [12]:
# Arithmetic on a column yields a new column; the result is labelled "(n + 4)" in the output.
df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.6231572352126795|
| 4.810912977147855|
|3.8281297021599707|
|3.8545320512145014|
| 2.658996070807299|
+------------------+
only showing top 5 rows



- Subtract 5 from the n column and view the results.

In [13]:
# Same kind of arithmetic, this time referencing the column by name with col('n').
df.select(col('n') - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
|-5.3768427647873205|
| -4.189087022852145|
| -5.171870297840029|
| -5.145467948785498|
| -6.341003929192701|
+-------------------+
only showing top 5 rows



- Multiply the n column by 2. View the results along with the original numbers.

In [14]:
# Show the original n next to the doubled values.
df.select('n', col('n') * 2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
| -0.3768427647873204| -0.7536855295746407|
|  0.8109129771478555|   1.621825954295711|
| -0.1718702978400295|  -0.343740595680059|
|-0.14546794878549862|-0.29093589757099725|
| -1.3410039291927012| -2.6820078583854023|
+--------------------+--------------------+
only showing top 5 rows



- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [15]:
# Build the column expression once and keep it in a variable; a later cell reuses n2.
n2 = (col('n') * -1).alias('n2')
# Builds the dataframe but displays nothing, since .show() is not called.
df.select('n', n2)
# Same selection with the expression written inline; prints the first 5 rows.
df.select('n', (col('n') * -1).alias('n2')).show(5)

+--------------------+-------------------+
|                   n|                 n2|
+--------------------+-------------------+
| -0.3768427647873204| 0.3768427647873204|
|  0.8109129771478555|-0.8109129771478555|
| -0.1718702978400295| 0.1718702978400295|
|-0.14546794878549862|0.14546794878549862|
| -1.3410039291927012| 1.3410039291927012|
+--------------------+-------------------+
only showing top 5 rows



- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see n, n2, and n3.

In [16]:
# n squared, labelled n3; ** on a column builds a column expression.
n3 = (col('n') ** 2).alias('n3')

In [17]:
# n2 and n3 are the column expressions stored in variables by the preceding cells.
df.select('n', n2, n3).show(5)

+--------------------+-------------------+-------------------+
|                   n|                 n2|                 n3|
+--------------------+-------------------+-------------------+
| -0.3768427647873204| 0.3768427647873204|0.14201046937255166|
|  0.8109129771478555|-0.8109129771478555| 0.6575798565067984|
| -0.1718702978400295| 0.1718702978400295|0.02953939927962045|
|-0.14546794878549862|0.14546794878549862|0.02116092412386045|
| -1.3410039291927012| 1.3410039291927012|  1.798291538110263|
+--------------------+-------------------+-------------------+
only showing top 5 rows



- What happens when you run the code below?

In [18]:
# Adding two columns only builds a Column object (see the output); no error is
# reported at this point even though the types do not match.
df.group + df.abool

Column<b'(group + abool)'>

- What happens when you run the code below? What is the difference between this and the previous code sample?

In [19]:
# Expected to fail: using the expression in .select raises AnalysisException,
# because group is cast to double and cannot be added to a boolean.
df.select(df.group + df.abool)

AnalysisException: "cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;;\n'Project [(cast(group#193 as double) + abool#194) AS (group + abool)#247]\n+- LogicalRDD [n#192, group#193, abool#194], false\n"

- Try adding various other columns together. What are the results of combining the different data types?

1. Spark SQL

    1. Use the starter code above to re-create a spark dataframe.
    1. Turn your dataframe into a table that can be queried with spark SQL. Name
       the table `my_df`. Answer the rest of the questions in this section with
       a spark sql query (`spark.sql`) against `my_df`. After each step, view
       the first 7 records from the dataframe.
    1. Write a query that shows all of the columns from your dataframe.
    1. Write a query that shows just the `n` and `abool` columns from the
       dataframe.
    1. Write a query that shows just the `n` and `group` columns. Rename the
       `group` column to `g`.
    1. Write a query that selects `n`, and creates two new columns: `n2`, the
       original `n` values halved, and `n3`: the original n values minus 1.
    1. What happens if you make a SQL syntax error in your query?

In [None]:
df = get_spark_df()
# Register the dataframe as a temporary view so it can be queried with spark.sql.
# NOTE(review): the exercise text asks for the view to be named `my_df`; this
# cell registers it as `df`, and the queries in the following cells use `df`.
df.createOrReplaceTempView('df')

In [None]:
# Select the group column under the alias the_group and print the first 5 rows.
spark.sql('select group AS the_group from df').show(5)

In [None]:
# Queries for the earlier sub-questions, kept commented out for reference
# (all columns; n and abool; n and group renamed to g):
# spark.sql('SELECT * FROM df')
# spark.sql('SELECT n, abool FROM df')
# spark.sql('SELECT n, group AS g FROM df')
# The query that actually runs: n2 is n halved, n3 is n minus 1.
spark.sql('''
SELECT n,
    n / 2 AS n2,
    n - 1 AS n3
FROM df
''').show(5)

In [None]:
# Deliberately invalid SQL, to see how Spark reports a syntax error
# (answers the last Spark SQL exercise; this cell is expected to raise).
spark.sql('alshbdflkasjbdf;kajsbndflkajbsd')

1. Type casting

    1. Use the starter code above to re-create a spark dataframe.

    1. Use `.printSchema` to view the datatypes in your dataframe.

    1. Use `.dtypes` to view the datatypes in your dataframe.

    1. What is the difference between the two code samples below?

        ```python
        df.abool.cast('int')
        ```

        ```python
        df.select(df.abool.cast('int')).show()
        ```

    1. Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.
    1. Convert the `group` column to an integer data type and view the results.
       What happens?
    1. Convert the `n` column to an integer data type and view the results. What
       happens?
    1. Convert the `abool` column to a string data type and view the results.
       What happens?

In [None]:
df = get_spark_df()
# printSchema() prints the schema itself, so it works anywhere in the cell.
df.printSchema()
# dtypes is the last expression in the cell, so the notebook displays its value.
df.dtypes

In [None]:
# Cast the string column group to int and show the first 5 rows.
# NOTE(review): the letters are not numeric, so presumably every value becomes
# null rather than raising — confirm by running the cell.
df.select(df.group.cast('int')).show(5)

In [None]:
# Show n next to its int cast so the effect of the conversion is visible.
df.select(df.n, df.n.cast('int')).show(5)

In [None]:
# Cast the boolean column to string and show the first 5 rows.
df.select(df.abool.cast('string')).show(5)

1. Built-in Functions

    1. Use the starter code above to re-create a spark dataframe.
    1. Import the necessary functions from `pyspark.sql.functions`
    1. Find the highest `n` value.
    1. Find the lowest `n` value.
    1. Find the average `n` value.
    1. Use `concat` to change the `group` column to say, e.g. "Group: x" or
       "Group: y"
    1. Use `concat` to combine the `n` and `group` columns to produce results
       that look like this: "x: -1.432" or "z: 2.352"

In [22]:
def print(s):
    """Accept ``s``, ignore it, and return ``None``.

    Defining this at notebook level hides the built-in ``print`` until the
    name is rebound to the built-in again.
    """
    return

In [30]:
# print currently refers to the no-op defined in this notebook, so nothing is displayed.
print('Hi Bayes!')

In [20]:
# Evaluate the bare name to display what max is currently bound to.
max

<function max>

In [31]:
# Fresh random dataframe for the built-in functions exercises.
df = get_spark_df()

In [37]:
from pyspark.sql.functions import max, min, avg, mean

In [38]:
# max, min, avg and mean here are the names imported from pyspark.sql.functions,
# not Python's built-ins. avg and mean both appear as avg(n) in the output.
df.select(
    max(df.n),
    min(df.n),
    avg('n'),
    mean('n')
).show(5)

+-----------------+-------------------+-------------------+-------------------+
|           max(n)|             min(n)|             avg(n)|             avg(n)|
+-----------------+-------------------+-------------------+-------------------+
|1.397193793735044|-1.3638464037912605|0.14443112975715708|0.14443112975715708|
+-----------------+-------------------+-------------------+-------------------+



In [None]:
# Import the functions module under the alias F, so its functions can be called
# as F.max, F.min, F.round, ... without replacing Python's built-ins of the same name.
import pyspark.sql.functions as F

In [43]:
from pyspark.sql.functions import concat, lit, round

In [46]:
df.select(
    # lit() turns the text into a literal value; a bare string would be read as a column name
    concat(lit('Group: '), 'group'),
    # round is the one imported from pyspark.sql.functions; n is rounded to 3 decimal places
    concat('group', lit(': '), round(df.n, 3))
).show(5)

+----------------------+------------------------------+
|concat(Group: , group)|concat(group, : , round(n, 3))|
+----------------------+------------------------------+
|              Group: z|                     z: -1.364|
|              Group: z|                      z: 0.204|
|              Group: y|                     y: -0.093|
|              Group: x|                     x: -0.625|
|              Group: x|                      x: 1.106|
+----------------------+------------------------------+
only showing top 5 rows



In [51]:
# Rebind print to the real built-in, undoing the no-op `def print` from earlier
# in the notebook. The `builtins` module is the documented way to reach built-ins;
# `__builtins__` is a CPython implementation detail that is a module in some
# contexts and a plain dict in others, where `.print` would raise AttributeError.
import builtins

print = builtins.print

In [52]:
# print is the built-in again, so this call produces output.
print('Hello, World!')

Hello, World!
