In [6]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [7]:
# Create (or reuse) the SparkSession used by every cell in this notebook.
spark = SparkSession.builder.appName('myspark').getOrCreate()

# Location of the sample CSV. The original absolute path is kept as the default
# so existing behaviour is unchanged; set the LEARN_PYSPARK_SAMPLE_CSV
# environment variable to run the notebook on another machine without editing
# this cell.
csv_file_path = os.environ.get(
    "LEARN_PYSPARK_SAMPLE_CSV",
    "/Users/nombauser/Desktop/GIT/MyGitRepos/Learn-PySpark/files/sample_1.csv",
)

In [8]:
# Configure a CSV reader that treats the first line as the header and lets
# Spark infer each column's type, then load the sample file with it.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load(csv_file_path)

In [18]:
# Print each column's name, inferred type and nullability for the loaded DataFrame.
df.printSchema()

root
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_artist: string (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- track_album_id: string (nullable = true)
 |-- track_album_name: string (nullable = true)
 |-- track_album_release_date: string (nullable = true)
 |-- playlist_name: string (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- playlist_genre: string (nullable = true)
 |-- playlist_subgenre: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_ms: double (nullable = true)


#### `union` and `unionAll`

In [None]:
# Both DataFrames project the same columns in the same order, so their
# schemas match for the union examples.
shared_columns = ["track_id", "track_name", "key", "acousticness"]

# First DataFrame: at most five tracks whose key equals 11.
df1 = df.select(*shared_columns).filter(col("key") == 11).limit(5)
df1.show(truncate=False)

# Second DataFrame: at most four tracks that either have the given title
# or have key equal to 10.
has_remix_title = col("track_name") == "Memories - Dillon Francis Remix"
has_key_10 = col("key") == 10
df2 = df.select(*shared_columns).filter(has_remix_title | has_key_10).limit(4)
df2.show(truncate=False)

+----------------------+------------------------------------------------------+----+------------+
|track_id              |track_name                                            |key |acousticness|
+----------------------+------------------------------------------------------+----+------------+
|0r7CVbZTWZgbTCYdfa2P31|Memories - Dillon Francis Remix                       |11.0|0.0724      |
|4sqABRRGU7CzcHXCyxUzFw|Do You Mean (feat. Ty Dolla $ign & bülow) - Myon Remix|11.0|0.0163      |
|5dy3WUywjZcalTno1io8TQ|Hold Me While You Wait - Steve Void Remix             |11.0|0.104       |
|1aT7vltGhkjyMI3F3aO3A4|Mother - Fedde Le Grand Remix                         |11.0|0.0319      |
|0Fc9QSMX6fgEBgDjUax7t7|Ruin My Life - Steve James Remix                      |11.0|0.0103      |
+----------------------+------------------------------------------------------+----+------------+

+----------------------+---------------------------------------------------------------+----+------------+
|track_id 

In [26]:
# Append df2's rows to df1's with union(). Duplicates are kept, so a row that
# is present in both inputs appears twice in the result.
df_union = df1.union(df2)

total_rows = df_union.count()
print(total_rows)
df_union.show()


9
+--------------------+--------------------+----+------------+
|            track_id|          track_name| key|acousticness|
+--------------------+--------------------+----+------------+
|0r7CVbZTWZgbTCYdf...|Memories - Dillon...|11.0|      0.0724|
|4sqABRRGU7CzcHXCy...|Do You Mean (feat...|11.0|      0.0163|
|5dy3WUywjZcalTno1...|Hold Me While You...|11.0|       0.104|
|1aT7vltGhkjyMI3F3...|Mother - Fedde Le...|11.0|      0.0319|
|0Fc9QSMX6fgEBgDjU...|Ruin My Life - St...|11.0|      0.0103|
|0r7CVbZTWZgbTCYdf...|Memories - Dillon...|11.0|      0.0724|
|3TjLsDgL0bTbSQIF6...|Say My Name (feat...|10.0|      0.0395|
|7p4jpmyBnOpqQ3rSG...|Quite Miss Home -...|10.0|       0.108|
|2k4c9ZAlIYHyXiJ4i...|When It Comes To ...|10.0|      0.0538|
+--------------------+--------------------+----+------------+



In [27]:
# Union followed by distinct(): first stack the two DataFrames, then drop
# rows that are exact duplicates of another row.
stacked = df1.union(df2)
df_union_distinct = stacked.distinct()

df_union_distinct.show()

+--------------------+--------------------+----+------------+
|            track_id|          track_name| key|acousticness|
+--------------------+--------------------+----+------------+
|0Fc9QSMX6fgEBgDjU...|Ruin My Life - St...|11.0|      0.0103|
|1aT7vltGhkjyMI3F3...|Mother - Fedde Le...|11.0|      0.0319|
|5dy3WUywjZcalTno1...|Hold Me While You...|11.0|       0.104|
|4sqABRRGU7CzcHXCy...|Do You Mean (feat...|11.0|      0.0163|
|0r7CVbZTWZgbTCYdf...|Memories - Dillon...|11.0|      0.0724|
|7p4jpmyBnOpqQ3rSG...|Quite Miss Home -...|10.0|       0.108|
|3TjLsDgL0bTbSQIF6...|Say My Name (feat...|10.0|      0.0395|
|2k4c9ZAlIYHyXiJ4i...|When It Comes To ...|10.0|      0.0538|
+--------------------+--------------------+----+------------+



In [28]:
# unionAll() behaves the same as union() in modern PySpark: df2's rows are
# appended to df1's and duplicate rows are kept.

df_union_all = df1.unionAll(df2)
df_union_all.show()

+--------------------+--------------------+----+------------+
|            track_id|          track_name| key|acousticness|
+--------------------+--------------------+----+------------+
|0r7CVbZTWZgbTCYdf...|Memories - Dillon...|11.0|      0.0724|
|4sqABRRGU7CzcHXCy...|Do You Mean (feat...|11.0|      0.0163|
|5dy3WUywjZcalTno1...|Hold Me While You...|11.0|       0.104|
|1aT7vltGhkjyMI3F3...|Mother - Fedde Le...|11.0|      0.0319|
|0Fc9QSMX6fgEBgDjU...|Ruin My Life - St...|11.0|      0.0103|
|0r7CVbZTWZgbTCYdf...|Memories - Dillon...|11.0|      0.0724|
|3TjLsDgL0bTbSQIF6...|Say My Name (feat...|10.0|      0.0395|
|7p4jpmyBnOpqQ3rSG...|Quite Miss Home -...|10.0|       0.108|
|2k4c9ZAlIYHyXiJ4i...|When It Comes To ...|10.0|      0.0538|
+--------------------+--------------------+----+------------+

