In [28]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType
import datetime

In [29]:
spark = SparkSession.builder.appName("Preprocessing").getOrCreate()
sc = spark.sparkContext

In [60]:
# load the dataset
df = spark.read.csv("subset.csv", header=True, inferSchema=True)

In [61]:
# remove the following columns: urls, track_id, data, available markets, id, and date
# also remove title, artist, album, region, and name because they are strings not worth embedding for now
# also remove chart because I don't think there are enough charts for this to be relevant
# finally, remove index because dataframes already have an index
columns_to_remove = [
    "urls", "track_id", "data", "available_markets", "id", "url", "date",
    "title", "artist", "album", "region", "name", "chart", "index"
]
df = df.drop(*columns_to_remove)
df.show(5)

+----------+----+-------------+--------+----------+-----------+--------+------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|Unnamed: 0|rank|        trend| streams|popularity|duration_ms|explicit|release_date|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+----------+----+-------------+--------+----------+-----------+--------+------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|         0|   1|SAME_POSITION|253019.0|      78.0|   195840.0|   false|  2017-05-26|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0.159|     0.907| 102.034|              4.0|
|         1|   2|      MOVE_UP|223988.0|

In [62]:
# convert the date columns into a float representing the year
def date_to_year(date):
    try:
        return float(date.year)
    except:
        return None
    
date_to_year_udf = F.udf(date_to_year, FloatType())
df = df.withColumn("date", date_to_year_udf(F.col("release_date")))
df = df.drop("release_date")
df.show(5)

+----------+----+-------------+--------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|        trend| streams|popularity|duration_ms|explicit|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-------------+--------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|         0|   1|SAME_POSITION|253019.0|      78.0|   195840.0|   false|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0.159|     0.907| 102.034|              4.0|2017.0|
|         1|   2|      MOVE_UP|223988.0|      72.0|   259195.0| 

In [63]:
# convert the trend column into a scale from 0-2
def trend_to_scale(trend):
    if trend == "MOVE_UP":
        return 2.0
    elif trend == "MOVE_DOWN":
        return 0.0
    else:
        return 1.0

trend_to_scale_udf = F.udf(trend_to_scale, FloatType())
df = df.withColumn("trend", trend_to_scale_udf(F.col("trend")))
df.show(5)

+----------+----+-----+--------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|trend| streams|popularity|duration_ms|explicit|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-----+--------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|         0|   1|  1.0|253019.0|      78.0|   195840.0|   false|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0.159|     0.907| 102.034|              4.0|2017.0|
|         1|   2|  2.0|223988.0|      72.0|   259195.0|   false|          0.663|     0.92|  11.0

In [64]:
# remove explicit content, and then remove the explicit column
df = df.filter(F.col("explicit") == False)
df = df.drop("explicit")
df.show(5)

+----------+----+-----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|trend| streams|popularity|duration_ms|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|         0|   1|  1.0|253019.0|      78.0|   195840.0|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0.159|     0.907| 102.034|              4.0|2017.0|
|         1|   2|  2.0|223988.0|      72.0|   259195.0|          0.663|     0.92|  11.0|      -4.07|    0.0|         0.226|        0

In [65]:
# convert the rank column to a float
df = df.withColumn("rank", F.col("rank").cast(FloatType()))
df.show(5)

+----------+----+-----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|trend| streams|popularity|duration_ms|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|         0| 1.0|  1.0|253019.0|      78.0|   195840.0|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0.159|     0.907| 102.034|              4.0|2017.0|
|         1| 2.0|  2.0|223988.0|      72.0|   259195.0|          0.663|     0.92|  11.0|      -4.07|    0.0|         0.226|        0

In [66]:
# add the language identification column? We'll proabably have to do that separately later