In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType
import datetime

In [2]:
# language detection dependentcies
from lingua import Language, LanguageDetectorBuilder
import re

In [3]:
spark = SparkSession.builder.appName("Preprocessing").getOrCreate()
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/29 17:28:34 WARN Utils: Your hostname, Bens-MacBook-Air-7.local, resolves to a loopback address: 127.0.0.1; using 172.20.10.3 instead (on interface en0)
25/11/29 17:28:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/29 17:28:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# load the dataset
df = spark.read.csv("subset.csv", header=True, inferSchema=True)

In [5]:
# for now we'll do the language detection first, but this won't work at scale so we'll have to do somethign else later
languages = [Language.ENGLISH, Language.SPANISH]
pattern = r"\s*[\(\[].*?[\)\]]"

def detect_language_udf(title, album):
    cleaned_title = re.sub(pattern, "", title, flags=re.IGNORECASE).strip()
    cleaned_album = re.sub(pattern, "", album, flags=re.IGNORECASE).strip()
    combined_text = f"{cleaned_title} {cleaned_album}"
    if not combined_text.strip():
        return 0.0  # return 0.0 for empty strings to just guess - we might want to make this more sophisticated later

    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    language = detector.detect_language_of(combined_text)
    return float(languages.index(language))

detect_language = F.udf(detect_language_udf, FloatType())

df_with_lang = df.withColumn("language_id", detect_language(F.col("title"), F.col("album")))

new_df = df_with_lang.select("title", "album", "language_id")
new_df.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----------+
|               title|               album|language_id|
+--------------------+--------------------+-----------+
|Chantaje (feat. M...|           El Dorado|        1.0|
|Vente Pa' Ca (fea...|Vente Pa' Ca (fea...|        1.0|
|Reggaetón Lento (...|        Primera Cita|        1.0|
|              Safari|             Energía|        1.0|
|         Shaky Shaky|         Shaky Shaky|        0.0|
+--------------------+--------------------+-----------+
only showing top 5 rows


                                                                                

In [6]:
# remove the following columns: urls, track_id, data, available markets, id, and date
# also remove artist, album, region, and name because they are strings not worth embedding for now
# also remove chart because I don't think there are enough charts for this to be relevant
# finally, remove index because dataframes already have an index
columns_to_remove = [
    "urls", "track_id", "data", "available_markets", "id", "url", "date",
    "artist", "album", "region", "name", "chart"
]
df = df.drop(*columns_to_remove)

# df = df.drop(*columns_to_remove)
df.show(5)

+----------+--------------------+----+-------------+--------+----------+-----------+--------+------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|Unnamed: 0|               title|rank|        trend| streams|popularity|duration_ms|explicit|release_date|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+----------+--------------------+----+-------------+--------+----------+-----------+--------+------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|         0|Chantaje (feat. M...|   1|SAME_POSITION|253019.0|      78.0|   195840.0|   false|  2017-05-26|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|            3.05E-5|      0

In [7]:
# remove duplicates by checking to see if any titles match
df = df.dropDuplicates(["title"])
df = df.drop("title")

In [8]:
# convert the date columns into a float representing the year
def date_to_year(date):
    try:
        return float(date.year)
    except:
        return None
    
date_to_year_udf = F.udf(date_to_year, FloatType())
df = df.withColumn("date", date_to_year_udf(F.col("release_date")))
df = df.drop("release_date")
df.show(5)

25/11/29 17:28:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+----+-------------+-------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|        trend|streams|popularity|duration_ms|explicit|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-------------+-------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|        34|  35|      MOVE_UP|56170.0|      80.0|   236001.0|   false|          0.666|     0.83|   0.0|     -5.715|    1.0|        0.0751|         0.0123|                0.0|      0.191|     0.702|  113.03|              4.0|2016.0|
|        33|  34|    MOVE_DOWN|58801.0|      80.0|   225983.0|   fal

In [9]:
# convert the trend column into a scale from 0-2
def trend_to_scale(trend):
    if trend == "MOVE_UP":
        return 2.0
    elif trend == "MOVE_DOWN":
        return 0.0
    else:
        return 1.0

trend_to_scale_udf = F.udf(trend_to_scale, FloatType())
df = df.withColumn("trend", trend_to_scale_udf(F.col("trend")))
df.show(5)

+----------+----+-----+-------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|trend|streams|popularity|duration_ms|explicit|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-----+-------+----------+-----------+--------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|        34|  35|  2.0|56170.0|      80.0|   236001.0|   false|          0.666|     0.83|   0.0|     -5.715|    1.0|        0.0751|         0.0123|                0.0|      0.191|     0.702|  113.03|              4.0|2016.0|
|        33|  34|  0.0|58801.0|      80.0|   225983.0|   false|          0.818|    0.803|   1.0|    

In [10]:
# remove explicit content, and then remove the explicit column
df = df.filter(F.col("explicit") == False)
df = df.drop("explicit")
df.show(5)

+----------+----+-----+-------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0|rank|trend|streams|popularity|duration_ms|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+----+-----+-------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|        34|  35|  2.0|56170.0|      80.0|   236001.0|          0.666|     0.83|   0.0|     -5.715|    1.0|        0.0751|         0.0123|                0.0|      0.191|     0.702|  113.03|              4.0|2016.0|
|        33|  34|  0.0|58801.0|      80.0|   225983.0|          0.818|    0.803|   1.0|     -4.282|    1.0|        0.0797|          0.03

In [11]:
# convert the rank column to a float
df = df.withColumn("rank", F.col("rank").cast(FloatType()))
df.show(5)

+----------+-----+-----+-------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|Unnamed: 0| rank|trend|streams|popularity|duration_ms|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|  date|
+----------+-----+-----+-------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+------+
|        34| 35.0|  2.0|56170.0|      80.0|   236001.0|          0.666|     0.83|   0.0|     -5.715|    1.0|        0.0751|         0.0123|                0.0|      0.191|     0.702|  113.03|              4.0|2016.0|
|        33| 34.0|  0.0|58801.0|      80.0|   225983.0|          0.818|    0.803|   1.0|     -4.282|    1.0|        0.0797|         

In [12]:
# df.write.csv("numpy_prepped.csv", header=True, mode="overwrite")