In [10]:
from lingua import Language, LanguageDetectorBuilder
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType
import re

In [2]:
# Start (or reuse) the Spark session for this notebook; "LID" is the
# application name that the session is created with.
spark = (
    SparkSession.builder
    .appName("LID")
    .getOrCreate()
)
# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/21 15:53:45 WARN Utils: Your hostname, Bens-MacBook-Air-7.local, resolves to a loopback address: 127.0.0.1; using 172.20.60.48 instead (on interface en0)
25/11/21 15:53:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/21 15:53:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/21 15:53:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/21 15:53:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [7]:
# Because this is the small dataset, we assume every song is in either
# English or Spanish, so the detector is restricted to those two languages.
# The position of a language in this list is used as its numeric id:
# 0 = English, 1 = Spanish.
languages = [Language.ENGLISH, Language.SPANISH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

# Example usage / sanity check on a single Spanish title.
language = detector.detect_language_of("Cuatro Babys")
print(languages.index(language))  # should print 1 for Spanish

1


In [5]:
# Load the chart subset; the first row holds the column names and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("subset.csv")
)

# Preview the first five rows.
df.show(n=5)

+----------+--------------------+----+----------+--------------------+--------------------+---------+------+-------------+--------+--------------------+--------------------+----------+-----------+--------+------------+--------------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|Unnamed: 0|               title|rank|      date|              artist|                 url|   region| chart|        trend| streams|            track_id|               album|popularity|duration_ms|explicit|release_date|   available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+----------+--------------------+----+----------+--------------------+--------------------+---------+------+-------------+--------+--------------------+--------------------+----------+-----------+--------+-------

25/11/21 15:54:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [None]:
# Matches a parenthesised or bracketed segment such as "(feat. X)" or
# "[Deluxe]", together with any whitespace directly in front of it.
pattern = r"\s*[\(\[].*?[\)\]]"

# Detector cache used by detect_language_udf. It is filled lazily on first
# use so that the detector is built once per Python process instead of once
# per row.
_udf_detector = None


def detect_language_udf(title, album):
    """Return the index in ``languages`` of the language detected for a song.

    The title and album are stripped of parenthesised/bracketed segments,
    joined with a space and passed to the language detector.

    Args:
        title: Song title, or None when the value is missing.
        album: Album name, or None when the value is missing.

    Returns:
        The position of the detected language in ``languages`` as a float.
        Falls back to 0.0 (a plain guess) when there is no text left to
        analyse or when the detector cannot decide on a language.
    """
    global _udf_detector

    # Missing values arrive as None; skip them instead of letting re.sub raise.
    cleaned_parts = [
        re.sub(pattern, "", str(raw)).strip()
        for raw in (title, album)
        if raw is not None
    ]
    combined_text = " ".join(cleaned_parts).strip()
    if not combined_text:
        return 0.0  # nothing to analyse - guess; may be made smarter later

    if _udf_detector is None:
        _udf_detector = LanguageDetectorBuilder.from_languages(*languages).build()

    language = _udf_detector.detect_language_of(combined_text)
    if language is None:
        # The detector could not settle on a language (e.g. text without
        # letters); use the same fallback as for empty input.
        return 0.0
    return float(languages.index(language))

# Wrap the Python function as a Spark UDF that produces a float column.
detect_language = F.udf(detect_language_udf, returnType=FloatType())

# Derive the language id for every row from its title and album.
df_with_lang = df.withColumn(
    "language_id",
    detect_language(F.col("title"), F.col("album")),
)

# Keep only the columns needed to inspect the result, then preview five rows.
new_df = df_with_lang.select(
    F.col("title"),
    F.col("album"),
    F.col("language_id"),
)
new_df.show(n=5)

+--------------------+--------------------+-----------+
|               title|               album|language_id|
+--------------------+--------------------+-----------+
|Chantaje (feat. M...|           El Dorado|        1.0|
|Vente Pa' Ca (fea...|Vente Pa' Ca (fea...|        1.0|
|Reggaetón Lento (...|        Primera Cita|        1.0|
|              Safari|             Energía|        1.0|
|         Shaky Shaky|         Shaky Shaky|        0.0|
+--------------------+--------------------+-----------+
only showing top 5 rows


In [20]:
# Persist title, album and language id as CSV with a header row, replacing
# any previous output at this path. Note that Spark treats the path as an
# output directory containing part files, not as a single CSV file.
(
    new_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("subset_with_language_id.csv")
)