In [45]:
from lingua import Language, LanguageDetectorBuilder
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType
import re

In [46]:
spark = SparkSession.builder.appName("LID").getOrCreate()
sc = spark.sparkContext

In [47]:
# because this is the small dataset, I'm making the assumption that all the songs are either in Spanish or English
languages = [
    Language.ENGLISH,
    Language.SPANISH,
    Language.ARABIC,
    Language.RUSSIAN,
    Language.GERMAN,
    Language.FRENCH,
    Language.ITALIAN,
    Language.SWEDISH,
    Language.FINNISH,
    Language.POLISH,
    Language.BULGARIAN,
    Language.ROMANIAN,
    Language.HUNGARIAN,
    Language.GREEK,
    Language.TURKISH,
    Language.HINDI,
    Language.JAPANESE,
    Language.KOREAN,
    Language.VIETNAMESE,
    Language.THAI,
    Language.INDONESIAN,
    Language.PORTUGUESE,
    Language.PUNJABI,
    Language.TAMIL,
    Language.TELUGU,
    Language.TAGALOG,
]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

# example usage
language = detector.detect_language_of("Cuatro Babys")
print(languages.index(language))  # should print 1 for Spanish

1


In [48]:
df = spark.read.csv("partition1.csv", header=True, inferSchema=True)

# drop rows where the language id is 0, or the title or album is empty
df = df.filter(F.col("title").isNotNull())
df = df.filter(F.col("album").isNotNull())

df.show(5)

+--------+--------------------+--------------------+--------------------+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+
|     _c0|               title|              artist|               album|af_danceability|af_energy|af_loudness|af_speechiness|af_acousticness|af_instrumentalness|af_valence|af_tempo|
+--------+--------------------+--------------------+--------------------+---------------+---------+-----------+--------------+---------------+-------------------+----------+--------+
|  485174|                   !|            O.S.T.R.|W drodze po szczę...|          0.561|    0.798|     -7.064|          0.32|          0.307|                0.0|      0.14| 140.837|
|13259484|"""Eungenio"" Sal...|              Mecano|Descanso Dominica...|          0.381|    0.199|    -15.769|         0.052|          0.925|            6.79E-4|    0.0456| 145.726|
|  486856|"A Lovely Night -...|Ryan Gosling, Emm...|La La Land (Origi...|            

In [None]:
pattern = r"\s*[\(\[].*?[\)\]]"

def detect_language_udf(title, album):
    cleaned_title = re.sub(pattern, "", title, flags=re.IGNORECASE).strip()
    cleaned_album = re.sub(pattern, "", album, flags=re.IGNORECASE).strip()
    if cleaned_title == cleaned_album:
        combined_text = cleaned_title
    else:
        combined_text = f"{cleaned_title} {cleaned_album}"
    if not combined_text.strip():
        return -1.0  # return -1.0 for empty strings, but we'll remove these afterwards
    
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    language = detector.detect_language_of(combined_text)

    confidence_values = detector.compute_language_confidence_values(combined_text)
    if confidence_values[0].value < 0.5:
        return -2.0  # return -2.0 if confidence is below 50%

    return float(languages.index(language))

detect_language = F.udf(detect_language_udf, FloatType())

df = df.withColumn("language_id", detect_language(F.col("title"), F.col("album")))

# filter out language id's where we couldn't detect a language
df = df.filter(F.col("language_id") != -1.0)

new_df = df.select("title", "album", "language_id")
new_df.show(30)

[Stage 39:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-----------+
|               title|               album|language_id|
+--------------------+--------------------+-----------+
|"""Eungenio"" Sal...|Descanso Dominica...|       -2.0|
|"Aattuthottil - F...|"Aattuthottil (Fr...|       -2.0|
|"After Met You (F...|"After Met You (F...|       -2.0|
|"Akala ko - From ...|"Akala ko (From "...|       -2.0|
|"Arey Pyaar Kar L...|"Arey Pyaar Kar L...|       -2.0|
|"Arikil - From ""...|"Arikil (From ""A...|       -2.0|
|"Bandeya (feat. A...|"Bandeya (From ""...|       -2.0|
|"Bang Bang - From...|"Bang Bang (From ...|       -2.0|
|"Check The Timin'...|"Check The Timin'...|       -2.0|
|"Coconut Mall (Fr...|Mario Kart Wii, T...|       -2.0|
|"Credits - From "...|La La Land (Origi...|       -2.0|
|"Dil Mein Ho Tum ...|"Dil Mein Ho Tum ...|       -2.0|
|"Dil Royi Jaye (F...|"Dil Royi Jaye (F...|       -2.0|
|"Ek Ladki Ko Dekh...|"Ek Ladki Ko Dekh...|       -2.0|
|"El corrido de Mi...|Coco (Banda Sonor...|     

                                                                                

In [None]:
# from pyspark.sql.types import FloatType, StringType
# import pyspark.sql.functions as F
# import re

# pattern = r"\s*[\(\[].*?[\)\]]"

# # UDF 1: preprocess title + album into combined text
# def preprocess_text_udf(title, album):
#     cleaned_title = re.sub(pattern, "", title or "", flags=re.IGNORECASE).strip()
#     cleaned_album = re.sub(pattern, "", album or "", flags=re.IGNORECASE).strip()
#     if cleaned_title == cleaned_album:
#         combined_text = cleaned_title
#     else:
#         combined_text = f"{cleaned_title} {cleaned_album}"
#     return combined_text

# preprocess_text = F.udf(preprocess_text_udf, StringType())

# # Apply preprocessing UDF
# df = df.withColumn("combined_text", preprocess_text(F.col("title"), F.col("album")))

# # UDF 2: detect language from combined text
# def detect_language_index_udf(combined_text):
#     if not combined_text.strip():
#         return -1.0  # empty string

#     detector = LanguageDetectorBuilder.from_languages(*languages).build()
#     language = detector.detect_language_of(combined_text)
#     confidence_values = detector.compute_language_confidence_values(combined_text)
#     if confidence_values[0].value < 0.5:
#         return -2.0  # low confidence

#     return float(languages.index(language))

# detect_language_index = F.udf(detect_language_index_udf, FloatType())

# # Apply language detection UDF
# df = df.withColumn("language_id", detect_language_index(F.col("combined_text")))

# # Filter out rows where detection failed
# df = df.filter(F.col("language_id") != -1.0)

# # View resulting DataFrame with both combined text and language index
# new_df = df.select("title", "album", "combined_text", "language_id")
# new_df.show(30, truncate=50)


[Stage 40:>                                                         (0 + 1) / 1]

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------+
|                                             title|                                             album|                                     combined_text|language_id|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------+
|                      """Eungenio"" Salvador Dali"|       Descanso Dominical/Une Femme Avec Une Femme|"""Eungenio"" Salvador Dali" Descanso Dominical...|       -2.0|
|                 "Aattuthottil - From ""Athiran"""|                 "Aattuthottil (From ""Athiran"")"|  "Aattuthottil - From ""Athiran""" "Aattuthottil"|       -2.0|
|          "After Met You (From ""After Met You"")"|          "After Met You (From ""After Met You"")"|                                   "After Met You"|       -2.0

                                                                                

In [50]:
test_title = "\"Aaina (From \"\"The Body\"\")\""
test_album = "\"Aaina (From \"\"The Body\"\")\""
cleaned_test_title = re.sub(pattern, "", test_title, flags=re.IGNORECASE).strip()
cleaned_test_album = re.sub(pattern, "", test_album, flags=re.IGNORECASE).strip()
if cleaned_test_album == cleaned_test_album:
    combined_text = cleaned_test_title
else:
    combined_text = f"{cleaned_test_title} {cleaned_test_album}"
print(f"Cleaned: {combined_text}")
print(len(combined_text.strip().split()))

Cleaned: "Aaina"
1


In [51]:
# new_df.write.csv("subset_with_language_id.csv", header=True, mode="overwrite")

In [None]:
confidence_values = detector.compute_language_confidence_values("La bikina - Inspirado en \"COCO\" La bikina")
for confidence in confidence_values:
    print(f"{confidence.language.name}: {confidence.value:.2f}")

print(confidence_values[0].value)

TAGALOG: 0.22
SPANISH: 0.18
PORTUGUESE: 0.14
INDONESIAN: 0.08
FRENCH: 0.08
ROMANIAN: 0.06
POLISH: 0.05
FINNISH: 0.04
ITALIAN: 0.04
ENGLISH: 0.03
GERMAN: 0.02
SWEDISH: 0.02
HUNGARIAN: 0.02
TURKISH: 0.01
VIETNAMESE: 0.00
ARABIC: 0.00
BULGARIAN: 0.00
GREEK: 0.00
HINDI: 0.00
JAPANESE: 0.00
KOREAN: 0.00
PUNJABI: 0.00
RUSSIAN: 0.00
TAMIL: 0.00
TELUGU: 0.00
THAI: 0.00
0.22155691676525158


In [53]:
from langdetect import detect

print(detect("Arey Pyaar Kar Le"))

tr


In [54]:
lang_list = [
    "af", "ar", "bg", "bn", "ca", "cs", "cy", "da", "de", "el",
    "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr",
    "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml",
    "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk",
    "sl", "so", "sq", "sv", "sw", "ta", "te", "th", "tl", "tr",
    "uk", "ur", "vi", "zh-cn", "zh-tw"
]
print(len(lang_list))

55
