In [0]:
import os
import io
import pyspark
import numpy as np
import matplotlib.pyplot as plt
import pyspark.sql.functions as F

from pyspark.sql import SparkSession, Window
from pyspark.conf import SparkConf
# from pyspark.context import SparkContext
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType, IntegerType

from concurrent.futures import ThreadPoolExecutor

%load_ext autoreload
%autoreload 2

In [0]:
# `SparkSession is None: TypeError: 'JavaPackage' object is not
# callable` can be raised if the pyspark version being used is 4.0.0,
# which is not compatible with Python 3.11.8

# if we have 24 gb of installed ram and 23 gb usable and have
# 8 cores in our CPU we can utilize this memory to partition 
# it across all 8 cores of our cpu for concurrent processing 
# in spark. We need to also take note of background processes 
# in our task manager taking up our memory so if need be we 
# have to end these background processes that take up too much 
# memory in order to free up space for our spark driver memory 
# and spark executor memory

# lets say we have 8 cores per node/CPU and currently 23gb usable ram
# we can partition this 23gb ram across all 8 cores of the CPU
# since there are other background processes we can reserve 1 core
# for this as well as 1gb of ram, and so we will have only 7 cores 
# available and 22gb of ram

# executors utilize cores the ff. are different kinds of executor
# sizes
# executor 1: [<core 1>, <core 2>, ..., <core 7>] where 22gb of ram is
# spread out across the executors. Since there is only a single executor 
# here executor will only have 22gb of memory and then this memory will 
# be divided into its individual cores which currently have 7 so 22 / 7 
# is 3gb of memory per core

# the main idea is that the number of executors is bounded by the
# number of cores: every executor needs at least one core, so we can
# never have more executors than cores. If there are 7 cores we can
# have at most 7 executors, and the ram is divided across these
# executors and, within each executor, across its cores.

# executor 1: [<core 1>]
# executor 2: [<core 2>]
# executor 3: [<core 3>]
# executor 4: [<core 4>]
# executor 5: [<core 5>]
# executor 6: [<core 6>]
# executor 7: [<core 7>]
# where the 22gb of ram we have is spread out across these executors
# if we have 7 executors we will have 22 / 7 or roughly 3gb. Therefore
# 3gb will be the memory of each executor with one actually 4gb as
# 3gb + 3gb + 3gb + 3gb + 3gb + 3gb + 4gb = 22gb.

# again our starting memory and cores is 24gb and 8 cores
# we will minus 1gb and 1core for yarn/hadoop processes
# making it 23gb and 7 cores. Yarn application master
# can take either 1gb of ram or 1 core therefore it may make
# our total memory and cores 22gb and 7 cores or 23gb and 6 cores;
# say we picked the former. Now we choose our number of executors
# which can be the mid range of our number of cores. Say we want
# 3 executors then each executor will have 22gb / 3 or 7gb, 7gb, 
# 8gb respectively for all 3 executors.
# executor 1 (7gb): []
# executor 2 (7gb): []
# executor 3 (8gb): []
# we also take into consideration memory overhead for each
# executor, which is `max(384mb, 10 % of spark.executor.memory)`; the
# usable memory is `memory per executor` minus this overhead.
# after calculation our executor memories will now have the ff.
# 7000mb - max(384mb, 10% of 7gb is 0.7gb or 700mb) = 6300mb or 6.3gb
# 7000mb - max(384mb, 10% of 7gb is 0.7gb or 700mb) = 6300mb or 6.3gb
# 8000mb - max(384mb, 10% of 8gb is 0.8gb or 800mb) = 7200mb or 7.2gb
# executor 1 (6.3gb): []
# executor 2 (6.3gb): []
# executor 3 (7.2gb): []
# since we have 7 cores we can divide these cores across all these
# executors. if distributed evenly each executor will have 2, 2, and 3
# cores respectively


# driver memory default is 1g
# executor memory default is 1g
# executor cores default is 1
# sql execution arrow maxRecordsPerBatch default 10000 
# maximum number of records that can be written to a single ArrowRecordBatch in memory

# spark = SparkSession.builder.appName("app")\
    # .config("spark.driver.memory", "16g")\
    # .config("spark.executor.memory", "4g")\
    # .config("spark.executor.cores", "2")\
    # .config("spark.executor.instances", "3")\
    # .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100")\
    # .getOrCreate()

# Create the session named "app", or reuse one that already exists.
# No explicit memory/core settings are passed here; the commented-out
# builder above shows the tuning knobs discussed in the notes.
spark = (
    SparkSession.builder
    .appName("app")
    .getOrCreate()
)

In [0]:
# cloud
# URL is a template for an ADLS container root; the container name is
# substituted for {FOLDER_NAME}. The formatted result ends with "/".
URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/"
BRONZE_FOLDER_NAME = "sgppipelinesa-bronze"
BRONZE_DATA_PATH = URL.format(FOLDER_NAME=BRONZE_FOLDER_NAME)
BRONZE_DATA_PATH

# local (uncomment to read from the local data folder instead)
# BRONZE_FOLDER_NAME = "bronze/"
# DATA_PATH = "../include/data/"
# BRONZE_DATA_PATH = os.path.join(DATA_PATH, BRONZE_FOLDER_NAME)

('abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/',
 [FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/', name='1028-20100710-hne/', size=0, modificationTime=1753875440000),
  FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/', name='1337ad-20170321-ajg/', size=0, modificationTime=1753875338000),
  FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-tkg/', name='1337ad-20170321-tkg/', size=0, modificationTime=1753948729000),
  FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1snoke-20120412-hge/', name='1snoke-20120412-hge/', size=0, modificationTime=1754033527000),
  FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/23yipikaye-20100807-ujm/', name='23yipikaye-20100807-ujm/', size=0, modificationTime=1754033526000),
  FileInfo(path='abfss://sgppipelinesa-bronze@sgppipeline

In [0]:
# sample_folder = folder_infos[-1].path
# sample_folder

In [0]:
# sample_folder.strip('/').split('/')[-1]

In [0]:
# type(folder_infos[-1])

In [0]:
# dbutils.fs.ls(folder_infos[-1].path)

In [0]:
# cloud
# List the entries directly under the bronze container and keep only
# their full paths (one per subject folder, each ending with "/").
bronze_entries = dbutils.fs.ls(BRONZE_DATA_PATH)
file_infos = [entry.path for entry in bronze_entries]
file_infos

# local (uncomment to list the local bronze folder instead)
# file_infos = os.listdir(BRONZE_DATA_PATH)
# file_infos

['abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-tkg/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1snoke-20120412-hge/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/23yipikaye-20100807-ujm/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Aaron-20080318-kdl/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Anniepoo-20140308-bft/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Anniepoo-20140308-cqj/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Anniepoo-20140308-fcp/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Anniepoo-20140308-hns/',
 'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/Anniepoo-20140308-nky/',
 'abfss://

In [0]:
# labels_df = spark.read.format('text')\
#     .option("lineSep", "\n")\
#     .load(os.path.join(BRONZE_DATA_PATH, "1337ad-20170321-ajg", "etc", "README"))
# labels_df.show()

In [0]:
# cloud
# Read every subject's etc/README as plain text, one row per line, and
# attach the source file path so each line can be traced to its subject.
#
# The README locations are storage URIs, not local filesystem paths, so
# they are joined with an explicit "/" instead of os.path.join, which
# uses the platform separator and would emit "\" on Windows.
readme_paths = [
    f"{file_info.rstrip('/')}/etc/README"
    for file_info in file_infos
]
labels_df = (
    spark.read.format("text")
    .option("lineSep", "\n")
    .load(readme_paths)
    .select("*", "_metadata.file_path")
)

# local
# labels_df = spark.read.format("text")\
#     .option("lineSep", "\n")\
#     .load([os.path.join(BRONZE_DATA_PATH, file_info, "etc", "README") for file_info in file_infos])
labels_df.show()

+--------------------+--------------------+
|               value|           file_path|
+--------------------+--------------------+
|    User Name:1337ad|abfss://sgppipeli...|
|                    |abfss://sgppipeli...|
|Speaker Character...|abfss://sgppipeli...|
|                    |abfss://sgppipeli...|
|      Gender: Female|abfss://sgppipeli...|
|    Age Range: Adult|abfss://sgppipeli...|
|        Language: EN|abfss://sgppipeli...|
|Pronunciation dia...|abfss://sgppipeli...|
|                    |abfss://sgppipeli...|
|Recording Informa...|abfss://sgppipeli...|
|                    |abfss://sgppipeli...|
|Microphone make: n/a|abfss://sgppipeli...|
|Microphone type: ...|abfss://sgppipeli...|
|Audio card make: ...|abfss://sgppipeli...|
|Audio card type: ...|abfss://sgppipeli...|
|Audio Recording S...|abfss://sgppipeli...|
|                O/S:|abfss://sgppipeli...|
|                    |abfss://sgppipeli...|
|          File Info:|abfss://sgppipeli...|
|                    |abfss://sg

In [0]:
# labels_df.count()

In [0]:
# labels_df.withColumn("filePath", F.input_file_name()).show()

In [0]:
# labels_df.withColumn("filePath", F.input_file_name()).where(
#     F.lower(F.col("value")).contains("gender")
# ).collect()

In [0]:
# local
# labels_df = labels_df.withColumn("filePath", F.input_file_name())

In [0]:
# Keep only the README lines that mention "gender" (case-insensitive).
is_gender_line = F.lower(F.col("value")).contains("gender")
labels_df = labels_df.where(is_gender_line)
# labels_df.show()

# Clean value columns

In [0]:
# Strip the "gender" keyword plus separators/whitespace from the
# lower-cased line so that only the gender word itself remains.
gender_noise_pattern = r"(gender)|[:;\[\]\t\n\s]+"
lowered_value = F.lower(F.col("value"))
labels_df = labels_df.withColumn(
    "value",
    F.regexp_replace(lowered_value, gender_noise_pattern, ""),
)
# labels_df.show()

In [0]:
# Map the cleaned value onto one of "male" / "female" / "unknown".
# The gender may be written in another language, so match on prefixes:
# "ma" or "mä" (e.g. the German word for male) -> "male",
# "fem" or "wei" -> "female"; anything else is labelled "unknown".
gender_value = F.col("value")
looks_male = gender_value.startswith("ma") | gender_value.startswith("mä")
looks_female = gender_value.startswith("fem") | gender_value.startswith("wei")

labels_df = labels_df.withColumn(
    "value",
    F.when(looks_male, "male")
    .when(looks_female, "female")
    .otherwise("unknown"),
)
# labels_df.show()

# Clean file_path column

In [0]:
# labels_df.withColumn(
#     "subjectId",
#     F.element_at(
#         # splits the filepath from 'file:///c:/Users/LARRY/Documents/Scripts/.../bronze/1337ad-20170321-ajg/etc/README
#         # to array of the directory tree of the files path e.g. 
#         # ['file:', ..., 'Scripts', ..., 'bronze', '<subject id>, 'etc', 'readme']
#         # so in order to extract subject id or the file name we have to 
#         # get the 3rd to the last element
#         F.split(
#             F.col("file_path"),
#             r"\/"
#         ),
#         -3
#     )
# ).collect()

In [0]:
# Derive the subject id from the README's path. Each path has the form
#   .../<subject id>/etc/README
# (e.g. abfss://.../1337ad-20170321-ajg/etc/README), so after splitting
# on "/" the subject id is the 3rd element counting from the end.
path_parts = F.split(F.col("file_path"), r"\/")
labels_df = labels_df.withColumn("subjectId", F.element_at(path_parts, -3))
labels_df.show()

+------+--------------------+--------------------+
| value|           file_path|           subjectId|
+------+--------------------+--------------------+
|female|abfss://sgppipeli...| 1337ad-20170321-ajg|
|female|abfss://sgppipeli...| 1337ad-20170321-tkg|
|female|abfss://sgppipeli...|Anniepoo-20140308...|
|female|abfss://sgppipeli...|Anniepoo-20140308...|
|female|abfss://sgppipeli...|Anniepoo-20140308...|
|female|abfss://sgppipeli...|Anniepoo-20140308...|
|female|abfss://sgppipeli...|Anniepoo-20140308...|
|  male|abfss://sgppipeli...|  Aaron-20080318-kdl|
|  male|abfss://sgppipeli...| 1snoke-20120412-hge|
|  male|abfss://sgppipeli...|  Coren-20141121-pxp|
|  male|abfss://sgppipeli...|   1028-20100710-hne|
|  male|abfss://sgppipeli...|23yipikaye-201008...|
+------+--------------------+--------------------+



In [0]:
# cloud
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-01"
# The silver location is a storage URI, not a local filesystem path, so
# it is joined with an explicit "/" instead of os.path.join, which uses
# the platform separator and would emit "\" on Windows. The container
# URL already ends with "/", hence the rstrip before re-adding one.
SILVER_CONTAINER_URL = URL.format(FOLDER_NAME=SILVER_FOLDER_NAME)
SILVER_DATA_DIR = f"{SILVER_CONTAINER_URL.rstrip('/')}/{SUB_FOLDER_NAME}"
SILVER_DATA_DIR

# local
# (the local variant of the bronze config cell names the data root
# DATA_PATH, so that is the name used here)
# SILVER_FOLDER_NAME = "silver"
# SUB_FOLDER_NAME = "stage-01"
# SILVER_DATA_DIR = os.path.join(DATA_PATH, SILVER_FOLDER_NAME, SUB_FOLDER_NAME)
# SILVER_DATA_DIR

'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-01'

In [0]:
# Persist the labelled subjects as parquet in the silver stage folder,
# replacing any output left by a previous run.
labels_output_path = f"{SILVER_DATA_DIR}/labels.parquet"
labels_df.write.mode("overwrite").parquet(labels_output_path)