### Spark notebook ###

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by

**Steps**

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open powershell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy / paste the url provided in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a spark session in the notebook.
8. Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the Spark UI.

In [11]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas as pd
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=10):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [12]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.app.name,kda115 (jupyter)
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.port,39021
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.app.id,app-20241021191417-0120
spark.ui.port,4088


## Question 02 

In [13]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

### A. Load the audio feature attribute names and types from the audio/attributes directory 

The Attribute DataFrame contain columns like of Attribute Name and AttributeType, which describe the feature names and the corresponding types

In [14]:
# Create a dictionary for mapping attribute types
dict_type = {
    'float': DoubleType(),
    'string': StringType(),
    'STRING': StringType(),
    'NUMERIC': DoubleType(),
    'real': DoubleType()
}

In [15]:
# Pick one dataset to create a schema 
name = 'msd-jmir-area-of-moments-all-v1.0'

In [16]:
# Define a function to create a schema from a DataFrame containing attribute names and types
def generate_schema(attribute_df):
    '''Generate a schema based on attribute information about feature names and types'''

    # Initialize an empty list to hold StructField objects
    struct_fields = []

    # Loop through each row in the DataFrame
    for attribute_info in attribute_df.collect():
        
        # Fetch the data type from the type_dict mapping based on the attribute type
        data_type = dict_type[attribute_info['type']]
        
        # Create a StructField for each attribute name and its corresponding type
        field = StructField(attribute_info['name'], data_type, True)
        
        # Append the StructField to the list
        struct_fields.append(field)
        
    # Return a StructType created from the list of StructField objects
    return StructType(struct_fields)

#### A.1  Create schema for feature and load feature dataset

In [17]:
# Set directory path containing attribute and feature files
attribute_path = f'hdfs:///data/msd/audio/attributes/{name}.attributes.csv'
feature_path = f'hdfs:///data/msd/audio/features/{name}.csv'
    
# Load 'attribute' dataset  
attribute_data = (spark.read.format('csv')
                  .option('header', False)
                  .load(attribute_path)
                  .toDF('name', 'type'))

# Create a schema for feature using attribute name and type
feature_schema = generate_schema(attribute_data)

# show the result 
feature_schema

StructType(List(StructField(Area_Method_of_Moments_Overall_Standard_Deviation_1,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_2,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_3,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_4,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_5,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_6,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_7,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_8,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_9,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Standard_Deviation_10,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Average_1,DoubleType,true),StructField(Area_Method_of_Moments_Overall_Average_2,DoubleType,true),StructField(Area_Method_of_Moment

In [18]:
# Load 'feature' dataset  
feature_data = (spark.read.format('csv')
                .option('header', False)
                .option('inferSchema', False)
                .option('quote', "'")  # Ensures quotes ' ' in CSV are treated as delimiters, not data
                .schema(feature_schema)
                .load(feature_path))
    
    
# Show the result
print(f'Feature data corresponding to {name} dataset:')
show_as_html(feature_data, 5)

Feature data corresponding to msd-jmir-area-of-moments-all-v1.0 dataset:


Unnamed: 0,Area_Method_of_Moments_Overall_Standard_Deviation_1,Area_Method_of_Moments_Overall_Standard_Deviation_2,Area_Method_of_Moments_Overall_Standard_Deviation_3,Area_Method_of_Moments_Overall_Standard_Deviation_4,Area_Method_of_Moments_Overall_Standard_Deviation_5,Area_Method_of_Moments_Overall_Standard_Deviation_6,Area_Method_of_Moments_Overall_Standard_Deviation_7,Area_Method_of_Moments_Overall_Standard_Deviation_8,Area_Method_of_Moments_Overall_Standard_Deviation_9,Area_Method_of_Moments_Overall_Standard_Deviation_10,...,Area_Method_of_Moments_Overall_Average_2,Area_Method_of_Moments_Overall_Average_3,Area_Method_of_Moments_Overall_Average_4,Area_Method_of_Moments_Overall_Average_5,Area_Method_of_Moments_Overall_Average_6,Area_Method_of_Moments_Overall_Average_7,Area_Method_of_Moments_Overall_Average_8,Area_Method_of_Moments_Overall_Average_9,Area_Method_of_Moments_Overall_Average_10,MSD_TRACKID
0,1.2,3355.0,26270.0,39850000.0,309600000.0,2403000000.0,874900000000.0,2775000000.0,21550000000.0,406400000000000.0,...,5746.0,43470.0,-44220000.0,-337600000.0,-2576000000.0,766500000000.0,3015000000.0,23020000000.0,346000000000000.0,TRHFHQZ12903C9E2D5
1,0.9295,6720.0,44100.0,160800000.0,1060000000.0,6985000000.0,7095000000000.0,9545000000.0,62930000000.0,2037000000000000.0,...,11580.0,74040.0,-179200000.0,-1153000000.0,-7420000000.0,6242000000000.0,10370000000.0,66800000000.0,1694000000000000.0,TRHFHYX12903CAF953
2,1.883,6712.0,49060.0,160600000.0,1176000000.0,8609000000.0,7083000000000.0,10580000000.0,77440000000.0,2781000000000000.0,...,11580.0,85200.0,-179100000.0,-1316000000.0,-9660000000.0,6233000000000.0,11820000000.0,86800000000.0,2463000000000000.0,TRHFHAU128F9341A0E
3,1.884,6722.0,56130.0,161000000.0,1346000000.0,11270000000.0,7112000000000.0,12110000000.0,101400000000.0,4193000000000000.0,...,11600.0,93320.0,-179700000.0,-1459000000.0,-11850000000.0,6262000000000.0,13110000000.0,106600000000.0,3432000000000000.0,TRHFHLP128F14947A7
4,1.52,6709.0,53230.0,160500000.0,1295000000.0,10450000000.0,7076000000000.0,11640000000.0,93920000000.0,3751000000000000.0,...,11580.0,93650.0,-179000000.0,-1441000000.0,-11590000000.0,6230000000000.0,12930000000.0,104100000000.0,3248000000000000.0,TRHFHFF128F930AC11


### B. Load more of the audio feature attribute datasets from the audio/features directory using the generated schema 

In [15]:
# List of dataset names
dataset_names = [
    'msd-jmir-area-of-moments-all-v1.0',
    'msd-jmir-lpc-all-v1.0', 
    'msd-jmir-methods-of-moments-all-v1.0', 
    'msd-jmir-mfcc-all-v1.0',
    'msd-jmir-spectral-all-all-v1.0',
    'msd-jmir-spectral-derivatives-all-all-v1.0',
    'msd-marsyas-timbral-v1.0',
    'msd-mvd-v1.0',
    'msd-rh-v1.0',
    'msd-rp-v1.0',
    'msd-ssd-v1.0',
    'msd-trh-v1.0',
    'msd-tssd-v1.0',   
]

# Loop through each dataset name
for dataset_name in dataset_names:
    
    # Set paths dynamically for the attribute and feature files
    attribute_path = f'hdfs:///data/msd/audio/attributes/{dataset_name}.attributes.csv'
    feature_path = f'hdfs:///data/msd/audio/features/{dataset_name}.csv'
    
    # Load attribute data to create the schema
    attribute_data = (spark.read.format('csv')
                      .option('header', False)
                      .load(attribute_path)
                      .toDF('name', 'type'))  # Ensure the column names are consistent
    
    # Create schema using the attribute data
    feature_schema = generate_schema(attribute_data)
    
    # Load feature data using the generated schema
    feature_data = (spark.read.format('csv')
                    .option('header', False)
                    .option('inferSchema', False)
                    .option('quote', '')  # Ensures quotes in CSV are handled correctly
                    .schema(feature_schema)
                    .load(feature_path))
    
    # Show the result for each dataset
    print(f'Feature data corresponding to dataset name: {dataset_name}:')
    show_as_html(feature_data, 5)

Feature data corresponding to dataset name: msd-jmir-area-of-moments-all-v1.0:


Unnamed: 0,Area_Method_of_Moments_Overall_Standard_Deviation_1,Area_Method_of_Moments_Overall_Standard_Deviation_2,Area_Method_of_Moments_Overall_Standard_Deviation_3,Area_Method_of_Moments_Overall_Standard_Deviation_4,Area_Method_of_Moments_Overall_Standard_Deviation_5,Area_Method_of_Moments_Overall_Standard_Deviation_6,Area_Method_of_Moments_Overall_Standard_Deviation_7,Area_Method_of_Moments_Overall_Standard_Deviation_8,Area_Method_of_Moments_Overall_Standard_Deviation_9,Area_Method_of_Moments_Overall_Standard_Deviation_10,...,Area_Method_of_Moments_Overall_Average_2,Area_Method_of_Moments_Overall_Average_3,Area_Method_of_Moments_Overall_Average_4,Area_Method_of_Moments_Overall_Average_5,Area_Method_of_Moments_Overall_Average_6,Area_Method_of_Moments_Overall_Average_7,Area_Method_of_Moments_Overall_Average_8,Area_Method_of_Moments_Overall_Average_9,Area_Method_of_Moments_Overall_Average_10,MSD_TRACKID
0,1.2,3355.0,26270.0,39850000.0,309600000.0,2403000000.0,874900000000.0,2775000000.0,21550000000.0,406400000000000.0,...,5746.0,43470.0,-44220000.0,-337600000.0,-2576000000.0,766500000000.0,3015000000.0,23020000000.0,346000000000000.0,'TRHFHQZ12903C9E2D5'
1,0.9295,6720.0,44100.0,160800000.0,1060000000.0,6985000000.0,7095000000000.0,9545000000.0,62930000000.0,2037000000000000.0,...,11580.0,74040.0,-179200000.0,-1153000000.0,-7420000000.0,6242000000000.0,10370000000.0,66800000000.0,1694000000000000.0,'TRHFHYX12903CAF953'
2,1.883,6712.0,49060.0,160600000.0,1176000000.0,8609000000.0,7083000000000.0,10580000000.0,77440000000.0,2781000000000000.0,...,11580.0,85200.0,-179100000.0,-1316000000.0,-9660000000.0,6233000000000.0,11820000000.0,86800000000.0,2463000000000000.0,'TRHFHAU128F9341A0E'
3,1.884,6722.0,56130.0,161000000.0,1346000000.0,11270000000.0,7112000000000.0,12110000000.0,101400000000.0,4193000000000000.0,...,11600.0,93320.0,-179700000.0,-1459000000.0,-11850000000.0,6262000000000.0,13110000000.0,106600000000.0,3432000000000000.0,'TRHFHLP128F14947A7'
4,1.52,6709.0,53230.0,160500000.0,1295000000.0,10450000000.0,7076000000000.0,11640000000.0,93920000000.0,3751000000000000.0,...,11580.0,93650.0,-179000000.0,-1441000000.0,-11590000000.0,6230000000000.0,12930000000.0,104100000000.0,3248000000000000.0,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-lpc-all-v1.0:


Unnamed: 0,LPC_Overall_Standard_Deviation_1,LPC_Overall_Standard_Deviation_2,LPC_Overall_Standard_Deviation_3,LPC_Overall_Standard_Deviation_4,LPC_Overall_Standard_Deviation_5,LPC_Overall_Standard_Deviation_6,LPC_Overall_Standard_Deviation_7,LPC_Overall_Standard_Deviation_8,LPC_Overall_Standard_Deviation_9,LPC_Overall_Standard_Deviation_10,...,LPC_Overall_Average_2,LPC_Overall_Average_3,LPC_Overall_Average_4,LPC_Overall_Average_5,LPC_Overall_Average_6,LPC_Overall_Average_7,LPC_Overall_Average_8,LPC_Overall_Average_9,LPC_Overall_Average_10,MSD_TRACKID
0,0.04652,0.1125,0.1386,0.1829,0.09227,0.1261,0.06775,0.1168,0.08311,0.0,...,0.6449,-0.03497,-0.178,-0.2779,0.1096,-0.1404,0.02397,-0.1252,0.0,'TRMMMYQ128F932D901'
1,0.05153,0.1267,0.163,0.1528,0.1127,0.1794,0.07344,0.08478,0.06295,0.0,...,0.7336,0.1151,-0.2112,-0.2369,0.1443,-0.0817,-0.0135,-0.1316,0.0,'TRMMMKD128F425225D'
2,0.04112,0.1182,0.1352,0.09433,0.1448,0.1185,0.1071,0.08826,0.1259,0.0,...,0.3939,-0.2454,-0.08293,-0.32,0.1167,-0.08704,-0.1001,-0.1458,0.0,'TRMMMRX128F93187D9'
3,0.08796,0.1409,0.1781,0.1308,0.1569,0.1263,0.1265,0.1007,0.105,0.0,...,0.459,-0.2506,0.01747,-0.2836,0.2195,-0.05511,-0.02631,-0.09313,0.0,'TRMMMCH128F425532C'
4,0.02573,0.1512,0.1432,0.1431,0.1065,0.1196,0.07276,0.08836,0.05992,0.0,...,0.6478,0.09408,0.06476,-0.01578,0.06642,0.004455,-0.01168,-0.03808,0.0,'TRMMMWA128F426B589'


Feature data corresponding to dataset name: msd-jmir-methods-of-moments-all-v1.0:


Unnamed: 0,Method_of_Moments_Overall_Standard_Deviation_1,Method_of_Moments_Overall_Standard_Deviation_2,Method_of_Moments_Overall_Standard_Deviation_3,Method_of_Moments_Overall_Standard_Deviation_4,Method_of_Moments_Overall_Standard_Deviation_5,Method_of_Moments_Overall_Average_1,Method_of_Moments_Overall_Average_2,Method_of_Moments_Overall_Average_3,Method_of_Moments_Overall_Average_4,Method_of_Moments_Overall_Average_5,MSD_TRACKID
0,0.1545,13.11,840.0,41080.0,7108000.0,0.319,33.41,1371.0,64240.0,8398000.0,'TRHFHQZ12903C9E2D5'
1,0.1195,13.02,611.9,43880.0,7226000.0,0.2661,30.26,1829.0,183800.0,31230000.0,'TRHFHYX12903CAF953'
2,0.2326,7.185,362.2,19890.0,3030000.0,0.8854,32.68,1384.0,79190.0,9862000.0,'TRHFHAU128F9341A0E'
3,0.2283,10.3,463.8,24730.0,3336000.0,0.4321,37.56,2047.0,197200.0,32930000.0,'TRHFHLP128F14947A7'
4,0.1841,8.544,359.4,21900.0,3359000.0,0.8438,36.36,2008.0,205400.0,35390000.0,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-mfcc-all-v1.0:


Unnamed: 0,MFCC_Overall_Standard_Deviation_1,MFCC_Overall_Standard_Deviation_2,MFCC_Overall_Standard_Deviation_3,MFCC_Overall_Standard_Deviation_4,MFCC_Overall_Standard_Deviation_5,MFCC_Overall_Standard_Deviation_6,MFCC_Overall_Standard_Deviation_7,MFCC_Overall_Standard_Deviation_8,MFCC_Overall_Standard_Deviation_9,MFCC_Overall_Standard_Deviation_10,...,MFCC_Overall_Average_5,MFCC_Overall_Average_6,MFCC_Overall_Average_7,MFCC_Overall_Average_8,MFCC_Overall_Average_9,MFCC_Overall_Average_10,MFCC_Overall_Average_11,MFCC_Overall_Average_12,MFCC_Overall_Average_13,MSD_TRACKID
0,59.28,4.15,5.105,2.275,2.185,1.95,1.603,1.335,1.3,1.396,...,1.033,-0.3445,-0.4305,0.1372,0.0312,-0.3132,-0.0815,0.7213,-0.405,'TRHFHQZ12903C9E2D5'
1,46.35,6.888,4.652,4.131,3.225,2.826,2.108,1.922,1.95,1.697,...,-0.2149,3.74,0.2908,1.92,0.125,1.242,-0.5187,0.2548,-0.4073,'TRHFHYX12903CAF953'
2,38.63,3.041,2.504,2.141,1.853,1.906,1.867,1.547,1.491,1.468,...,1.035,1.283,1.408,0.5908,0.4882,0.5522,-0.3168,-0.5887,0.03743,'TRHFHAU128F9341A0E'
3,33.49,5.009,4.56,3.153,2.383,2.4,2.113,1.985,2.075,1.827,...,-1.351,2.264,-0.2145,0.3408,-0.604,1.261,-1.527,0.02701,-0.7334,'TRHFHLP128F14947A7'
4,37.43,4.107,3.167,2.793,2.158,1.926,1.814,1.598,1.622,1.547,...,-1.063,1.167,0.0559,1.682,-0.6607,1.038,-0.1167,0.462,-0.3687,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-spectral-all-all-v1.0:


Unnamed: 0,Spectral_Centroid_Overall_Standard_Deviation_1,Spectral_Rolloff_Point_Overall_Standard_Deviation_1,Spectral_Flux_Overall_Standard_Deviation_1,Compactness_Overall_Standard_Deviation_1,Spectral_Variability_Overall_Standard_Deviation_1,Root_Mean_Square_Overall_Standard_Deviation_1,Fraction_Of_Low_Energy_Windows_Overall_Standard_Deviation_1,Zero_Crossings_Overall_Standard_Deviation_1,Spectral_Centroid_Overall_Average_1,Spectral_Rolloff_Point_Overall_Average_1,Spectral_Flux_Overall_Average_1,Compactness_Overall_Average_1,Spectral_Variability_Overall_Average_1,Root_Mean_Square_Overall_Average_1,Fraction_Of_Low_Energy_Windows_Overall_Average_1,Zero_Crossings_Overall_Average_1,MSD_TRACKID
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,'TRHFHQZ12903C9E2D5'
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,'TRHFHYX12903CAF953'
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,'TRHFHAU128F9341A0E'
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,'TRHFHLP128F14947A7'
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-spectral-derivatives-all-all-v1.0:


Unnamed: 0,Spectral_Centroid_Overall_Standard_Deviation_1,Spectral_Rolloff_Point_Overall_Standard_Deviation_1,Spectral_Flux_Overall_Standard_Deviation_1,Compactness_Overall_Standard_Deviation_1,Spectral_Variability_Overall_Standard_Deviation_1,Root_Mean_Square_Overall_Standard_Deviation_1,Fraction_Of_Low_Energy_Windows_Overall_Standard_Deviation_1,Zero_Crossings_Overall_Standard_Deviation_1,Spectral_Centroid_Overall_Average_1,Spectral_Rolloff_Point_Overall_Average_1,Spectral_Flux_Overall_Average_1,Compactness_Overall_Average_1,Spectral_Variability_Overall_Average_1,Root_Mean_Square_Overall_Average_1,Fraction_Of_Low_Energy_Windows_Overall_Average_1,Zero_Crossings_Overall_Average_1,MSD_TRACKID
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,'TRHFHQZ12903C9E2D5'
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,'TRHFHYX12903CAF953'
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,'TRHFHAU128F9341A0E'
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,'TRHFHLP128F14947A7'
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-marsyas-timbral-v1.0:


Unnamed: 0,Mean_Acc5_Mean_Mem20_ZeroCrossings_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_Centroid_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_Rolloff_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_Flux_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC0_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC1_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC2_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC3_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC4_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Mean_Acc5_Mean_Mem20_MFCC5_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,...,Std_Acc5_Std_Mem20_PeakRatio_Chroma_D_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_D#_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_E_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_F_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_F#_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_G_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Chroma_G#_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Average_Chroma_A_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,Std_Acc5_Std_Mem20_PeakRatio_Minimum_Chroma_A_Power_powerFFT_WinHamming_HopSize512_WinSize512_Sum_AudioCh0,track_id
0,0.112178,0.088561,0.210064,0.094495,-43.977376,3.548018,0.346619,0.580689,-0.226928,0.21863,...,0.000976,0.001009,0.001089,0.001077,0.001037,0.000982,0.001076,0.277586,20.106226,TRSUFWB128F4255BAE
1,0.133675,0.100968,0.229477,0.087755,-42.694424,3.856577,-0.623697,0.510768,0.157851,-0.191616,...,0.00049,0.00048,0.000506,0.000496,0.000461,0.000448,0.000425,0.256402,2.877227,TRSUFSW128F4284B04
2,0.198948,0.17707,0.292963,0.087348,-38.52392,0.690861,-2.264458,1.314971,0.207455,0.143965,...,0.000351,0.000282,0.000288,0.000292,0.000281,0.000289,0.000375,0.080737,0.524488,TRSUFUP128F42561A2
3,0.086491,0.061144,0.132627,0.094506,-43.728943,3.964348,0.94362,1.687271,0.44188,0.541842,...,0.001156,0.001039,0.000951,0.000896,0.000822,0.00077,0.000847,0.277738,7.509386,TRSUFWL128F42956C9
4,0.17056,0.206775,0.451917,0.108458,-62.959637,1.115836,0.030579,-0.110063,-0.17779,0.222752,...,0.001632,0.001724,0.001587,0.001472,0.001094,0.001588,0.001523,0.51268,80.313907,TRSUFQR12903CD493C


Feature data corresponding to dataset name: msd-mvd-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_411,component_412,component_413,component_414,component_415,component_416,component_417,component_418,component_419,instanceName
0,0.226884,0.230364,0.219057,0.175083,0.169087,0.168407,0.138525,0.124724,0.152904,0.123112,...,0.146509,0.147567,0.121105,0.132596,0.144324,0.116596,0.168728,0.157645,0.110822,'TRYWDAH128F92D4539'
1,0.214838,0.181556,0.107927,0.110481,0.120849,0.104712,0.086065,0.068917,0.091953,0.073461,...,0.080255,0.087641,0.098576,0.07729,0.089487,0.068734,0.103242,0.091347,0.073943,'TRJVUJL128C71968F1'
2,0.599207,0.462929,0.334623,0.235651,0.180505,0.192023,0.162616,0.147555,0.147512,0.150728,...,0.131198,0.137329,0.117605,0.111755,0.113637,0.101259,0.091637,0.107954,0.094497,'TRHNLNG128F42717FF'
3,0.161975,0.131392,0.147728,0.146141,0.12519,0.131663,0.127299,0.124623,0.098403,0.16116,...,0.145621,0.153424,0.137743,0.09089,0.105781,0.121976,0.122234,0.114681,0.122273,'TRIDGZT128F428B9F5'
4,0.511186,0.445876,0.395141,0.328717,0.297649,0.20564,0.260945,0.189948,0.317763,0.215753,...,0.172113,0.161366,0.15987,0.17663,0.197752,0.165474,0.149154,0.173748,0.25931,'TRCNVJH128F427213A'


Feature data corresponding to dataset name: msd-rh-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_51,component_52,component_53,component_54,component_55,component_56,component_57,component_58,component_59,instanceName
0,5.965746,5.633224,5.295259,4.188206,4.023376,3.51561,3.344197,3.3399,3.610377,3.100263,...,1.176926,1.383768,1.196357,1.123239,1.292873,1.178216,1.002451,1.203526,0.907604,'TRYWDAH128F92D4539'
1,4.853912,3.913811,2.286072,2.583405,2.823458,2.354543,2.086963,1.679639,2.226083,1.931475,...,0.695435,0.759757,0.835221,0.742432,0.689854,0.628478,0.72213,0.666933,0.678972,'TRJVUJL128C71968F1'
2,3.923767,2.956465,3.671716,3.08399,2.879255,2.786314,3.120237,2.632083,2.083783,4.158884,...,1.129439,1.191363,1.285582,0.884632,0.889719,1.008565,1.051691,0.9444,1.30365,'TRIDGZT128F428B9F5'
3,13.195897,10.883453,8.830368,6.359973,4.522331,4.36811,3.778535,3.662791,3.603957,3.935505,...,0.945969,0.957197,0.878725,0.957463,0.968238,0.817337,0.753092,0.860108,0.817529,'TRHNLNG128F42717FF'
4,13.785449,10.22963,8.138473,7.748904,7.104238,4.991848,6.260711,4.912233,7.58917,4.993329,...,1.90136,1.805084,1.58057,1.646917,2.346912,1.52814,1.835398,1.661893,2.697921,'TRCNVJH128F427213A'


Feature data corresponding to dataset name: msd-rp-v1.0:


Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_1432,component_1433,component_1434,component_1435,component_1436,component_1437,component_1438,component_1439,component_1440,instanceName
0,0.002736,0.003725,0.007415,0.002806,0.008469,0.007994,0.014948,0.008144,0.01197,0.013622,...,0.026226,0.017471,0.004238,0.009438,0.004225,0.005165,0.00521,0.003043,1e-06,TRYWDAH128F92D4539
1,0.001376,0.012098,0.004903,0.009153,0.009236,0.005353,0.003515,0.005018,0.006722,0.006616,...,0.007779,0.013003,0.012642,0.009386,0.005285,0.003885,0.002716,0.000712,1e-06,TRJVUJL128C71968F1
2,0.006036,0.006173,0.00735,0.008331,0.007874,0.008664,0.007132,0.008543,0.006113,0.007539,...,0.027203,0.022541,0.016961,0.027418,0.021692,0.017372,0.019103,0.011734,1e-06,TRIDGZT128F428B9F5
3,0.018246,0.008646,0.021885,0.041223,0.004092,0.080736,0.041131,0.051823,0.020586,0.033808,...,0.009745,0.006665,0.01078,0.003438,0.003464,0.003668,0.004031,0.001271,1e-06,TRHNLNG128F42717FF
4,0.01251,0.018594,0.027447,0.036388,0.023727,0.038141,0.031223,0.038639,0.025802,0.022678,...,0.060113,0.062213,0.075493,0.061272,0.025676,0.020332,0.020932,0.011308,0.002284,TRCNVJH128F427213A


Feature data corresponding to dataset name: msd-ssd-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_159,component_160,component_161,component_162,component_163,component_164,component_165,component_166,component_167,instanceName
0,3.340253,3.184129,3.467989,3.961744,3.802365,4.048197,3.617043,3.335274,3.111133,2.560841,...,9.814441,12.419846,9.361515,7.508801,6.970781,4.840366,4.327801,2.079462,0.001066,'TRJAPJV128F147E501'
1,3.859655,3.169665,2.339668,2.379815,2.087785,3.719538,3.033265,2.719684,2.258551,2.830153,...,9.991244,7.845734,8.7121,4.766188,3.607995,3.487868,2.994789,1.304786,0.001066,'TRKXUVI12903CF164C'
2,2.366181,3.693143,3.689352,3.922064,3.597496,3.675475,3.36607,3.14971,2.901223,2.544667,...,12.181783,9.146891,7.610539,6.840762,5.609022,4.445043,3.850136,2.175525,0.412306,'TRVSMHB128F933868B'
3,2.049054,4.118874,3.839635,3.426112,2.437866,2.808449,2.486959,2.366762,2.135613,1.633703,...,6.9143,7.312521,6.697511,6.286495,5.046462,3.163909,2.540429,1.371564,0.001066,'TRIIFIJ128F425D096'
4,4.407492,4.824841,4.080729,4.589847,3.908415,3.944584,4.569623,4.178122,4.202005,3.411257,...,18.883988,18.684402,12.268226,9.667586,7.543367,9.391542,11.664551,6.619618,1.100809,'TRKKTRL128F933DE30'


Feature data corresponding to dataset name: msd-trh-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_411,component_412,component_413,component_414,component_415,component_416,component_417,component_418,component_419,instanceName
0,7.663735,6.981061,8.467507,6.571336,5.63396,8.180509,4.314876,4.100904,5.979875,3.670057,...,2.095989,1.726187,2.30842,2.044628,1.671321,2.294742,1.787176,1.507227,2.970258,'TRBRYCM128F42681FE'
1,17.145029,13.643089,10.666042,8.857338,8.446842,10.066524,6.72611,6.230044,6.250445,4.958765,...,1.68291,1.727391,1.796884,1.742186,1.518526,1.72243,1.79298,1.530533,1.903642,'TRSZPGP128F1456CE3'
2,14.535889,11.364023,8.664688,9.72017,8.196153,5.545929,6.525569,5.897498,12.678313,5.139655,...,3.55384,3.645958,5.870902,4.13836,3.004171,2.786075,3.196151,2.980204,2.81041,'TRUHNXZ128F1496B8F'
3,6.608727,5.889982,5.242445,4.54118,5.458274,3.798462,4.941622,5.36169,3.752878,3.690395,...,4.811908,3.375773,3.118121,2.358886,4.413413,2.764471,5.206478,2.746375,3.467102,'TRQODPB128F934C699'
4,9.719172,11.088511,18.8139,6.392022,7.077559,7.246826,4.931072,6.396743,6.104538,5.80938,...,2.832877,3.334148,3.4537,2.614119,2.446143,2.813223,3.107177,2.926267,2.421697,'TRMNUKS128F4230ED5'


Feature data corresponding to dataset name: msd-tssd-v1.0:


Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_1168,component_1169,component_1170,component_1171,component_1172,component_1173,component_1174,component_1175,component_1176,instanceName
0,3.340253,3.184129,3.467989,3.961744,3.802365,4.048197,3.617043,3.335274,3.111133,2.560841,...,11.328043,17.813001,11.465108,8.98148,7.833965,5.010287,5.28922,2.32615,0.001066,TRJAPJV128F147E501
1,3.859655,3.169665,2.339668,2.379815,2.087785,3.719538,3.033265,2.719684,2.258551,2.830153,...,13.345222,9.088343,8.852256,4.873721,3.655026,3.641957,3.029004,1.315704,0.001066,TRKXUVI12903CF164C
2,2.366181,3.693143,3.689352,3.922064,3.597496,3.675475,3.36607,3.14971,2.901223,2.544667,...,14.520047,10.333855,8.751075,9.64756,7.675982,5.67275,5.248331,2.370116,0.444731,TRVSMHB128F933868B
3,2.049054,4.118874,3.839635,3.426112,2.437866,2.808449,2.486959,2.366762,2.135613,1.633703,...,8.493671,8.592187,8.018544,9.971325,6.184466,4.542195,4.012239,2.22955,0.001066,TRIIFIJ128F425D096
4,4.407492,4.824841,4.080729,4.589847,3.908415,3.944584,4.569623,4.178122,4.202005,3.411257,...,22.558385,19.272145,13.098851,11.940146,8.060847,10.481845,15.760706,7.519517,1.205712,TRKKTRL128F933DE30


### C. Do you think these audio feature attribute names are convenient to use as column names?

### C.1 Develop a systematic way to rename columns in the audio feature datasets after you load them in Spark 

In [24]:
# List of dataset names
dataset_names = [
    'msd-jmir-area-of-moments-all-v1.0',
    'msd-jmir-lpc-all-v1.0', 
    'msd-jmir-methods-of-moments-all-v1.0', 
    'msd-jmir-mfcc-all-v1.0',
    'msd-jmir-spectral-all-all-v1.0',
    'msd-jmir-spectral-derivatives-all-all-v1.0',
    'msd-marsyas-timbral-v1.0',
    'msd-mvd-v1.0',
    'msd-rh-v1.0',
    'msd-rp-v1.0',
    'msd-ssd-v1.0',
    'msd-trh-v1.0',
    'msd-tssd-v1.0',   
]

# Mapping dataset names to their corresponding prefixes
dataset_prefix_mapping = {
    'msd-jmir-area-of-moments-all-v1.0': 'AMM_',
    'msd-jmir-lpc-all-v1.0': 'LPC_',
    'msd-jmir-methods-of-moments-all-v1.0': 'MoM_',
    'msd-jmir-mfcc-all-v1.0': 'MFCC_',
    'msd-jmir-spectral-all-all-v1.0': 'Spectral_All_',
    'msd-jmir-spectral-derivatives-all-all-v1.0': 'Spectral_Deri_',
    'msd-marsyas-timbral-v1.0': 'timbral_',
    # The following datasets do not require renaming
    'msd-mvd-v1.0': None,
    'msd-rh-v1.0': None,
    'msd-rp-v1.0': None,
    'msd-ssd-v1.0': None,
    'msd-trh-v1.0': None,
    'msd-tssd-v1.0': None,
}

# Loop through each dataset name
for dataset_name in dataset_names:
    # Set paths dynamically for the attribute and feature files
    attribute_path = f'hdfs:///data/msd/audio/attributes/{dataset_name}.attributes.csv'
    feature_path = f'hdfs:///data/msd/audio/features/{dataset_name}.csv'
    
    # Load attribute data to create the schema
    attribute_data = (spark.read.format("csv")
                      .option("header", False)
                      .load(attribute_path)
                      .toDF("name", "type"))
    
    # Create schema using the attribute data (assuming generate_schema is defined elsewhere)
    feature_schema = generate_schema(attribute_data)
    
    # Load feature data using the generated schema
    feature_data = (spark.read.format("csv")
                    .option("header", False)
                    .option("inferSchema", False)
                    .schema(feature_schema)
                    .load(feature_path))
    
    # Rename columns based on the mapping
    prefix = dataset_prefix_mapping.get(dataset_name, None)
    if prefix:
        # Get the old column names
        old_columns = feature_data.columns
        new_columns = []

        # Iterate over the columns and apply renaming logic, skipping any column that starts with 'MSD'
        for i, old_col in enumerate(old_columns):
            if old_col.startswith('MSD') or old_col.startswith('track_id'):
                new_columns.append(old_col)  # Keep MSD columns unchanged
            else:
                if i < 10:
                    # First 10 columns: std pattern
                    new_columns.append(f"{prefix}std_{i + 1}")
                elif i < 20:
                    # Next 10 columns: avg pattern
                    new_columns.append(f"{prefix}avg_{i - 9}")
                else:
                    # Remaining columns (if more than 20): sequential pattern
                    new_columns.append(f"{prefix}{i + 1}")
        
        # Rename the columns using withColumnRenamed
        for old_col, new_col in zip(old_columns, new_columns):
            feature_data = feature_data.withColumnRenamed(old_col, new_col)
    
    # Show the result for each dataset
    print(f"Feature data corresponding to dataset name: {dataset_name}:")
    # feature_data.printSchema()
    show_as_html(feature_data, 5)

Feature data corresponding to dataset name: msd-jmir-area-of-moments-all-v1.0:


Unnamed: 0,AMM_std_1,AMM_std_2,AMM_std_3,AMM_std_4,AMM_std_5,AMM_std_6,AMM_std_7,AMM_std_8,AMM_std_9,AMM_std_10,...,AMM_avg_2,AMM_avg_3,AMM_avg_4,AMM_avg_5,AMM_avg_6,AMM_avg_7,AMM_avg_8,AMM_avg_9,AMM_avg_10,MSD_TRACKID
0,1.2,3355.0,26270.0,39850000.0,309600000.0,2403000000.0,874900000000.0,2775000000.0,21550000000.0,406400000000000.0,...,5746.0,43470.0,-44220000.0,-337600000.0,-2576000000.0,766500000000.0,3015000000.0,23020000000.0,346000000000000.0,'TRHFHQZ12903C9E2D5'
1,0.9295,6720.0,44100.0,160800000.0,1060000000.0,6985000000.0,7095000000000.0,9545000000.0,62930000000.0,2037000000000000.0,...,11580.0,74040.0,-179200000.0,-1153000000.0,-7420000000.0,6242000000000.0,10370000000.0,66800000000.0,1694000000000000.0,'TRHFHYX12903CAF953'
2,1.883,6712.0,49060.0,160600000.0,1176000000.0,8609000000.0,7083000000000.0,10580000000.0,77440000000.0,2781000000000000.0,...,11580.0,85200.0,-179100000.0,-1316000000.0,-9660000000.0,6233000000000.0,11820000000.0,86800000000.0,2463000000000000.0,'TRHFHAU128F9341A0E'
3,1.884,6722.0,56130.0,161000000.0,1346000000.0,11270000000.0,7112000000000.0,12110000000.0,101400000000.0,4193000000000000.0,...,11600.0,93320.0,-179700000.0,-1459000000.0,-11850000000.0,6262000000000.0,13110000000.0,106600000000.0,3432000000000000.0,'TRHFHLP128F14947A7'
4,1.52,6709.0,53230.0,160500000.0,1295000000.0,10450000000.0,7076000000000.0,11640000000.0,93920000000.0,3751000000000000.0,...,11580.0,93650.0,-179000000.0,-1441000000.0,-11590000000.0,6230000000000.0,12930000000.0,104100000000.0,3248000000000000.0,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-lpc-all-v1.0:


Unnamed: 0,LPC_std_1,LPC_std_2,LPC_std_3,LPC_std_4,LPC_std_5,LPC_std_6,LPC_std_7,LPC_std_8,LPC_std_9,LPC_std_10,...,LPC_avg_2,LPC_avg_3,LPC_avg_4,LPC_avg_5,LPC_avg_6,LPC_avg_7,LPC_avg_8,LPC_avg_9,LPC_avg_10,MSD_TRACKID
0,0.04652,0.1125,0.1386,0.1829,0.09227,0.1261,0.06775,0.1168,0.08311,0.0,...,0.6449,-0.03497,-0.178,-0.2779,0.1096,-0.1404,0.02397,-0.1252,0.0,'TRMMMYQ128F932D901'
1,0.05153,0.1267,0.163,0.1528,0.1127,0.1794,0.07344,0.08478,0.06295,0.0,...,0.7336,0.1151,-0.2112,-0.2369,0.1443,-0.0817,-0.0135,-0.1316,0.0,'TRMMMKD128F425225D'
2,0.04112,0.1182,0.1352,0.09433,0.1448,0.1185,0.1071,0.08826,0.1259,0.0,...,0.3939,-0.2454,-0.08293,-0.32,0.1167,-0.08704,-0.1001,-0.1458,0.0,'TRMMMRX128F93187D9'
3,0.08796,0.1409,0.1781,0.1308,0.1569,0.1263,0.1265,0.1007,0.105,0.0,...,0.459,-0.2506,0.01747,-0.2836,0.2195,-0.05511,-0.02631,-0.09313,0.0,'TRMMMCH128F425532C'
4,0.02573,0.1512,0.1432,0.1431,0.1065,0.1196,0.07276,0.08836,0.05992,0.0,...,0.6478,0.09408,0.06476,-0.01578,0.06642,0.004455,-0.01168,-0.03808,0.0,'TRMMMWA128F426B589'


Feature data corresponding to dataset name: msd-jmir-methods-of-moments-all-v1.0:


Unnamed: 0,MoM_std_1,MoM_std_2,MoM_std_3,MoM_std_4,MoM_std_5,MoM_std_6,MoM_std_7,MoM_std_8,MoM_std_9,MoM_std_10,MSD_TRACKID
0,0.1545,13.11,840.0,41080.0,7108000.0,0.319,33.41,1371.0,64240.0,8398000.0,'TRHFHQZ12903C9E2D5'
1,0.1195,13.02,611.9,43880.0,7226000.0,0.2661,30.26,1829.0,183800.0,31230000.0,'TRHFHYX12903CAF953'
2,0.2326,7.185,362.2,19890.0,3030000.0,0.8854,32.68,1384.0,79190.0,9862000.0,'TRHFHAU128F9341A0E'
3,0.2283,10.3,463.8,24730.0,3336000.0,0.4321,37.56,2047.0,197200.0,32930000.0,'TRHFHLP128F14947A7'
4,0.1841,8.544,359.4,21900.0,3359000.0,0.8438,36.36,2008.0,205400.0,35390000.0,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-mfcc-all-v1.0:


Unnamed: 0,MFCC_std_1,MFCC_std_2,MFCC_std_3,MFCC_std_4,MFCC_std_5,MFCC_std_6,MFCC_std_7,MFCC_std_8,MFCC_std_9,MFCC_std_10,...,MFCC_avg_8,MFCC_avg_9,MFCC_avg_10,MFCC_21,MFCC_22,MFCC_23,MFCC_24,MFCC_25,MFCC_26,MSD_TRACKID
0,59.28,4.15,5.105,2.275,2.185,1.95,1.603,1.335,1.3,1.396,...,1.033,-0.3445,-0.4305,0.1372,0.0312,-0.3132,-0.0815,0.7213,-0.405,'TRHFHQZ12903C9E2D5'
1,46.35,6.888,4.652,4.131,3.225,2.826,2.108,1.922,1.95,1.697,...,-0.2149,3.74,0.2908,1.92,0.125,1.242,-0.5187,0.2548,-0.4073,'TRHFHYX12903CAF953'
2,38.63,3.041,2.504,2.141,1.853,1.906,1.867,1.547,1.491,1.468,...,1.035,1.283,1.408,0.5908,0.4882,0.5522,-0.3168,-0.5887,0.03743,'TRHFHAU128F9341A0E'
3,33.49,5.009,4.56,3.153,2.383,2.4,2.113,1.985,2.075,1.827,...,-1.351,2.264,-0.2145,0.3408,-0.604,1.261,-1.527,0.02701,-0.7334,'TRHFHLP128F14947A7'
4,37.43,4.107,3.167,2.793,2.158,1.926,1.814,1.598,1.622,1.547,...,-1.063,1.167,0.0559,1.682,-0.6607,1.038,-0.1167,0.462,-0.3687,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-spectral-all-all-v1.0:


Unnamed: 0,Spectral_All_std_1,Spectral_All_std_2,Spectral_All_std_3,Spectral_All_std_4,Spectral_All_std_5,Spectral_All_std_6,Spectral_All_std_7,Spectral_All_std_8,Spectral_All_std_9,Spectral_All_std_10,Spectral_All_avg_1,Spectral_All_avg_2,Spectral_All_avg_3,Spectral_All_avg_4,Spectral_All_avg_5,Spectral_All_avg_6,MSD_TRACKID
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,'TRHFHQZ12903C9E2D5'
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,'TRHFHYX12903CAF953'
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,'TRHFHAU128F9341A0E'
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,'TRHFHLP128F14947A7'
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-jmir-spectral-derivatives-all-all-v1.0:


Unnamed: 0,Spectral_Deri_std_1,Spectral_Deri_std_2,Spectral_Deri_std_3,Spectral_Deri_std_4,Spectral_Deri_std_5,Spectral_Deri_std_6,Spectral_Deri_std_7,Spectral_Deri_std_8,Spectral_Deri_std_9,Spectral_Deri_std_10,Spectral_Deri_avg_1,Spectral_Deri_avg_2,Spectral_Deri_avg_3,Spectral_Deri_avg_4,Spectral_Deri_avg_5,Spectral_Deri_avg_6,MSD_TRACKID
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,'TRHFHQZ12903C9E2D5'
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,'TRHFHYX12903CAF953'
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,'TRHFHAU128F9341A0E'
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,'TRHFHLP128F14947A7'
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,'TRHFHFF128F930AC11'


Feature data corresponding to dataset name: msd-marsyas-timbral-v1.0:


Unnamed: 0,timbral_std_1,timbral_std_2,timbral_std_3,timbral_std_4,timbral_std_5,timbral_std_6,timbral_std_7,timbral_std_8,timbral_std_9,timbral_std_10,...,timbral_116,timbral_117,timbral_118,timbral_119,timbral_120,timbral_121,timbral_122,timbral_123,timbral_124,track_id
0,0.112178,0.088561,0.210064,0.094495,-43.977376,3.548018,0.346619,0.580689,-0.226928,0.21863,...,0.000976,0.001009,0.001089,0.001077,0.001037,0.000982,0.001076,0.277586,20.106226,TRSUFWB128F4255BAE
1,0.133675,0.100968,0.229477,0.087755,-42.694424,3.856577,-0.623697,0.510768,0.157851,-0.191616,...,0.00049,0.00048,0.000506,0.000496,0.000461,0.000448,0.000425,0.256402,2.877227,TRSUFSW128F4284B04
2,0.198948,0.17707,0.292963,0.087348,-38.52392,0.690861,-2.264458,1.314971,0.207455,0.143965,...,0.000351,0.000282,0.000288,0.000292,0.000281,0.000289,0.000375,0.080737,0.524488,TRSUFUP128F42561A2
3,0.086491,0.061144,0.132627,0.094506,-43.728943,3.964348,0.94362,1.687271,0.44188,0.541842,...,0.001156,0.001039,0.000951,0.000896,0.000822,0.00077,0.000847,0.277738,7.509386,TRSUFWL128F42956C9
4,0.17056,0.206775,0.451917,0.108458,-62.959637,1.115836,0.030579,-0.110063,-0.17779,0.222752,...,0.001632,0.001724,0.001587,0.001472,0.001094,0.001588,0.001523,0.51268,80.313907,TRSUFQR12903CD493C


Feature data corresponding to dataset name: msd-mvd-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_411,component_412,component_413,component_414,component_415,component_416,component_417,component_418,component_419,instanceName
0,0.226884,0.230364,0.219057,0.175083,0.169087,0.168407,0.138525,0.124724,0.152904,0.123112,...,0.146509,0.147567,0.121105,0.132596,0.144324,0.116596,0.168728,0.157645,0.110822,'TRYWDAH128F92D4539'
1,0.214838,0.181556,0.107927,0.110481,0.120849,0.104712,0.086065,0.068917,0.091953,0.073461,...,0.080255,0.087641,0.098576,0.07729,0.089487,0.068734,0.103242,0.091347,0.073943,'TRJVUJL128C71968F1'
2,0.599207,0.462929,0.334623,0.235651,0.180505,0.192023,0.162616,0.147555,0.147512,0.150728,...,0.131198,0.137329,0.117605,0.111755,0.113637,0.101259,0.091637,0.107954,0.094497,'TRHNLNG128F42717FF'
3,0.161975,0.131392,0.147728,0.146141,0.12519,0.131663,0.127299,0.124623,0.098403,0.16116,...,0.145621,0.153424,0.137743,0.09089,0.105781,0.121976,0.122234,0.114681,0.122273,'TRIDGZT128F428B9F5'
4,0.511186,0.445876,0.395141,0.328717,0.297649,0.20564,0.260945,0.189948,0.317763,0.215753,...,0.172113,0.161366,0.15987,0.17663,0.197752,0.165474,0.149154,0.173748,0.25931,'TRCNVJH128F427213A'


Feature data corresponding to dataset name: msd-rh-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_51,component_52,component_53,component_54,component_55,component_56,component_57,component_58,component_59,instanceName
0,5.965746,5.633224,5.295259,4.188206,4.023376,3.51561,3.344197,3.3399,3.610377,3.100263,...,1.176926,1.383768,1.196357,1.123239,1.292873,1.178216,1.002451,1.203526,0.907604,'TRYWDAH128F92D4539'
1,4.853912,3.913811,2.286072,2.583405,2.823458,2.354543,2.086963,1.679639,2.226083,1.931475,...,0.695435,0.759757,0.835221,0.742432,0.689854,0.628478,0.72213,0.666933,0.678972,'TRJVUJL128C71968F1'
2,3.923767,2.956465,3.671716,3.08399,2.879255,2.786314,3.120237,2.632083,2.083783,4.158884,...,1.129439,1.191363,1.285582,0.884632,0.889719,1.008565,1.051691,0.9444,1.30365,'TRIDGZT128F428B9F5'
3,13.195897,10.883453,8.830368,6.359973,4.522331,4.36811,3.778535,3.662791,3.603957,3.935505,...,0.945969,0.957197,0.878725,0.957463,0.968238,0.817337,0.753092,0.860108,0.817529,'TRHNLNG128F42717FF'
4,13.785449,10.22963,8.138473,7.748904,7.104238,4.991848,6.260711,4.912233,7.58917,4.993329,...,1.90136,1.805084,1.58057,1.646917,2.346912,1.52814,1.835398,1.661893,2.697921,'TRCNVJH128F427213A'


Feature data corresponding to dataset name: msd-rp-v1.0:


Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_1432,component_1433,component_1434,component_1435,component_1436,component_1437,component_1438,component_1439,component_1440,instanceName
0,0.002736,0.003725,0.007415,0.002806,0.008469,0.007994,0.014948,0.008144,0.01197,0.013622,...,0.026226,0.017471,0.004238,0.009438,0.004225,0.005165,0.00521,0.003043,1e-06,TRYWDAH128F92D4539
1,0.001376,0.012098,0.004903,0.009153,0.009236,0.005353,0.003515,0.005018,0.006722,0.006616,...,0.007779,0.013003,0.012642,0.009386,0.005285,0.003885,0.002716,0.000712,1e-06,TRJVUJL128C71968F1
2,0.006036,0.006173,0.00735,0.008331,0.007874,0.008664,0.007132,0.008543,0.006113,0.007539,...,0.027203,0.022541,0.016961,0.027418,0.021692,0.017372,0.019103,0.011734,1e-06,TRIDGZT128F428B9F5
3,0.018246,0.008646,0.021885,0.041223,0.004092,0.080736,0.041131,0.051823,0.020586,0.033808,...,0.009745,0.006665,0.01078,0.003438,0.003464,0.003668,0.004031,0.001271,1e-06,TRHNLNG128F42717FF
4,0.01251,0.018594,0.027447,0.036388,0.023727,0.038141,0.031223,0.038639,0.025802,0.022678,...,0.060113,0.062213,0.075493,0.061272,0.025676,0.020332,0.020932,0.011308,0.002284,TRCNVJH128F427213A


Feature data corresponding to dataset name: msd-ssd-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_159,component_160,component_161,component_162,component_163,component_164,component_165,component_166,component_167,instanceName
0,3.340253,3.184129,3.467989,3.961744,3.802365,4.048197,3.617043,3.335274,3.111133,2.560841,...,9.814441,12.419846,9.361515,7.508801,6.970781,4.840366,4.327801,2.079462,0.001066,'TRJAPJV128F147E501'
1,3.859655,3.169665,2.339668,2.379815,2.087785,3.719538,3.033265,2.719684,2.258551,2.830153,...,9.991244,7.845734,8.7121,4.766188,3.607995,3.487868,2.994789,1.304786,0.001066,'TRKXUVI12903CF164C'
2,2.366181,3.693143,3.689352,3.922064,3.597496,3.675475,3.36607,3.14971,2.901223,2.544667,...,12.181783,9.146891,7.610539,6.840762,5.609022,4.445043,3.850136,2.175525,0.412306,'TRVSMHB128F933868B'
3,2.049054,4.118874,3.839635,3.426112,2.437866,2.808449,2.486959,2.366762,2.135613,1.633703,...,6.9143,7.312521,6.697511,6.286495,5.046462,3.163909,2.540429,1.371564,0.001066,'TRIIFIJ128F425D096'
4,4.407492,4.824841,4.080729,4.589847,3.908415,3.944584,4.569623,4.178122,4.202005,3.411257,...,18.883988,18.684402,12.268226,9.667586,7.543367,9.391542,11.664551,6.619618,1.100809,'TRKKTRL128F933DE30'


Feature data corresponding to dataset name: msd-trh-v1.0:


Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,component_411,component_412,component_413,component_414,component_415,component_416,component_417,component_418,component_419,instanceName
0,7.663735,6.981061,8.467507,6.571336,5.63396,8.180509,4.314876,4.100904,5.979875,3.670057,...,2.095989,1.726187,2.30842,2.044628,1.671321,2.294742,1.787176,1.507227,2.970258,'TRBRYCM128F42681FE'
1,17.145029,13.643089,10.666042,8.857338,8.446842,10.066524,6.72611,6.230044,6.250445,4.958765,...,1.68291,1.727391,1.796884,1.742186,1.518526,1.72243,1.79298,1.530533,1.903642,'TRSZPGP128F1456CE3'
2,14.535889,11.364023,8.664688,9.72017,8.196153,5.545929,6.525569,5.897498,12.678313,5.139655,...,3.55384,3.645958,5.870902,4.13836,3.004171,2.786075,3.196151,2.980204,2.81041,'TRUHNXZ128F1496B8F'
3,6.608727,5.889982,5.242445,4.54118,5.458274,3.798462,4.941622,5.36169,3.752878,3.690395,...,4.811908,3.375773,3.118121,2.358886,4.413413,2.764471,5.206478,2.746375,3.467102,'TRQODPB128F934C699'
4,9.719172,11.088511,18.8139,6.392022,7.077559,7.246826,4.931072,6.396743,6.104538,5.80938,...,2.832877,3.334148,3.4537,2.614119,2.446143,2.813223,3.107177,2.926267,2.421697,'TRMNUKS128F4230ED5'


Feature data corresponding to dataset name: msd-tssd-v1.0:


Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_1168,component_1169,component_1170,component_1171,component_1172,component_1173,component_1174,component_1175,component_1176,instanceName
0,3.340253,3.184129,3.467989,3.961744,3.802365,4.048197,3.617043,3.335274,3.111133,2.560841,...,11.328043,17.813001,11.465108,8.98148,7.833965,5.010287,5.28922,2.32615,0.001066,TRJAPJV128F147E501
1,3.859655,3.169665,2.339668,2.379815,2.087785,3.719538,3.033265,2.719684,2.258551,2.830153,...,13.345222,9.088343,8.852256,4.873721,3.655026,3.641957,3.029004,1.315704,0.001066,TRKXUVI12903CF164C
2,2.366181,3.693143,3.689352,3.922064,3.597496,3.675475,3.36607,3.14971,2.901223,2.544667,...,14.520047,10.333855,8.751075,9.64756,7.675982,5.67275,5.248331,2.370116,0.444731,TRVSMHB128F933868B
3,2.049054,4.118874,3.839635,3.426112,2.437866,2.808449,2.486959,2.366762,2.135613,1.633703,...,8.493671,8.592187,8.018544,9.971325,6.184466,4.542195,4.012239,2.22955,0.001066,TRIIFIJ128F425D096
4,4.407492,4.824841,4.080729,4.589847,3.908415,3.944584,4.569623,4.178122,4.202005,3.411257,...,22.558385,19.272145,13.098851,11.940146,8.060847,10.481845,15.760706,7.519517,1.205712,TRKKTRL128F933DE30


In [25]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()