In [0]:
# PART 1 - SET UP CONNECTIONS

In [0]:
# packages used
import datetime
import math
from multiprocessing.pool import ThreadPool
import numpy as np
import os
import pandas as pd
from pyspark.sql import functions as F
import h5py # first install on cluster
import string # first install on cluster
import s3fs # first install on cluster
import threading

In [0]:
# S3 CONFIGURATIONS
# Read the S3 bucket directly from the Databricks cluster:
# https://docs.databricks.com/data/data-sources/aws/amazon-s3.html
#
# The keys are taken from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables when those are set, so real credentials do not have to be saved inside the
# notebook. The literals are only the fallback used when a variable is missing.
access_key = os.environ.get('AWS_ACCESS_KEY_ID', '<access_key>')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '<secret_key>')

# Both settings go on the same Hadoop configuration object, which is what the s3a://
# paths used by spark.read below rely on.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", access_key)
hadoop_conf.set("fs.s3a.secret.key", secret_key)

aws_bucket_name = "bucket-msd-subset"
# Trailing slash included: later cells append relative paths such as 'AdditionalFiles/...'.
path_to_bucket = 's3a://{}/'.format(aws_bucket_name)

In [0]:
# mounting the folder with sqlite DBs onto DBFS because I wasn't able to access directly
# folder_to_mount = path_to_bucket + '/AdditionalFiles'
# try:
#   dbutils.fs.mount(folder_to_mount, '/mnt/temp/')
# except:
#   print('Folder may already be mounted.')
# display(dbutils.fs.ls('/mnt/temp/'))
# dbutils.fs.unmount('/mnt/temp/')

In [0]:
# AZURE SQL DATABASE CONFIGURATIONS
# https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/sql-databases#python-example
jdbcHostname = 'azure-sql-server-msd.database.windows.net'
jdbcPort = 1433
jdbcDatabase = 'msd-subset'
jdbcUsername = 'derekfunk'
# The password is read from the MSD_JDBC_PASSWORD environment variable when it is set, so
# the real value never has to be stored in the notebook; the literal is only the fallback.
jdbcPassword = os.environ.get('MSD_JDBC_PASSWORD', '<password>')

# Connecting at server level and creating databases from there did not work; only direct
# connections to a pre-existing database have succeeded, hence the database in the URL.
jdbcUrl = f"jdbc:sqlserver://{jdbcHostname}:{jdbcPort};database={jdbcDatabase}"

# Property bag in the shape expected by spark.read.jdbc(..., properties=...).
connectionProperties = {
  "user": jdbcUsername,
  "password": jdbcPassword,
  "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}

# pushdown_query = "(SELECT * FROM dbo.Persons) test"
# df = spark.read.jdbc(url=jdbcUrl, table=pushdown_query, properties=connectionProperties)
# display(df)

In [0]:
# PART 2 - CREATE DATAFRAMES FROM FOLDER Additional Files (non-track data)

In [0]:
# 1 - msd_artist
# Latitude/longitude are left out because the join needed to attach them is complicated.
path_to_file_msd_artist = f'{path_to_bucket}AdditionalFiles/unique_artists.txt'

# The file is read as '<SEP>'-delimited CSV without column names, so the columns are
# addressed by Spark's positional names (_c0, _c1, ...). Column _c2 is not loaded.
artist_columns = [
  F.col('_c0').alias('artist_id'),
  F.col('_c1').alias('artist_mbid'),
  F.col('_c3').alias('artist_name'),
]
df_msd_artist = (
  spark.read
  .format('csv')
  .option('delimiter', '<SEP>')
  .load(path_to_file_msd_artist)
  .select(*artist_columns)
)

In [0]:
# 2 - msd_artist_similarity
# skipping since sqlite

In [0]:
# 3 - msd_r_term
path_to_file_msd_r_term = f'{path_to_bucket}AdditionalFiles/unique_terms.txt'

# One row per line of the text file, in a single column called 'value'.
term_lines = spark.read.text(path_to_file_msd_r_term)

# Attach a generated numeric id (shifted by +1) in front of each term.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids, but Spark
# does not promise that they are consecutive — confirm that is acceptable for this table.
df_msd_r_term = term_lines.select(
  (F.monotonically_increasing_id() + 1).alias('id'),
  'value',
)

In [0]:
# 4 - msd_r_mbtag
# Lines of unique_mbtags.txt that are dropped before loading.
# NOTE(review): presumably these entries are malformed or caused trouble downstream — confirm.
bad_values = [
  '1 13 165900 150 7672 22647 34612 48720 59280 74602 87545 95495 107182 131087 141522 153710',
  '1 7 186240 183 23558 41608 89158 111733 150833 169883',
  'ਪੰਜਾਬੀ',
  'ਭੰਗੜਾ',
  '香港歌手',
]

path_to_file_msd_r_mbtag = f'{path_to_bucket}AdditionalFiles/unique_mbtags.txt'

# Exclude the listed lines first, then number what is left (generated id shifted by +1).
is_bad_value = F.col('value').isin(bad_values)
df_msd_r_mbtag = (
  spark.read.text(path_to_file_msd_r_mbtag)
  .filter(~is_bad_value)
  .select(
    (F.monotonically_increasing_id() + 1).alias('mbtag_id'),
    F.col('value').alias('mbtag_name'),
  )
)

In [0]:
# 5 - msd_artist_term
# skipping since sqlite

In [0]:
# 6 - msd_artist_mbtag
# skipping since sqlite

In [0]:
# write all non-track tables
# Target table name -> DataFrame holding its rows.
tables_non_track = {
  'msd_artist': df_msd_artist,
  'msd_r_term': df_msd_r_term,
  'msd_r_mbtag': df_msd_r_mbtag
}

# No save mode is set, so Spark's default applies to each write.
for table_name, table_df in tables_non_track.items():
  jdbc_writer = (
    table_df.write
    .format('jdbc')
    .option('url', jdbcUrl)
    .option('dbtable', table_name)
    .option('user', jdbcUsername)
    .option('password', jdbcPassword)
  )
  jdbc_writer.save()

In [0]:
# PART 3 - CREATE DATAFRAMES FROM FOLDER data (track data)
# run this in DB to start over: drop table if exists msd_artist, msd_r_mbtag, msd_r_term, msd_track, msd_bar, msd_beat, msd_section, msd_tatum, msd_segment
# run these to clear data but keep structure:
# delete from msd_track

In [0]:
# create all track tables without data
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType

# Columns of msd_track in table order, as (column name, Spark type class).
# upload_timestamp (TimestampType) is currently not part of the table.
_msd_track_columns = [
  ('track_id', StringType),
  ('analysis_sample_rate', IntegerType),
  ('audio_md5', StringType),
  ('danceability', FloatType),
  ('duration', FloatType),
  ('end_of_fade_in', FloatType),
  ('energy', FloatType),
  ('key', IntegerType),
  ('key_confidence', FloatType),
  ('loudness', FloatType),
  ('mode', IntegerType),
  ('mode_confidence', FloatType),
  ('start_of_fade_out', FloatType),
  ('tempo', FloatType),
  ('time_signature', IntegerType),
  ('time_signature_confidence', FloatType),
  ('artist_id', StringType),
  ('release', StringType),
  ('song_hotness', FloatType),
  ('title', StringType),
  ('track_7digitalid', StringType),
  ('year', IntegerType),
]

# Each type class is instantiated per field, giving one StructField per column.
schema_msd_track = StructType([
  StructField(column_name, spark_type())
  for column_name, spark_type in _msd_track_columns
])

def _timed_event_schema(event):
  """Return the five-column schema shared by the bar, beat, section and tatum tables.

  `event` is the singular table suffix (e.g. 'bar'); it is substituted into the
  column names: <event>_id, track_id, track_<event>_id, <event>_confidence, <event>_start.
  """
  return StructType([
    StructField(f'{event}_id', IntegerType()),
    StructField('track_id', StringType()),
    StructField(f'track_{event}_id', IntegerType()),
    StructField(f'{event}_confidence', FloatType()),
    StructField(f'{event}_start', FloatType()),
  ])


schema_msd_bar = _timed_event_schema('bar')

schema_msd_beat = _timed_event_schema('beat')

schema_msd_section = _timed_event_schema('section')

schema_msd_tatum = _timed_event_schema('tatum')

# msd_segment: identification and loudness columns first, then twelve p* columns and
# twelve t* columns (the loader fills them from segments_pitches and segments_timbre).
_segment_head_fields = [
  StructField('segment_id', IntegerType()),
  StructField('track_id', StringType()),
  StructField('track_segment_id', IntegerType()),
  StructField('segment_confidence', FloatType()),
  StructField('segment_start', FloatType()),
  StructField('segment_loudness_max', FloatType()),
  StructField('segment_loudness_max_time', FloatType()),
  StructField('segment_loudness_start', FloatType()),
]
# p1..p12 followed by t1..t12, all floats.
_segment_vector_fields = [
  StructField(f'{prefix}{position}', FloatType())
  for prefix in ('p', 't')
  for position in range(1, 13)
]

schema_msd_segment = StructType(_segment_head_fields + _segment_vector_fields)

# Target table name -> schema, for every track-level table.
track_schemas = {
  'msd_track': schema_msd_track,
  'msd_bar': schema_msd_bar,
  'msd_beat': schema_msd_beat,
  'msd_section': schema_msd_section,
  'msd_tatum': schema_msd_tatum,
  'msd_segment': schema_msd_segment
}

# Write a zero-row DataFrame per table so each table exists with the right columns
# before any track data is appended.
for table_name, table_schema in track_schemas.items():
  empty_table = spark.createDataFrame([], table_schema)
  (
    empty_table.write
    .format('jdbc')
    .option('url', jdbcUrl)
    .option('dbtable', table_name)
    .option('user', jdbcUsername)
    .option('password', jdbcPassword)
    .save()
  )

In [0]:
# insert track data
alphabet = tuple(string.ascii_uppercase)
data_root = path_to_bucket + 'data/'

# data/A/<A-Z>/<A-Z>/
folder_group_1 = [
  '{}A/{}/{}/'.format(data_root, level_2, level_3)
  for level_2 in alphabet
  for level_3 in alphabet
]
# data/B/<A-H>/<A-Z>/
folder_group_2 = [
  '{}B/{}/{}/'.format(data_root, level_2, level_3)
  for level_2 in alphabet[:8]
  for level_3 in alphabet
]
# data/B/I/<A-J>/
folder_group_3 = [
  '{}B/I/{}/'.format(data_root, level_3)
  for level_3 in alphabet[:10]
]
folders = [*folder_group_1, *folder_group_2, *folder_group_3]

# s3fs handle used to list and open the individual track files.
s3 = s3fs.S3FileSystem(anon=False, key=access_key, secret=secret_key)

# Arrow-based pandas -> Spark conversion is switched off for the loads that follow.
spark.conf.set("spark.sql.execution.arrow.enabled", "false")

In [0]:
def _to_python(value):
  """Return a NumPy scalar as the equivalent built-in Python object; pass anything else through."""
  return value.item() if isinstance(value, np.generic) else value


def _null_if_zero(value):
  """Return None when `value` equals 0 (stored as NULL), otherwise the value as a Python object."""
  return None if value == 0 else _to_python(value)


def _read_track_row(h5):
  """Build one msd_track row (in schema_msd_track column order) from an open HDF5 track file.

  Returns `(track_id, row)` where `track_id` is the decoded id string and `row` is a list of
  built-in Python values (str / int / float / None).
  """
  # Each 'songs' table is read once; fields are then picked by position from its first record.
  analysis = h5['analysis/songs'][0]
  metadata = h5['metadata/songs'][0]
  musicbrainz = h5['musicbrainz/songs'][0]

  track_id = analysis[-1].decode("utf-8")
  hotness = metadata[16]
  row = [
    track_id,
    _to_python(analysis[0]),                          # analysis_sample_rate
    analysis[1].decode("utf-8"),                      # audio_md5
    _null_if_zero(analysis[2]),                       # danceability (0 -> NULL)
    _to_python(analysis[3]),                          # duration
    _to_python(analysis[4]),                          # end_of_fade_in
    _null_if_zero(analysis[5]),                       # energy (0 -> NULL)
    _to_python(analysis[21]),                         # key
    _to_python(analysis[22]),                         # key_confidence
    float(analysis[23]),                              # loudness
    _to_python(analysis[24]),                         # mode
    _to_python(analysis[25]),                         # mode_confidence
    _to_python(analysis[26]),                         # start_of_fade_out
    _to_python(analysis[27]),                         # tempo
    _to_python(analysis[28]),                         # time_signature
    _to_python(analysis[29]),                         # time_signature_confidence
    metadata[4].decode("utf-8"),                      # artist_id
    metadata[14].decode("utf-8"),                     # release
    None if math.isnan(hotness) else _to_python(hotness),  # song_hotness (NaN -> NULL)
    metadata[18].decode("utf-8"),                     # title
    _to_python(metadata[19]),                         # track_7digitalid
    _null_if_zero(musicbrainz[1]),                    # year (0 -> NULL)
  ]
  return track_id, row


def _event_batch(track_id, *columns):
  """Stack per-event columns behind [NULL id, track_id, 1-based position within the track].

  `columns` are arrays with one entry (or one row) per event; the first one fixes the count.
  The leading all-None column makes the stacked result an object array.
  """
  count = len(columns[0])
  return np.column_stack((
    np.full(count, None),
    np.full(count, track_id),
    np.arange(1, count + 1),
    *columns
  ))


def _append_rows(rows, table_name, schema):
  """Append a 2-D array of rows to `table_name` over JDBC; an empty array is skipped."""
  if len(rows) == 0:
    return
  spark.createDataFrame(pd.DataFrame(rows), schema=schema).write \
    .mode('append') \
    .format('jdbc') \
    .option('url', jdbcUrl) \
    .option('dbtable', table_name) \
    .option('user', jdbcUsername) \
    .option('password', jdbcPassword) \
    .save()


def write_folder(folder):
  """Load every track file in one S3 folder and append its rows to the six track tables.

  Parameters
  ----------
  folder : str
      Folder path that is passed to `s3.ls`; each listed entry is opened as an HDF5 file.

  The rows of all files in the folder are collected in memory first and then written with
  one JDBC append per table (msd_track, msd_bar, msd_beat, msd_section, msd_tatum,
  msd_segment). Returns None. Errors from S3, h5py or Spark are not caught here.
  """
  current_list_of_files = s3.ls(folder)
  if not current_list_of_files:
    # Nothing listed for this folder, so there is nothing to write.
    return

  # Table name -> (schema, HDF5 dataset prefix) for the four tables that share one layout.
  timed_event_tables = {
    'msd_bar': (schema_msd_bar, 'bars'),
    'msd_beat': (schema_msd_beat, 'beats'),
    'msd_section': (schema_msd_section, 'sections'),
    'msd_tatum': (schema_msd_tatum, 'tatums'),
  }

  # Per-file pieces are appended to lists and combined once after the loop, instead of
  # re-copying the whole accumulated array for every file.
  track_rows = []
  event_batches = {table_name: [] for table_name in timed_event_tables}
  segment_batches = []

  for file_key in current_list_of_files:
    url_to_current_file = 's3a://' + file_key
    # Both the S3 stream and the HDF5 handle are closed as soon as the file has been read;
    # everything needed is copied into NumPy arrays / Python values inside the block.
    with s3.open(url_to_current_file, 'rb') as remote_file, h5py.File(remote_file, 'r') as h5:
      track_id, track_row = _read_track_row(h5)
      track_rows.append(track_row)

      for table_name, (_, prefix) in timed_event_tables.items():
        event_batches[table_name].append(_event_batch(
          track_id,
          np.array(h5[f'analysis/{prefix}_confidence']),
          np.array(h5[f'analysis/{prefix}_start'])
        ))

      segment_batches.append(_event_batch(
        track_id,
        np.array(h5['analysis/segments_confidence']),
        np.array(h5['analysis/segments_start']),
        np.array(h5['analysis/segments_loudness_max']),
        np.array(h5['analysis/segments_loudness_max_time']),
        np.array(h5['analysis/segments_loudness_start']),
        np.array(h5['analysis/segments_pitches']),
        np.array(h5['analysis/segments_timbre'])
      ))

  # dtype=object keeps the Python-typed values as they are, whether or not a row contains None.
  _append_rows(np.array(track_rows, dtype=object), 'msd_track', schema_msd_track)
  for table_name, (table_schema, _) in timed_event_tables.items():
    _append_rows(np.vstack(event_batches[table_name]), table_name, table_schema)
  _append_rows(np.vstack(segment_batches), 'msd_segment', schema_msd_segment)

In [0]:
from multiprocessing.pool import ThreadPool

# One worker thread per folder, capped at 1000 (the previous fixed pool size), so no idle
# threads are created; at least one worker is required by ThreadPool.
worker_count = max(1, min(1000, len(folders)))

# The with-block shuts the pool down once map() has returned, i.e. after every folder has
# been processed, so worker threads do not linger in the kernel. An exception raised by
# any write_folder call is re-raised here by map().
with ThreadPool(worker_count) as pool:
  pool.map(write_folder, folders)