In [1]:
# Initializing Spark
import findspark
findspark.init()

# Importing PySpark related
from pyspark import SparkContext
from pyspark.sql import SparkSession
import collections

In [2]:
# Importing other packages
import io
import os
import warnings

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from IPython import display
from pydub import AudioSegment
from tqdm.notebook import tqdm

In [8]:
# Build (or reuse) the single-node Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .master('local')
    .appName('AudioAttempt')
    .config('spark.executor.memory', '2gb')
    .getOrCreate()
)
# Disabled builder option kept for reference (it was chained before .getOrCreate()):
#     .config('spark.jars', 'jars/hadoop-azure-3.2.1.jar,jars/azure-storage-8.6.0.jar' ) \

In [9]:
# Linking to Azure storage blob
sc = spark.sparkContext

# The storage account key is a secret and must never be stored in the notebook.
# It is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable instead.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and rotated.
azureAccountKey = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")

spark.conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
if azureAccountKey:
    spark.conf.set("fs.azure.account.key.transcribestorage.blob.core.windows.net", azureAccountKey)
else:
    # The cells that read local files do not need the key, so warn rather than
    # abort the whole notebook run.
    warnings.warn("AZURE_STORAGE_ACCOUNT_KEY is not set; Azure blob storage access is not configured.")

In [10]:
# Creating Spark Context
# Taken from the same `spark.sparkContext` attribute as `sc`; the cells below
# use the name `context`.
context = spark.sparkContext

In [6]:
# wasb://speechdata@transcribestorage.blob.core.windows.net/

In [7]:
# Attempt to read the Azure container through the wasbs:// scheme.
# NOTE: the recorded output of this cell is a Py4JJavaError,
# "No FileSystem for scheme: wasbs".
context.binaryFiles('wasbs://speechdata@transcribestorage.blob.core.windows.net')

Py4JJavaError: An error occurred while calling o17.binaryFiles.
: java.io.IOException: No FileSystem for scheme: wasbs
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:500)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:469)
	at org.apache.spark.SparkContext$$anonfun$binaryFiles$1.apply(SparkContext.scala:927)
	at org.apache.spark.SparkContext$$anonfun$binaryFiles$1.apply(SparkContext.scala:922)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.SparkContext.withScope(SparkContext.scala:699)
	at org.apache.spark.SparkContext.binaryFiles(SparkContext.scala:922)
	at org.apache.spark.api.java.JavaSparkContext.binaryFiles(JavaSparkContext.scala:258)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [11]:
# Function to turn binary audio into numerical data
def getTrack(binary, frame_rate = 44100):
    """Decode MP3 bytes into a float32 NumPy array of samples.

    Parameters
    ----------
    binary : bytes
        Raw contents of an MP3 file.
    frame_rate : int, optional
        Sampling rate (Hz) the decoded audio is converted to. Defaults to
        44100, the rate the rest of the notebook assumes.

    Returns
    -------
    numpy.ndarray
        One-dimensional float32 array of the decoded samples.
        NOTE(review): for multi-channel input pydub is expected to return the
        channels interleaved in this array; confirm the inputs are mono.
    """
    segment = AudioSegment.from_file(io.BytesIO(binary), format = "mp3")
    # Passing frame_rate to from_file only affects raw/PCM input, so the rate
    # has to be enforced explicitly after decoding.
    segment = segment.set_frame_rate(frame_rate)
    return np.array(segment.get_array_of_samples()).astype(np.float32)

In [12]:
# Function to apply Fourier transformation to numerical audio data
def getBands(track):
    """Return the STFT magnitude of each consecutive 960-sample window of `track`.

    Only complete windows are processed; samples left over after the last
    full window are ignored. The result is a list with one array per window.
    """
    windowSize = 960
    windowCount = int(len(track) / windowSize)

    spectra = []
    for index in range(windowCount):
        window = track[index * windowSize:(index + 1) * windowSize]
        spectra.append(np.abs(librosa.stft(window, hop_length = 961)))
    return spectra

In [13]:
# Load the local sample clip through Spark's binaryFiles reader
binaryAudio = context.binaryFiles(path = 'data/testing/sample-000000.mp3')

In [14]:
# https://stackoverflow.com/questions/43941716/how-to-play-mp3-from-bytes
# Keep only the second element of each record in binaryAudio
filteredAudio = binaryAudio.values()

In [15]:
# Decode every record with getTrack
numericalAudio = filteredAudio.map(getTrack)

In [16]:
# Compute the per-window spectra of every decoded track with getBands
fourierAudio = numericalAudio.map(getBands)

In [17]:
# collect() returns a list with one entry per input file. A single file was
# loaded, so element 0 is taken from both results: `converted` is then the
# sample array itself (what librosa.get_duration and display.Audio are given
# below) rather than a one-element list, consistent with `transformed`.
converted = numericalAudio.collect()[0]
transformed = fourierAudio.collect()[0]

In [36]:
# Total number of spectral values across all windows, shown as a 1-tuple shape
np.asarray(transformed).ravel().shape

(210125,)

In [18]:
# Duration of the decoded clip, interpreting the samples at 44100 Hz.
# NOTE(review): presumably `converted` must be a NumPy array here, not the list
# returned by collect() — confirm against the cell that defines it.
librosa.get_duration(y = converted, sr = 44100)

4.466938775510204

In [None]:
# Render an inline audio player for the decoded samples at 44100 Hz
display.Audio(converted, rate = 44100)

In [None]:
# Plot the magnitude spectrum of the fourth window on a log-frequency axis.
# sr is passed explicitly so the frequency labels match the 44100 Hz rate used
# throughout the notebook instead of specshow's default sampling rate.
librosa.display.specshow(transformed[3], sr = 44100, y_axis = 'log', cmap='Spectral')

In [None]:
# Find longest audio file
# (`os` is imported in the imports cell at the top of the notebook.)
maxLen = 0
maxFile = ''
baseDir = 'data/cv-valid-train/cv-valid-train/'

for fileName in tqdm(os.listdir(baseDir)):
    filePath = os.path.join(baseDir, fileName)
    # os.listdir also returns sub-directories; skip anything that is not a
    # regular file so it is never handed to librosa.
    if not os.path.isfile(filePath):
        continue
    fileLen = librosa.get_duration(filename=filePath, sr=44100)
    # Strict comparison keeps the first file seen when several share the maximum
    if fileLen > maxLen:
        maxLen = fileLen
        maxFile = fileName

# Try locally, on azure and on WU server and compare runtime in final report

In [None]:
# Show the longest duration found and the name of the file it belongs to
maxLen, maxFile