In [1]:
# Initializing Spark
import findspark
findspark.init()

# Importing PySpark related
from pyspark import SparkContext
from pyspark.sql import SparkSession
import collections

In [2]:
# Importing TensorFlow
import tensorflow as tf
import tensorflowonspark as tfos
from tensorflowonspark import TFCluster # Needs package 'packaging' to be installed manually to run properly

from tensorflow.keras import layers, models
import tensorflow.keras as keras

In [3]:
# Importing other packages
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython import display

# Hiding pysoundfile warning generated by librosa
import warnings
warnings.filterwarnings("ignore", message="pysoundfile") 

In [None]:
# Testing loading of audio data
data, sampleRate = librosa.load('./data/cv-valid-train/cv-valid-train/sample-000001.mp3', sr=16000)

In [None]:
display.Audio(data, rate=16000)

In [4]:
text = pd.read_csv('./data/cv-valid-train.csv')

In [5]:
# Order the rows by transcript length (shortest first) and renumber them
# 0..n-1. reindex already returns a new DataFrame, so no pd.DataFrame wrapper
# is needed, and reset_index(drop=True) discards the old index directly
# instead of materialising it as a column and dropping it again.
textOnly = text.reindex(text['text'].str.len().sort_values().index).reset_index(drop=True)

In [6]:
firstTest = textOnly.head(100)

In [None]:
firstTest

In [11]:
# Load one clip and convert it into a fixed-size array for the network
def loadAudio(filename):
    """Read an audio file and return its padded magnitude bands.

    `filename` is appended to './data/cv-valid-train/' and decoded by
    librosa at 16 kHz. The samples are passed through getBands, the result
    is padded by padAudio to 1000 rows, and the array is reshaped to
    (1, 1000, 1025), the input shape declared for the model.
    """
    samples, _ = librosa.load(f'./data/cv-valid-train/{filename}', sr=16000)
    fixedLength = padAudio(getBands(samples), 1000)
    # The reshape only succeeds when the padded array holds exactly
    # 1000 * 1025 values.
    return fixedLength.reshape(1, 1000, 1025)

In [18]:
# One loadAudio result per row of firstTest, in row order
X = [
    loadAudio(row['filename'])
    for _, row in tqdm(firstTest.iterrows(), total=len(firstTest))
]

HBox(children=(IntProgress(value=0), HTML(value='')))




In [19]:
X = np.array(X)

In [20]:
# Turn every transcript into a row of character codes: upper-case it, swap
# spaces for underscores, right-pad with spaces up to 200 characters, then
# map each character through ord(). Transcripts longer than 200 characters
# are not shortened.
textAsList = [
    [ord(char) for char in txt.upper().replace(' ', '_').ljust(200, ' ')]
    for txt in firstTest['text']
]

In [21]:
y = np.array(textAsList)

In [22]:
y

array([[65, 77, 69, ..., 32, 32, 32],
       [65, 77, 69, ..., 32, 32, 32],
       [65, 77, 69, ..., 32, 32, 32],
       ...,
       [77, 65, 89, ..., 32, 32, 32],
       [84, 82, 89, ..., 32, 32, 32],
       [65, 78, 68, ..., 32, 32, 32]])

In [None]:
# Plotting audio wave
plt.figure(figsize=(12, 4))
librosa.display.waveplot(data, sr = sampleRate)

In [None]:
test1 = data[0:320]

In [None]:
np.abs([i[0] for i in librosa.stft(data[:320], hop_length = 321)])

In [9]:
def getBands(data):
    """Return STFT magnitudes for consecutive 320-sample chunks of `data`.

    The signal is cut into int(len(data) / 320) non-overlapping chunks;
    samples left over after the last full chunk are ignored. Each chunk is
    passed to librosa.stft with hop_length=321 (one more than the chunk
    length), and the absolute value of the first column of the result is
    kept. The per-chunk vectors are stacked into a single array, one row
    per chunk. With fewer than 320 samples the result is an empty array.
    """
    chunkCount = int(len(data) / 320)
    magnitudes = []
    for chunk in range(chunkCount):
        start = chunk * 320
        spectrum = librosa.stft(data[start:start + 320], hop_length = 321)
        # Keep only the first STFT column of every frequency row.
        magnitudes.append(np.abs([row[0] for row in spectrum]))
    return np.array(magnitudes)

In [None]:
test1[0]

In [None]:
bands = getBands(data)

In [None]:
bands.shape

In [None]:
librosa.display.specshow(bands[0], y_axis = 'log', cmap='Spectral')

In [10]:
# Function to pad audio arrays to equal length for neural net
def padAudio(track, desired):
    """Zero-pad a 2-D array along axis 0 so that it has `desired` rows.

    Parameters
    ----------
    track : numpy.ndarray
        2-D array; padding rows are appended after the existing ones.
    desired : int
        Number of rows the result must have.

    Returns
    -------
    numpy.ndarray
        New array of shape (desired, track.shape[1]). The first
        track.shape[0] rows equal `track`; the remaining rows are 0.0.

    Raises
    ------
    ValueError
        If `track` already has more than `desired` rows.
    """
    padding = desired - track.shape[0]
    if padding < 0:
        raise ValueError(
            f'track has {track.shape[0]} rows, more than the desired {desired}'
        )
    # Build the filler as a single 2-D block. A list of rows built over
    # range(padding) collapses to a 1-D empty array when padding == 0, which
    # np.concatenate rejects against a 2-D track; taking the width from
    # track.shape[1] also works when the track has no rows at all.
    appendix = np.zeros((padding, track.shape[1]))
    return np.concatenate((track, appendix), axis = 0)

In [None]:
# Zero-pad the waveform up to 160000 samples with one concatenate instead of
# appending a single sample per iteration (every concatenate copies the whole
# array). Waveforms that are already long enough are left untouched.
missing = 160000 - data.shape[0]
if missing > 0:
    data = np.concatenate((data, np.zeros(missing)), axis=0)

In [None]:
display.Audio(data, rate=16000)

In [None]:
data.shape

In [None]:
bands.shape

In [None]:
lel = padAudio(bands, 1000)

In [None]:
lel.reshape(1, 1000, 1025).shape

In [None]:
# reshaped = np.array(list(lel.reshape(-1,)))

In [None]:
# reshaped.shape

In [None]:
# plt.figure(figsize=(12, 4))
# librosa.display.waveplot(reshaped, sr = sampleRate)

In [None]:
# display.Audio(reshaped, rate = 44100)

In [None]:
# Scan clips 0..999 and report the largest row count that getBands produces
# for any of them.
maxLen = 0
for clipNumber in tqdm(range(1000)):
    clipPath = f'./data/cv-valid-train/cv-valid-train/sample-{"{:06d}".format(clipNumber)}.mp3'
    clipSamples, _ = librosa.load(clipPath, res_type = 'kaiser_fast', sr=16000)
    maxLen = max(maxLen, getBands(clipSamples).shape[0])

print(maxLen)

In [None]:
print("{:06d}".format(100))

In [23]:
# Convolutional front end followed by an LSTM over the 1000-row axis.
# The input is channels_first: (channels=1, 1000, 1025).
model = models.Sequential()
model.add(layers.Input(shape=(1, 1000, 1025)))
# Four conv/pool stages. The pool size is [1,5], so only the last axis
# shrinks (1025 -> 205 -> 41 -> 8 -> 1) while the 1000-long axis is kept.
for filters in (32, 64, 128, 256):
    model.add(layers.Conv2D(filters, kernel_size=[5,5], padding='same', activation='relu', data_format='channels_first'))
    model.add(layers.MaxPooling2D(pool_size=[1,5], data_format='channels_first'))
# The stack ends at (256, 1000, 1). Reshape only reinterprets the flat
# buffer and does not move axes, so reshaping straight to (1000, 256) would
# hand each LSTM step a mix of values from different rows and channels.
# Swap the channel axis and the 1000-long axis first so that every step
# really is the 256 channel values of one row.
model.add(layers.Permute((2, 1, 3)))
model.add(layers.Reshape((1000,256)))
model.add(layers.LSTM(256, activation='tanh', dropout=0.2, recurrent_dropout=0.2))
model.add(layers.Dense(200, activation='softmax'))

In [25]:
# NOTE(review): the model ends in a 200-way softmax, while the targets in y
# are ord() character codes for 200 text positions. categorical_crossentropy
# presumably treats each row of y as a distribution over those 200 outputs;
# confirm this loss/target pairing is intended before relying on the reported
# accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 1000, 1025)    832       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 1000, 205)     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 1000, 205)     51264     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 1000, 41)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 128, 1000, 41)     204928    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 128, 1000, 8)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 256, 1000, 8)      8

In [None]:
model.fit(X, y, verbose=1)

Train on 100 samples


In [None]:
model.predict(X)