#### Imports e path dei dataset

In [1]:
import ast
import csv
import importlib
import os
import pathlib

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython import display
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.preprocessing.sequence import pad_sequences

import backendHelper as b

# Locations of the audio clips and of the TSV manifests that associate each
# clip with its transcription. The paths are assembled with os.path.join so
# they resolve on any operating system; on Windows the resulting strings are
# the same as the previous backslash literals.
audio_base_path = os.path.join("Data", "it", "clips")
train_doc_path = os.path.join("Data", "it", "train.tsv")
test_doc_path = os.path.join("Data", "it", "test.tsv")
validation_doc_path = os.path.join("Data", "it", "dev.tsv")

# Fixed seed so the experiment can be reproduced: it is applied to both the
# TensorFlow global generator and NumPy's legacy global generator.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)



# Preprocessing

#### Caricamento dati

In [2]:
def load_data(df1, df2, base_path=None):
    """Build the list of audio file paths and the list of transcriptions.

    Args:
        df1: Iterable of audio file names (for example the ``path`` column of
            a manifest). Each name is joined onto ``base_path``.
        df2: Iterable of transcriptions (for example the ``sentence`` column).
            It is copied into a plain list without modification.
        base_path: Directory prepended to every entry of ``df1``. When left as
            ``None`` the notebook-level ``audio_base_path`` is used, which is
            the behaviour callers that pass only two arguments rely on.

    Returns:
        A tuple ``(audio_path, transcriptions)`` of two lists, in the same
        order as the inputs. The two inputs are consumed independently, so
        they are not required to have the same length.
    """
    if base_path is None:
        # Resolved at call time so a later change to the global is honoured.
        base_path = audio_base_path

    audio_path = [os.path.join(base_path, audio) for audio in df1]
    transcriptions = list(df2)
    return audio_path, transcriptions

# Read the tab-separated manifest of each split into its own DataFrame.
train_df = pd.read_csv(train_doc_path, delimiter='\t')
test_df = pd.read_csv(test_doc_path, delimiter='\t')
validation_df = pd.read_csv(validation_doc_path, delimiter='\t')

# For every split, turn the 'path' and 'sentence' columns into two lists:
# the clip paths (prefixed by load_data) and the matching transcriptions.
train_audio_path, train_audio_transcript = load_data(
    df1=train_df['path'],
    df2=train_df['sentence'],
)
test_audio_path, test_audio_transcript = load_data(
    df1=test_df['path'],
    df2=test_df['sentence'],
)
validation_audio_path, validation_audio_transcript = load_data(
    df1=validation_df['path'],
    df2=validation_df['sentence'],
)


#### Creazione del dataset di TRAIN con tracce audio e trascrizioni + zero padding

In [17]:
# Reload the helper module so edits to backendHelper are picked up without
# restarting the kernel.
importlib.reload(b)
sample_rate = 48000  # not referenced in this cell
max_length = 200000

# Built with os.path.join so the file lands where the loading cell looks for
# it ("PROCESSED/train.csv") regardless of the platform's path separator.
csv_file_path = os.path.join("PROCESSED", "train.csv")
# DataFrame.to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

mfcc = []
transcription = []

for audio, t in zip(train_audio_path, train_audio_transcript):
    mfcc_features = b.extract_mfcc(audio)
    # pad_sequences handles each element along the first axis of
    # mfcc_features as one sequence and zero-pads it at the end up to
    # max_length. Longer sequences are truncated; with the default
    # truncating='pre' the leading values are the ones dropped.
    # NOTE(review): presumably extract_mfcc returns a 2-D array - confirm.
    padded_mfcc = pad_sequences(mfcc_features, maxlen=max_length, padding='post', dtype='float32', value=0.0)
    mfcc.append(padded_mfcc.tolist())
    transcription.append(t)

data = {
    'mfcc': mfcc,
    'transcription': transcription
}

# Each nested list in the 'mfcc' column is written to the CSV as its text
# representation, so it comes back as a string when the file is read again.
df = pd.DataFrame(data)
df.to_csv(csv_file_path, index=False, header=['mfcc', 'transcription'])

#### Creazione del dataset di TEST con tracce audio e trascrizioni + zero padding

In [10]:
# Reload the helper module so edits to backendHelper are picked up without
# restarting the kernel.
importlib.reload(b)
sample_rate = 48000  # not referenced in this cell
max_length = 200000

# Built with os.path.join so the file lands where the loading cell looks for
# it ("PROCESSED/test.csv") regardless of the platform's path separator.
csv_file_path = os.path.join("PROCESSED", "test.csv")
# DataFrame.to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

mfcc = []
transcription = []

for audio, t in zip(test_audio_path, test_audio_transcript):
    mfcc_features = b.extract_mfcc(audio)
    # pad_sequences handles each element along the first axis of
    # mfcc_features as one sequence and zero-pads it at the end up to
    # max_length. Longer sequences are truncated; with the default
    # truncating='pre' the leading values are the ones dropped.
    # NOTE(review): presumably extract_mfcc returns a 2-D array - confirm.
    padded_mfcc = pad_sequences(mfcc_features, maxlen=max_length, padding='post', dtype='float32', value=0.0)
    mfcc.append(padded_mfcc.tolist())
    transcription.append(t)

data = {
    'mfcc': mfcc,
    'transcription': transcription
}

# Each nested list in the 'mfcc' column is written to the CSV as its text
# representation, so it comes back as a string when the file is read again.
df = pd.DataFrame(data)
df.to_csv(csv_file_path, index=False, header=['mfcc', 'transcription'])

#### Creazione del dataset di VALIDATION con tracce audio e trascrizioni + zero padding

In [14]:
# Reload the helper module so edits to backendHelper are picked up without
# restarting the kernel.
importlib.reload(b)
sample_rate = 48000  # not referenced in this cell
max_length = 200000

# Built with os.path.join so the file lands where the loading cell looks for
# it ("PROCESSED/validation.csv") regardless of the platform's path separator.
csv_file_path = os.path.join("PROCESSED", "validation.csv")
# DataFrame.to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

mfcc = []
transcription = []

for audio, t in zip(validation_audio_path, validation_audio_transcript):
    mfcc_features = b.extract_mfcc(audio)
    # pad_sequences handles each element along the first axis of
    # mfcc_features as one sequence and zero-pads it at the end up to
    # max_length. Longer sequences are truncated; with the default
    # truncating='pre' the leading values are the ones dropped.
    # NOTE(review): presumably extract_mfcc returns a 2-D array - confirm.
    padded_mfcc = pad_sequences(mfcc_features, maxlen=max_length, padding='post', dtype='float32', value=0.0)
    mfcc.append(padded_mfcc.tolist())
    transcription.append(t)

# The per-clip matrices are already padded inside the loop. The extra
# pad_sequences call over the whole list was removed because its result was
# never used.
data = {
    'mfcc': mfcc,
    'transcription': transcription
}

# Each nested list in the 'mfcc' column is written to the CSV as its text
# representation, so it comes back as a string when the file is read again.
df = pd.DataFrame(data)
df.to_csv(csv_file_path, index=False, header=['mfcc', 'transcription'])

# Modellazione e Addestramento

#### Caricamento dei dataset

In [2]:
# CSV files written by the preprocessing cells, one per split.
processed_train_path, processed_test_path, processed_validation_path = (
    "PROCESSED/train.csv",
    "PROCESSED/test.csv",
    "PROCESSED/validation.csv",
)

# Load every split with pandas' default CSV settings.
processed_train_df = pd.read_csv(filepath_or_buffer=processed_train_path)
processed_test_df = pd.read_csv(filepath_or_buffer=processed_test_path)
processed_validation_df = pd.read_csv(filepath_or_buffer=processed_validation_path)

#### Padding per la definizione di input_shape

In [20]:
# Sanity check on one stored sample: print the length of the 'mfcc' value in
# the row labelled 14 of the training frame.
# NOTE(review): the column was read from CSV without converters, so this is
# presumably the length of a string rather than a number of frames - confirm.
print(len(processed_train_df.loc[14, 'mfcc']))

# Length every feature sequence is padded to.
max_length = 200000



148210


#### Creazione modello CNN

In [1]:
# NOTE(review): this counts training rows, not distinct transcriptions, so the
# output layer gets one unit per row. Left unchanged because the label
# encoding used by the training dataset is not visible here - confirm.
num_labels = len(processed_train_df['transcription'])

# Conv2D consumes (height, width, channels) samples, so the input shape has to
# be a 3-tuple; the previous bare integer 200000 could not be used to build
# the convolutional stack. The feature matrices were written to CSV as text,
# so one stored value is parsed back to recover its (rows, columns) shape and
# a single channel axis is appended.
# NOTE(review): assumes every row has the same shape as the first one, and
# that the dataset passed to fit() yields tensors with this trailing channel
# axis - TODO confirm.
first_mfcc = np.asarray(
    ast.literal_eval(processed_train_df['mfcc'].iloc[0]),
    dtype='float32',
)
if first_mfcc.ndim != 2:
    raise ValueError(
        f"expected a 2-D MFCC matrix in the 'mfcc' column, got shape {first_mfcc.shape}"
    )
input_shape = first_mfcc.shape + (1,)

# Two convolution layers, pooling and dropout, then a dense head that emits
# one raw score (logit) per label; no softmax is applied here.
# NOTE(review): Flatten followed by Dense(128) on inputs this wide may produce
# a very large weight matrix - check model.summary() before training.
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])

model.summary()

NameError: name 'LSTMmodel' is not defined

#### Compilazione del modello

In [None]:
# The last Dense layer has no activation, so the loss is told to expect raw
# logits. Sparse categorical cross-entropy takes integer class ids as targets.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Adam with its default hyper-parameters; accuracy is tracked during training.
model.compile(
    loss=logits_loss,
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

#### Addestramento

In [None]:
EPOCHS = 10

# Stop early once the monitored quantity (EarlyStopping's default monitor) has
# not improved for 2 consecutive epochs; verbose=1 logs when that happens.
early_stopping = tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)

# NOTE(review): train_ds and val_ds are not defined in the cells visible here;
# they must be built from the processed data before this cell can run.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    # fit() documents `callbacks` as a list of callback instances.
    callbacks=[early_stopping],
)