# Model training and evaluation

Link to datatset: [open here](https://bark.phon.ioc.ee/voxlingua107/)

Python 3.12.7 recommended

Folder structure:
```
├── data
│   ├── en
│   │   ├── en_1.wav
│   │   ├── en_2.wav
│   │   ├── ...
│   ├── fr
│   │   ├── fr_1.wav
│   │   ├── fr_2.wav
│   │   ├── ...
│   ├── ...
language.ipynb
```

In [None]:
# DOWNLOAD VOXLINGUA107 DATASET WITH THIS SCRIPT

import os
import requests
from tqdm import tqdm
from multiprocessing import Pool
from zipfile import ZipFile

# Base URL for VoxLingua107 zip files
base_url = "https://bark.phon.ioc.ee/voxlingua107/"

# Directory to store the downloaded data
data_dir = "data"

# Function to download and extract a language zip file
def download_and_extract(url):
    lang_code = url.split("/")[-1].split(".")[0]
    lang_dir = os.path.join(data_dir, lang_code)

    # Create directory for the language if it doesn't exist
    if not os.path.exists(lang_dir):
        os.makedirs(lang_dir)

    # Path to save the zip file
    zip_path = os.path.join(lang_dir, f"{lang_code}.zip")

    # Download the zip file
    if not os.path.exists(zip_path):
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        with open(zip_path, 'wb') as f:
            for data in tqdm(response.iter_content(1024), total=total_size // 1024, unit='KB'):
                f.write(data)

    # Extract the zip file
    with ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(lang_dir)

    # Remove the zip file after extraction
    os.remove(zip_path)

# Function to download the language data for the specified languages
def download_languages(languages):
    # Get the URLs of all language zip files
    url_list = requests.get(base_url + "zip_urls.txt").text.splitlines()

    # Filter the URLs for the specified languages
    selected_urls = [url for url in url_list if any(lang in url for lang in languages)]

    # Download and extract the selected languages in parallel
    with Pool(4) as p:
        p.map(download_and_extract, selected_urls)

if __name__ == "__main__":
    # Example: list of language codes to download (e.g., "en" for English, "fr" for French)
    languages_to_download = ["en", "fr", "de", "es", "it", "pt", "ru", "tr"]  # Add or modify the languages here

    download_languages(languages_to_download)


In [12]:
# Install libraries
# %pip install tensorflow, numpy, librosa, numpy

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
from tensorflow.keras.models import load_model
import tqdm
import datetime

In [3]:
# Define your folder structure
data_dir = 'data'
classes = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
print('Classes:', classes)

print("Converting all audio files to .wav format...")
# convert all audio files to .wav format
for class_name in classes:
    class_dir = os.path.join(data_dir, class_name)
    for filename in os.listdir(class_dir):
        if filename.endswith('.wav'):
            continue
        file_path = os.path.join(class_dir, filename)
        os.system(f'ffmpeg -i {file_path} {file_path[:-4]}.wav')
        os.remove(file_path)
print("Conversion complete.")

# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(128, 128), max_files_per_class=100):
    print('Loading and preprocessing data...')
    data = []
    labels = []
    
    for i, class_name in enumerate(classes):
        print(f'Processing class {class_name} ({i+1}/{len(classes)})...')
        class_dir = os.path.join(data_dir, class_name)
        # show progress bar
        file_count = 0
        for filename in tqdm.tqdm(os.listdir(class_dir)):
            if filename.endswith('.wav'):
                file_path = os.path.join(class_dir, filename)
                audio_data, sample_rate = librosa.load(file_path, sr=None)
                # Perform preprocessing (e.g., convert to Mel spectrogram and resize)
                mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                data.append(mel_spectrogram)
                labels.append(i)
                file_count += 1
                if file_count >= max_files_per_class:
                    break
    print('Data loading and preprocessing complete.')
    return np.array(data), np.array(labels)

# Split data into training and testing sets
data, labels = load_and_preprocess_data(data_dir, classes, target_shape=(128, 128), max_files_per_class=5000)
labels = to_categorical(labels, num_classes=len(classes))  # Convert labels to one-hot encoding
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
print("Loaded data shape:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("Data loaded and preprocessed.")

Classes: ['de', 'en', 'es', 'fr', 'it', 'pt', 'ru', 'tr']
Converting all audio files to .wav format...
Conversion complete.
Loading and preprocessing data...
Processing class de (1/8)...


 39%|███▉      | 4999/12864 [02:13<03:30, 37.32it/s]


Processing class en (2/8)...


 32%|███▏      | 4999/15860 [03:04<06:40, 27.10it/s]


Processing class es (3/8)...


 39%|███▉      | 4999/12837 [03:33<05:35, 23.36it/s]


Processing class fr (4/8)...


 20%|██        | 4999/24390 [03:42<14:21, 22.50it/s]


Processing class it (5/8)...


 26%|██▋       | 4999/19028 [04:38<13:01, 17.94it/s]  


Processing class pt (6/8)...


 21%|██        | 4999/24162 [05:00<19:13, 16.62it/s]  


Processing class ru (7/8)...


 21%|██        | 4999/23741 [06:21<23:52, 13.09it/s]  


Processing class tr (8/8)...


 22%|██▏       | 4999/22718 [06:34<23:17, 12.68it/s]


Data loading and preprocessing complete.
Loaded data shape: (32000, 128, 128, 1) (8000, 128, 128, 1) (32000, 8) (8000, 8)
Data loaded and preprocessed.


In [39]:
print("Creating a neural network model...")
# Create a neural network model
input_shape = X_train[0].shape
input_layer = Input(shape=input_shape)

# First Conv2D layer with more filters
x = Conv2D(64, (3, 3), activation='relu')(input_layer)
x = BatchNormalization()(x)
x = MaxPooling2D((2, 2))(x)

# Second Conv2D layer
x = Conv2D(128, (3, 3), activation='relu')(x)
x = BatchNormalization()(x)
x = MaxPooling2D((2, 2))(x)

# Third Conv2D layer
x = Conv2D(256, (3, 3), activation='relu')(x)
x = BatchNormalization()(x)
x = MaxPooling2D((2, 2))(x)

# Global Average Pooling instead of Flatten
x = GlobalAveragePooling2D()(x)

# Dense layers
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

# Output layer
output_layer = Dense(len(classes), activation='softmax')(x)

# Model
model = Model(input_layer, output_layer)
print(model.summary())


Creating a neural network model...


None


In [40]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [4]:
class EarlyStoppingByAccuracyDiff(tf.keras.callbacks.Callback):
    def __init__(self, threshold=0.2):
        super(EarlyStoppingByAccuracyDiff, self).__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        train_acc = logs.get('accuracy')
        val_acc = logs.get('val_accuracy')

        if train_acc is not None and val_acc is not None:
            if (train_acc - val_acc) > self.threshold:
                print(f"\nStopping training: train_acc={train_acc}, val_acc={val_acc}, diff={train_acc - val_acc}")
                self.model.stop_training = True

In [5]:
# Define the checkpoint callback to save the latest model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='latest_model.keras',  # Use the .keras extension
    save_best_only=True,  # Save the best model only at the end of each epoch
    save_weights_only=False,  # Save the entire model
    monitor='val_accuracy',
    verbose=1
)

In [6]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [7]:
model = load_model('latest_model.keras')

In [8]:
early_stop_callback = EarlyStoppingByAccuracyDiff(threshold=0.2)

# Train the model
model.fit(X_train, y_train, epochs=80, batch_size=32, validation_data=(X_test, y_test), callbacks=[tensorboard_callback, checkpoint_callback, early_stop_callback])

Epoch 1/80
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8134 - loss: 0.5653
Epoch 1: val_accuracy improved from -inf to 0.65325, saving model to latest_model.keras
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1406s[0m 1s/step - accuracy: 0.8134 - loss: 0.5653 - val_accuracy: 0.6532 - val_loss: 1.1747
Epoch 2/80
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8217 - loss: 0.5430
Epoch 2: val_accuracy improved from 0.65325 to 0.65812, saving model to latest_model.keras
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1378s[0m 1s/step - accuracy: 0.8217 - loss: 0.5430 - val_accuracy: 0.6581 - val_loss: 1.1216
Epoch 3/80
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8309 - loss: 0.5155
Epoch 3: val_accuracy did not improve from 0.65812
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1334s[0m 1s/step - accuracy: 0.8309

<keras.src.callbacks.history.History at 0x1cafd608ef0>

In [13]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 8172), started 1 day, 11:57:12 ago. (Use '!kill 8172' to kill it.)

In [10]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.6330000162124634


In [11]:
# Save the model
model.save('audio_classification_model.keras')

# Model prediction on audio

In [20]:
# Load the saved model
model = load_model('audio_classification_model.keras')

target_shape = (128, 128)

# Function to preprocess and classify an audio file
def test_audio(file_path, model):
    # Load and preprocess the audio file
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
    mel_spectrogram = tf.reshape(mel_spectrogram, (1,) + target_shape + (1,))
    
    # Make predictions
    predictions = model.predict(mel_spectrogram)
    
    # Get the class probabilities
    class_probabilities = predictions[0]
    
    # Get the predicted class index
    predicted_class_index = np.argmax(class_probabilities)
    
    return class_probabilities, predicted_class_index

# Test an audio file
test_audio_file = "data/es/yqUcDCVhytk__U__S11---0082.030-0097.400.wav"
class_probabilities, predicted_class_index = test_audio(test_audio_file, model)

# Display results for all classes
for i, class_label in enumerate(classes):
    probability = class_probabilities[i]
    print(f'Class: {class_label}, Probability: {probability:.4f}')

# Calculate and display the predicted class and accuracy
predicted_class = classes[predicted_class_index]
accuracy = class_probabilities[predicted_class_index]
print(f'The audio is classified as: {predicted_class}')
print(f'Accuracy: {accuracy:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
Class: de, Probability: 0.0022
Class: en, Probability: 0.1293
Class: es, Probability: 0.6672
Class: fr, Probability: 0.0000
Class: it, Probability: 0.0095
Class: pt, Probability: 0.1903
Class: ru, Probability: 0.0000
Class: tr, Probability: 0.0015
The audio is classified as: es
Accuracy: 0.6672
