<a href="https://colab.research.google.com/github/Amrutha369/voice-spoofing-detection/blob/main/voice_spoofing_detection_using_cnn_Multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [3]:

!pip install tensorflow-io
import os
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt #plotting library
%matplotlib inline
import numpy as np #numerical computing
import tensorflow as tf
import tensorflow_io as tfio
import seaborn as sns # statistical data visualization
from IPython.display import Audio
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

Collecting tensorflow-io
  Downloading tensorflow_io-0.36.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.36.0


In [4]:
# Audio params
SAMPLE_RATE = 16000
DURATION = 20.0 # duration in second
AUDIO_LEN = int(SAMPLE_RATE * DURATION)

# Spectrogram params
N_MELS = 128 # freq axis
N_FFT = 2048
SPEC_WIDTH = 256 #
HOP_LEN = 512 # non-overlap region
FMAX = SAMPLE_RATE//2 # max frequency

# CNN params
NUM_CLASSES = 3 # bonafide or spoof
BATCH_SIZE = 16 # The number of samples processed in each training batch
EPOCHS = 100 # the number of times the entire dataset is passed forward and backward through the neural network during training.
LEARNING_RATE = 0.0001 # adjust based on your model performance


In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
# Directory and protocol definitions
base_path = '/content/drive/MyDrive/dataset/LA'
protocol_dir = os.path.join(base_path, 'ASVspoof2019_LA_cm_protocols')
train_dir = os.path.join(base_path, 'ASVspoof2019_LA_train', 'flac')
dev_dir = os.path.join(base_path, 'ASVspoof2019_LA_dev', 'flac')
eval_dir = os.path.join(base_path, 'ASVspoof2019_LA_eval', 'flac')

In [7]:
# Function to form the full path of a file
def get_file_path(directory, filename):
    return os.path.join(directory, f'{filename}.flac')

In [8]:
# Function to read the dataset
def read_dataset(protocol_path, directory):
    """Reads the dataset from a protocol file and returns a DataFrame."""
    df = pd.read_csv(protocol_path, sep=' ', header=None, names=['speaker_id', 'filename', 'system_id', 'null', 'class_name'])
    df['filepath'] = df['filename'].apply(lambda x: get_file_path(directory, x))
    df.drop('null', axis=1, inplace=True)
    df.dropna(inplace=True)
    return df

# Function to convert class_name to integer
def label_to_int(class_name):
    if class_name == 'real':
        return 0
    elif class_name == 'synthesised':
        return 1
    elif class_name == 'converted':
        return 2
    else:
        return -1  # Handle unknown classes

# Add target column and subset to DataFrame
def add_columns(df, subset):
    df['target'] = df['class_name'].apply(label_to_int)
    df['subset'] = subset
    return df

# Take samples from each DataFrame
def sample_data_multiclass(df, n_synthesised, n_bonafide, n_converted):
    synthesised = df[df['class_name'] == 'synthesised'].head(n_synthesised)
    bonafide = df[df['class_name'] == 'real'].head(n_bonafide)
    converted = df[df['class_name'] == 'converted'].head(n_converted)
    return pd.concat([synthesised, bonafide, converted])

# Directory and protocol definitions
base_path = '/content/drive/MyDrive/dataset/LA'
protocol_dir = os.path.join(base_path, 'ASVspoof2019_LA_cm_protocols')
train_dir = os.path.join(base_path, 'ASVspoof2019_LA_train', 'flac')
dev_dir = os.path.join(base_path, 'ASVspoof2019_LA_dev', 'flac')
eval_dir = os.path.join(base_path, 'ASVspoof2019_LA_eval', 'flac')

# Create DataFrames for each dataset
train_df = read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.train.trn.txt'), train_dir)
dev_df = read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.dev.trl.txt'), dev_dir)
eval_df = read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.eval.trl.txt'), eval_dir)

# Add columns to each DataFrame
train_df = add_columns(train_df, 'train')
dev_df = add_columns(dev_df, 'dev')
eval_df = add_columns(eval_df, 'eval')

# Take samples from each DataFrame by a different number
train_df = sample_data_multiclass(train_df, 525, 70, 70)
dev_df = sample_data_multiclass(dev_df, 150, 15, 15)
eval_df = sample_data_multiclass(eval_df, 150, 15, 15)

In [9]:
#Combine three dataframes into one dataframe
data_df = pd.concat([train_df, dev_df, eval_df], ignore_index=True)

In [10]:
data_df.head(len(data_df))

Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target,subset
0,LA_0098,LA_T_1000648,-,synthesised,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,1,train
1,LA_0090,LA_T_1001169,-,synthesised,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,1,train
2,LA_0098,LA_T_1001718,-,synthesised,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,1,train
3,LA_0085,LA_T_1002656,-,synthesised,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,1,train
4,LA_0085,LA_T_1004407,-,synthesised,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,1,train
...,...,...,...,...,...,...,...
444,LA_0098,LA_T_8794062,-,converted,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,2,eval
445,LA_0098,LA_T_8806933,-,converted,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,2,eval
446,LA_0098,LA_T_8827497,-,converted,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,2,eval
447,LA_0098,LA_T_8858210,-,converted,/content/drive/MyDrive/dataset/LA/ASVspoof2019...,2,eval


In [11]:
import librosa

def audio_to_spectrogram(filepath):
    # Load audio file using librosa
    audio, sample_rate = librosa.load(filepath)

    # Normalize the audio
    audio_norm = (audio - np.min(audio)) / (np.max(audio) - np.min(audio))
    # A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time.
    # Compute the spectrogram using librosa
    spectrogram = librosa.feature.melspectrogram(y=audio_norm, sr=sample_rate)

    # Convert the spectrogram to decibels
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

    # Add channel dimension
    spectrogram_db = np.expand_dims(spectrogram_db, axis=-1)

    # Resize spectrogram to the specified size
    image = tf.image.resize(spectrogram_db, [SPEC_WIDTH, N_MELS])

    return image

# Adding the spectrogram column to the DataFrame
data_df['spectrogram'] = data_df['filepath'].apply(audio_to_spectrogram)


In [12]:
# Splitting the data based on the 'subset' column
train_data = data_df[data_df['subset'] == 'train']
dev_data = data_df[data_df['subset'] == 'dev']
eval_data = data_df[data_df['subset'] == 'eval']

# Converting the list of spectrograms into NumPy arrays
X_train = np.stack(train_data['spectrogram'].to_list())
X_dev = np.stack(dev_data['spectrogram'].to_list())
X_eval = np.stack(eval_data['spectrogram'].to_list())

# Converting target into NumPy arrays
y_train = train_data['target'].to_numpy()
y_dev = dev_data['target'].to_numpy()
y_eval = eval_data['target'].to_numpy()

In [13]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")#Contains target labels
print(f"X_dev shape: {X_dev.shape}")
print(f"y_dev shape: {y_dev.shape}")
print(f"X_eval shape: {X_eval.shape}")
print(f"y_eval shape: {y_eval.shape}")

X_train shape: (315, 256, 128, 1)
y_train shape: (315,)
X_dev shape: (67, 256, 128, 1)
y_dev shape: (67,)
X_eval shape: (67, 256, 128, 1)
y_eval shape: (67,)


In [17]:
from tensorflow.keras.optimizers import Adam

def cnn_model(input_shape, num_classes):
    model = Sequential()
    # 1st conv layer
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(BatchNormalization())
    # 2nd conv layer
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    model.add(BatchNormalization())
    # 3rd conv layer
    model.add(Conv2D(32, (2, 2), activation='relu'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    model.add(BatchNormalization())
    # 4th conv layer
    model.add(Conv2D(64, (2, 2), activation='relu'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # Additional dense layer
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    # Output layer for multiclass classification
    model.add(Dense(num_classes, activation='softmax'))
    # Compile the model with Adam optimizer
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Convert target labels to one-hot encoding for multiclass classification
from tensorflow.keras.utils import to_categorical

# Convert target labels to one-hot encoding
y_train_categorical = to_categorical(y_train, num_classes=NUM_CLASSES)
y_dev_categorical = to_categorical(y_dev, num_classes=NUM_CLASSES)
y_eval_categorical = to_categorical(y_eval, num_classes=NUM_CLASSES)

# Create the multiclass CNN model with Adam optimizer
multiclass_cnn_model = create_multiclass_cnn_model((X_train.shape[1], X_train.shape[2], 1), NUM_CLASSES)

# Train the multiclass CNN model
multiclass_cnn_model.fit(X_train, y_train_categorical, epochs=50, batch_size=16)

# Compute accuracy on training data
train_accuracy = multiclass_cnn_model.evaluate(X_train, y_train_categorical)[1]
print(f"Accuracy on training data: {train_accuracy * 100:.2f}%")

# Compute accuracy on validation data
val_accuracy = multiclass_cnn_model.evaluate(X_dev, y_dev_categorical)[1]
print(f'Accuracy on validation data: {val_accuracy * 100:.2f}%')

# Compute accuracy on test data
test_accuracy = multiclass_cnn_model.evaluate(X_eval, y_eval_categorical)[1]
print(f'Accuracy on test data: {test_accuracy * 100:.2f}%')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy on training data: 100.00%
Accuracy on validation data: 97.01%
Accuracy on test data: 97.01%


In [23]:
# Save the model
multiclass_cnn_model.save('/content/drive/MyDrive/save/voice-spoofing-detection-using-cnn-Multiclass.h5')

  saving_api.save_model(


In [26]:
import os
from tensorflow.keras.models import load_model
sample_audio_file_path = "/content/drive/MyDrive/dataset/LA/ASVspoof2019_LA_eval/flac/LA_T_6904517.flac"

print("Sample audio file:", sample_audio_file_path)
model_path = "/content/drive/MyDrive/save/voice-spoofing-detection-using-cnn-Multiclass.h5"  # Replace with actual path
multiclass_cnn_model = load_model(model_path)
# Creating spectrogram for the sample audio file
sample_audio_spectrogram = audio_to_spectrogram(sample_audio_file_path)
X_new = np.expand_dims(sample_audio_spectrogram, axis=0)

# Performing prediction
y_pred = multiclass_cnn_model.predict(X_new)

# Convert probabilities to classes by selecting the class with the highest probability
y_pred_classes = np.argmax(y_pred, axis=1)

# Map predicted classes to labels
class_labels = ['real', 'synthesised', 'converted']  # Update with your actual class labels
y_pred_labels = [class_labels[pred] for pred in y_pred_classes]
print("Predicted class:", y_pred_labels[0])

Sample audio file: /content/drive/MyDrive/dataset/LA/ASVspoof2019_LA_eval/flac/LA_T_6904517.flac
Predicted class: real
