### Import Libraries

In [1]:
import pandas as pd
import scipy.fftpack
from scipy.fftpack import dct
import librosa
import matplotlib.pyplot as plt
import numpy as np
import os

### File Paths

In [2]:
# Locations of the protocol (trial list) file and of the folder with the recordings.
protocol_eval_file = '../Dataset/ASVSpoof/actual/protocol_V2/ASVspoof2017_V2_dev.trl.txt'
folder_eval_file = '../Dataset/ASVSpoof/actual/ASVspoof2017_V2_dev'

# The protocol is parsed as a single-space-delimited table.
df_eval = pd.read_csv(filepath_or_buffer=protocol_eval_file, sep=' ')

### File Check

In [3]:
# Peek at the first five protocol rows to confirm the file parsed as expected.
print(df_eval.head(n=5))

         file_id     type speaker_id phrase_id environment_id playback_id  \
0  D_1000001.wav  genuine      M0011       S06              -           -   
1  D_1000002.wav  genuine      M0011       S08              -           -   
2  D_1000003.wav  genuine      M0011       S04              -           -   
3  D_1000004.wav  genuine      M0011       S07              -           -   
4  D_1000005.wav  genuine      M0011       S10              -           -   

  recording_id  
0            -  
1            -  
2            -  
3            -  
4            -  


In [4]:
# Order the protocol by speaker so each speaker's rows sit together.
# NOTE(review): the default sort kind is not stable, so rows of the same speaker
# are not guaranteed to keep their original file order.
df_sorted = df_eval.sort_values('speaker_id')

In [5]:
# First five rows after sorting by speaker.
print(df_sorted.head(n=5))

            file_id     type speaker_id phrase_id environment_id playback_id  \
0     D_1000001.wav  genuine      M0011       S06              -           -   
1271  D_1001272.wav    spoof      M0011       S06            E06         P09   
1281  D_1001282.wav    spoof      M0011       S02            E06         P09   
1283  D_1001284.wav    spoof      M0011       S03            E06         P09   
1292  D_1001293.wav    spoof      M0011       S02            E06         P09   

     recording_id  
0               -  
1271          R07  
1281          R07  
1283          R07  
1292          R07  


In [6]:
# Number of distinct speakers in the protocol (missing values are not counted).
unique_names_count = df_sorted['speaker_id'].nunique(dropna=True)
print(f"Unique count of names: {unique_names_count}")

Unique count of names: 8


In [7]:
# Encode the class column numerically: genuine -> 1, spoof -> 0.
# Any other value in 'type' maps to NaN.
eval_labels = df_sorted['type'].map(
    {
        'genuine': 1,
        'spoof': 0,
    }
)

In [8]:
# Re-display the first five sorted rows (the frame itself is unchanged by the label encoding).
print(df_sorted.head(n=5))

            file_id     type speaker_id phrase_id environment_id playback_id  \
0     D_1000001.wav  genuine      M0011       S06              -           -   
1271  D_1001272.wav    spoof      M0011       S06            E06         P09   
1281  D_1001282.wav    spoof      M0011       S02            E06         P09   
1283  D_1001284.wav    spoof      M0011       S03            E06         P09   
1292  D_1001293.wav    spoof      M0011       S02            E06         P09   

     recording_id  
0               -  
1271          R07  
1281          R07  
1283          R07  
1292          R07  


In [9]:
# Step 2 (disabled): an earlier approach built 30 identity labels and randomly selected 24 of them.
# identities = [f'{i}' for i in range(1, 31)]  # Example IDs: '1', '2', ..., '30'
# selected_identities = np.random.choice(identities, 24, replace=False)

In [10]:
# print(selected_identities)

In [11]:
# Identity numbers to hand out; they are paired positionally with the speaker
# IDs when the mapping is built.
selected_identities = [
    1,
    2,
    11,
    17,
]

In [12]:
# Step 3: Map speaker IDs onto the selected identities.
# Pairing is positional: the i-th speaker ID (in order of first appearance in
# df_sorted) receives the i-th entry of selected_identities. When the protocol
# holds more speakers than there are identities, the remaining speakers are
# intentionally left out of the mapping (the slice below makes that explicit
# instead of relying on zip() silently stopping at the shorter input).
unique_speaker_ids = df_sorted['speaker_id'].unique()
identity_mapping = dict(
    zip(unique_speaker_ids[:len(selected_identities)], selected_identities)
)

In [13]:
# Show which original speaker code was assigned to which identity number.
print(identity_mapping)

{'M0011': 1, 'M0012': 2, 'M0013': 11, 'M0014': 17}


In [14]:
# Swap the original speaker codes for their assigned identity numbers.
# Speakers without an entry in identity_mapping keep their original code.
remapped_speakers = df_sorted['speaker_id'].replace(to_replace=identity_mapping)
df_sorted['speaker_id'] = remapped_speakers

In [15]:
# Inspect the frame after remapping; unmapped speakers still show their original code.
print(df_sorted)

            file_id     type speaker_id phrase_id environment_id playback_id  \
0     D_1000001.wav  genuine          1       S06              -           -   
1271  D_1001272.wav    spoof          1       S06            E06         P09   
1281  D_1001282.wav    spoof          1       S02            E06         P09   
1283  D_1001284.wav    spoof          1       S03            E06         P09   
1292  D_1001293.wav    spoof          1       S02            E06         P09   
...             ...      ...        ...       ...            ...         ...   
1496  D_1001497.wav    spoof      M0018       S08            E04         P06   
1495  D_1001496.wav    spoof      M0018       S01            E04         P06   
1494  D_1001495.wav    spoof      M0018       S05            E04         P06   
659   D_1000660.wav  genuine      M0018       S04              -           -   
690   D_1000691.wav  genuine      M0018       S01              -           -   

     recording_id  
0               -  

In [16]:
# Per-speaker sample sizes: how many genuine and how many spoof recordings to draw.
num_genuine, num_spoof = 4, 8

In [17]:
# Empty placeholder for the per-speaker selection built in the next cell.
selected_data = pd.DataFrame(data=None)

In [18]:
# Build the per-speaker subset: for every speaker in df_sorted draw up to
# num_genuine genuine and up to num_spoof spoof recordings.
# The pieces are collected in a list and concatenated once at the end, which
# avoids re-copying the growing frame on every loop iteration.
sampled_parts = []
for speaker in df_sorted['speaker_id'].unique():
    # Rows belonging to the current speaker
    speaker_data = df_sorted[df_sorted['speaker_id'] == speaker]

    # Split by class once so the pool size does not need a second filter pass
    genuine_pool = speaker_data[speaker_data['type'] == 'genuine']
    spoof_pool = speaker_data[speaker_data['type'] == 'spoof']

    # Cap the request at the pool size so speakers with few recordings do not
    # raise; the fixed random_state keeps the draw reproducible.
    genuine_samples = genuine_pool.sample(n=min(num_genuine, len(genuine_pool)), random_state=1)
    spoof_samples = spoof_pool.sample(n=min(num_spoof, len(spoof_pool)), random_state=1)

    # Keep the original ordering: genuine rows first, then spoof rows
    sampled_parts.extend([genuine_samples, spoof_samples])

# pd.concat rejects an empty list, so fall back to an empty frame with the
# protocol's columns when there was nothing to sample.
if sampled_parts:
    selected_data = pd.concat(sampled_parts)
else:
    selected_data = pd.DataFrame(columns=df_sorted.columns)

# Display the resulting DataFrame (the counts in the message follow the config values)
print(f"\nSelected DataFrame with {num_genuine} genuines and {num_spoof} spoofs per speaker:")
print(selected_data)


Selected DataFrame with 4 genuines and 8 spoofs per speaker:
            file_id     type speaker_id phrase_id environment_id playback_id  \
21    D_1000022.wav  genuine          1       S02              -           -   
32    D_1000033.wav  genuine          1       S03              -           -   
61    D_1000062.wav  genuine          1       S03              -           -   
114   D_1000115.wav  genuine          1       S09              -           -   
1554  D_1001555.wav    spoof          1       S08            E05         P01   
...             ...      ...        ...       ...            ...         ...   
1679  D_1001680.wav    spoof      M0018       S06            E03         P08   
1508  D_1001509.wav    spoof      M0018       S09            E04         P06   
1204  D_1001205.wav    spoof      M0018       S06            E06         P09   
1324  D_1001325.wav    spoof      M0018       S02            E06         P09   
807   D_1000808.wav    spoof      M0018       S07         

In [19]:
# Speakers whose sample counts should be verified
attributes_to_check = selected_identities

# Keep only the rows of those speakers, then tally the rows per speaker_id
in_scope = selected_data['speaker_id'].isin(attributes_to_check)
count_results = selected_data.loc[in_scope].groupby('speaker_id').size()

# Show the per-speaker counts
print(count_results)

speaker_id
1     12
2     12
11    12
17    12
dtype: int64


In [26]:
# Speakers to keep in the final selection. Derived from selected_identities
# (the identities assigned in the mapping step) rather than repeating the
# literal list, so the filter cannot drift out of sync with the mapping.
criteria = list(selected_identities)

In [27]:
# Drop every row whose speaker is not one of the chosen identities.
keep_rows = selected_data['speaker_id'].isin(criteria)
selected_data = selected_data.loc[keep_rows]

In [29]:
# Sanity check: (rows, columns) of the filtered selection.
print(selected_data.shape)

(48, 7)


In [28]:
# Persist the selected protocol rows. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
os.makedirs('./multimodal-data', exist_ok=True)
selected_data.to_csv('./multimodal-data/new_protocol_val.csv', index=False)  # index=False to exclude the index column

In [30]:
# Read the saved protocol back from disk to confirm it round-trips.
test = pd.read_csv(filepath_or_buffer='./multimodal-data/new_protocol_val.csv')

In [31]:
# Display the protocol as reloaded from the CSV file.
print(test)

          file_id     type  speaker_id phrase_id environment_id playback_id  \
0   D_1000022.wav  genuine           1       S02              -           -   
1   D_1000033.wav  genuine           1       S03              -           -   
2   D_1000062.wav  genuine           1       S03              -           -   
3   D_1000115.wav  genuine           1       S09              -           -   
4   D_1001555.wav    spoof           1       S08            E05         P01   
5   D_1001174.wav    spoof           1       S02            E06         P09   
6   D_1001250.wav    spoof           1       S10            E06         P09   
7   D_1000948.wav    spoof           1       S02            E16         P07   
8   D_1001682.wav    spoof           1       S03            E03         P08   
9   D_1001635.wav    spoof           1       S01            E03         P08   
10  D_1000904.wav    spoof           1       S03            E16         P07   
11  D_1001294.wav    spoof           1       S09    

In [32]:
import os
import shutil

In [35]:
# Folder the listed recordings are copied from, and the folder they are copied
# into. Both paths are relative to the notebook's working directory.
source_folder = '../Dataset/ASVSpoof/actual/ASVspoof2017_V2_dev'  # Change to your source folder
destination_folder = '../Dataset/Mix/Validation/Voice'  # Change to your destination folder

In [36]:
# Copy every recording listed in the reloaded protocol into destination_folder.
# shutil.copy treats a destination that is not an existing directory as a file
# name, so without the folder every copy would overwrite one and the same file
# (or fail if the parent is missing). Create the folder up front.
os.makedirs(destination_folder, exist_ok=True)

# Iterate over the file_id column and copy files
for file_id in test['file_id']:
    source_file_path = os.path.join(source_folder, file_id)
    if os.path.exists(source_file_path):
        shutil.copy(source_file_path, destination_folder)
        print(f"Copied: {file_id}")
    else:
        # Report the missing recording and keep going with the rest
        print(f"File not found: {file_id}")

Copied: D_1000022.wav
Copied: D_1000033.wav
Copied: D_1000062.wav
Copied: D_1000115.wav
Copied: D_1001555.wav
Copied: D_1001174.wav
Copied: D_1001250.wav
Copied: D_1000948.wav
Copied: D_1001682.wav
Copied: D_1001635.wav
Copied: D_1000904.wav
Copied: D_1001294.wav
Copied: D_1000162.wav
Copied: D_1000147.wav
Copied: D_1000152.wav
Copied: D_1000150.wav
Copied: D_1000854.wav
Copied: D_1001045.wav
Copied: D_1000817.wav
Copied: D_1001329.wav
Copied: D_1001140.wav
Copied: D_1001102.wav
Copied: D_1001236.wav
Copied: D_1001710.wav
Copied: D_1000197.wav
Copied: D_1000219.wav
Copied: D_1000200.wav
Copied: D_1000194.wav
Copied: D_1001641.wav
Copied: D_1001363.wav
Copied: D_1001078.wav
Copied: D_1001648.wav
Copied: D_1001409.wav
Copied: D_1001599.wav
Copied: D_1001451.wav
Copied: D_1000762.wav
Copied: D_1000318.wav
Copied: D_1000316.wav
Copied: D_1000326.wav
Copied: D_1000309.wav
Copied: D_1001607.wav
Copied: D_1001297.wav
Copied: D_1001677.wav
Copied: D_1001107.wav
Copied: D_1000822.wav
Copied: D_