In [1]:
# Data Loading 
import numpy as np
import pandas as pd
from tqdm import tqdm

base_path = './MLEnd/deception/MLEndDD_stories_small/'
MLEND_df = pd.read_csv('./MLEnd/deception/MLEndDD_story_attributes_small.csv').set_index('filename')

files = [base_path + file for file in MLEND_df.index]

print(f"We have {len(files)} audio files in the dataset.")
display(MLEND_df.head())

#Langauge and Data Distribution
language_counts = MLEND_df['Language'].value_counts()
language_df = pd.DataFrame(language_counts).transpose()
language_df['Sum'] = language_counts.sum()

story_type_counts = MLEND_df['Story_type'].value_counts()

print("Languages narrated in the dataset are:")
display(language_df)
print(story_type_counts)

We have 100 audio files in the dataset.


Unnamed: 0_level_0,Language,Story_type
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
00001.wav,Hindi,deceptive_story
00002.wav,English,true_story
00003.wav,English,deceptive_story
00004.wav,Bengali,deceptive_story
00005.wav,English,deceptive_story


Languages narrated in the dataset are:


Language,English,Hindi,Arabic,"Chinese, Mandarin",Marathi,Bengali,Kannada,French,Russian,Portuguese,Spanish,Swahilli,Telugu,Korean,Cantonese,Italian,Sum
count,78,4,3,2,2,1,1,1,1,1,1,1,1,1,1,1,100


Story_type
deceptive_story    50
true_story         50
Name: count, dtype: int64


In [None]:
import librosa

def split_audio(file_id, file_path, label, chunk_duration=30, sr=None):
    """
    Splits an audio file into fixed-length chunks, analyzes valid and non-valid chunks,
    and calculates data loss during splitting.

    Args:
        file_id (str): The name of the audio file (e.g., '00001.wav').
        file_path (str): Path to the audio file.
        label (str): The label for the audio file (e.g., 'true_story' or 'deceptive_story').
        chunk_duration (int): Duration of each chunk in seconds (default is 30).
        sr (int or None): Sampling rate. If None, the original rate is used.

    Returns:
        dict: Metadata about the file, including total duration, valid and non-valid chunk counts, and data loss.
        list: Information about each valid chunk, including its ID and label.
    """
    audio_data, sample_rate = librosa.load(file_path, sr=sr)  # sr=None uses original sample rate
    
    total_duration = len(audio_data) / sample_rate #in seconds
    chunk_size = int(chunk_duration * sample_rate)

    # Split the audio into chunks of `chunk_size`
    chunks = [audio_data[i:i + chunk_size] for i in range(0, len(audio_data), chunk_size)]
    
    valid_chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]
    non_valid_chunks = [chunk for chunk in chunks if len(chunk) < chunk_size]

    metadata = {
        "File ID": file_id,
        "Duration (s)": total_duration,
        "Sample Rate": sample_rate,
        "Total Chunks": len(chunks),
        "Valid Chunks (30s)": len(valid_chunks),
        "Non-Valid Chunks (<30s)": len(non_valid_chunks),
        "Label": label
    }

    chunk_info = [
        {"File ID": file_id, "Chunk ID": f"{file_id}_chunk{i + 1}", "Chunk Data": chunk, "Label": label, "Sample Rate": sample_rate}
        for i, chunk in enumerate(valid_chunks)
    ]

    return metadata, chunk_info


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Split audio into 30s
file_metadata = []
audio_chunks = []

for file_id in tqdm(MLEND_df.index):  
    file_path = base_path + file_id   
    label = MLEND_df.loc[file_id, 'Story_type'] 

    metadata, chunks = split_audio(file_id, file_path, label)
    file_metadata.append(metadata) 
    audio_chunks.extend(chunks)     

# Metadata DF
metadata_df = pd.DataFrame(file_metadata)
print("Summary of Audio Files:")
display(metadata_df.head())

# Chunk Info DF
chunks_df = pd.DataFrame(audio_chunks)
print("Summary of Valid Audio Chunks:")
display(chunks_df[["File ID", "Chunk ID", "Label"]].head())


# Summary statistics
print("\nSummary Statistics:")
print(f"Total Files Processed: {len(metadata_df)}")
print(f"Total Chunks Created: {metadata_df['Total Chunks'].sum()}")
print(f"Total Valid Chunks(30s): {metadata_df['Valid Chunks (30s)'].sum()}")
print(f"Total Non-Valid Chunks (<30s): {metadata_df['Non-Valid Chunks (<30s)'].sum()}")

# True and Deceptive Distribution after splitting
valid_chunk_labels = chunks_df['Label'].value_counts()
print("\nCount of True and Deceptive Stories from Valid Chunks (30s):")
for label, count in valid_chunk_labels.items():
    print(f"{label}: {count} chunks")

100%|██████████| 100/100 [00:05<00:00, 19.76it/s]

Summary of Audio Files:





Unnamed: 0,File ID,Duration (s),Sample Rate,Total Chunks,Valid Chunks (30s),Non-Valid Chunks (<30s),Label
0,00001.wav,122.167256,44100,5,4,1,deceptive_story
1,00002.wav,125.192018,44100,5,4,1,true_story
2,00003.wav,162.984127,44100,6,5,1,deceptive_story
3,00004.wav,121.68127,44100,5,4,1,deceptive_story
4,00005.wav,134.189751,44100,5,4,1,deceptive_story


Summary of Valid Audio Chunks:


Unnamed: 0,File ID,Chunk ID,Label
0,00001.wav,00001.wav_chunk1,deceptive_story
1,00001.wav,00001.wav_chunk2,deceptive_story
2,00001.wav,00001.wav_chunk3,deceptive_story
3,00001.wav,00001.wav_chunk4,deceptive_story
4,00002.wav,00002.wav_chunk1,true_story



Summary Statistics:
Total Files Processed: 100
Total Chunks Created: 520
Total Valid Chunks(30s): 420
Total Non-Valid Chunks (<30s): 100

Count of True and Deceptive Stories from Valid Chunks (30s):
true_story: 219 chunks
deceptive_story: 201 chunks


Dataset

A total of 100 audio recordings - 50 true and 50 deceptive 
16 languages - English is the most dominant, followed by Hindi, Arabic, Chinese (Mandarin), Marathi and other languages as shown in the code above. 

After splitted the audio into 30s chunks and discard those less than 30s chunks to remain consistency. As a result,we have remaning of 420 valid 30s chunks, and 219 of them are true, 201 are false. This results in the following proportions:

True Stories: 
219/420 ≈52.14%

Deceptive Stories: 
201/420 ≈47.86%

Hence, this dataset is generally considered balanced as both classes (true and deceptive) are roughly equal.

### Feature extraction

In [None]:
def extract_features_and_labels(audio_chunks, scale_audio=True):
    """
    Extract MFCC, Pitch, Energy, and ZCR features from audio chunks and associate labels.

    Args:
        audio_chunks (list): List of dictionaries with keys ['Chunk ID', 'Chunk Data', 'Label', 'Sample Rate'].
        scale_audio (bool): Whether to scale audio amplitude.

    Returns:
        dict: Feature matrices for MFCC, Pitch, Energy, ZCR.
        np.ndarray: Labels for each chunk.
    """
    mfcc_features, pitch_features, energy_features, zcr_features, labels = [], [], [], [], []

    for chunk in tqdm(audio_chunks):
        audio_data, label, sr = chunk["Chunk Data"], chunk["Label"], chunk["Sample Rate"]

        if scale_audio:
            audio_data = audio_data / np.max(np.abs(audio_data))

        # Feature 1: MFCC
        mfcc = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13).mean(axis=1)
        mfcc_features.append(mfcc)

        # Feature 2: Pitch
        pitch, _, _ = librosa.pyin(audio_data, fmin=80, fmax=450, sr=sr)
        pitch_mean = np.nanmean(pitch) if np.any(~np.isnan(pitch)) else 0
        pitch_std = np.nanstd(pitch) if np.any(~np.isnan(pitch)) else 0
        pitch_features.append([pitch_mean, pitch_std])

        # Feature 3: Energy
        rms = np.mean(librosa.feature.rms(y=audio_data))
        energy_features.append([rms])

        # Feature 4: Zero-Crossing Rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio_data))
        zcr_features.append([zcr])

        labels.append(1 if label == 'deceptive_story' else 0)

    return {
        'MFCC': np.array(mfcc_features),
        'Pitch': np.array(pitch_features),
        'Energy': np.array(energy_features),
        'ZCR': np.array(zcr_features)
    }, np.array(labels)


In [None]:
# Extract features and labels
features, y = extract_features_and_labels(audio_chunks, scale_audio=True)
#14.42s


100%|██████████| 420/420 [14:42<00:00,  2.10s/it]

MFCC Feature Matrix Shape: (420, 13)
Pitch Feature Matrix Shape: (420, 2)
Energy Feature Matrix Shape: (420, 1)
ZCR Feature Matrix Shape: (420, 1)
Label Vector Shape: (420,)





In [16]:
import numpy as np

# Unpack individual feature matrices
X_mfcc = features['MFCC']
X_pitch = features['Pitch']
X_energy = features['Energy']
X_zcr = features['ZCR']
X_all_features = np.hstack((features['MFCC'], features['Pitch'], features['Energy'], features['ZCR']))

print(f"MFCC Feature Matrix Shape: {X_mfcc.shape}")
print(f"Pitch Feature Matrix Shape: {X_pitch.shape}")
print(f"Energy Feature Matrix Shape: {X_energy.shape}")
print(f"ZCR Feature Matrix Shape: {X_zcr.shape}\n")
print(f"Combined Feature Matrix Shape: {X_all_features.shape}")
print(f"Label Vector Shape: {y.shape}")

MFCC Feature Matrix Shape: (420, 13)
Pitch Feature Matrix Shape: (420, 2)
Energy Feature Matrix Shape: (420, 1)
ZCR Feature Matrix Shape: (420, 1)

Combined Feature Matrix Shape: (420, 17)
Label Vector Shape: (420,)
