In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from tqdm import tqdm

# Path to the base directory
base_path = './MLEnd/deception/MLEndDD_stories_small/'

# Load the CSV file and ensure labels are accessible
MLEND_df = pd.read_csv('./MLEnd/deception/MLEndDD_story_attributes_small.csv').set_index('filename')

# Create a list of full file paths using the CSV index
files = [base_path + file for file in MLEND_df.index]

# Check the number of files and preview the dataset
print(f"We have {len(files)} audio files in the dataset.")
print("Preview of the dataset:")
display(MLEND_df.head())


We have 100 audio files in the dataset.
Preview of the dataset:


Unnamed: 0_level_0,Language,Story_type
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
00001.wav,Hindi,deceptive_story
00002.wav,English,true_story
00003.wav,English,deceptive_story
00004.wav,Bengali,deceptive_story
00005.wav,English,deceptive_story


In [2]:
import librosa
import pandas as pd
from tqdm import tqdm

def split_audio_into_chunks(file_id, file_path, label, chunk_duration=30, sr=None):
    """
    Splits an audio file into 30-second chunks and discards chunks shorter than 30 seconds.

    Args:
        file_id (str): The file ID (original file name).
        file_path (str): Path to the audio file.
        label (str): Label for the audio file (e.g., 'true_story' or 'deceptive_story').
        chunk_duration (int): Duration of each chunk in seconds (default: 30).
        sr (int or None): Sampling rate. If None, uses the original rate.

    Returns:
        tuple: File metadata (duration, number of valid chunks, label) and a list of chunk data with IDs.
    """
    y, sr = librosa.load(file_path, sr=sr)  # Load audio
    duration = len(y) / sr  # Calculate total duration in seconds
    chunk_size = chunk_duration * sr       # Calculate chunk size in samples
    chunks = [y[i:i + int(chunk_size)] for i in range(0, len(y), int(chunk_size))]

    # Discard chunks shorter than the desired length
    valid_chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]

    # Prepare metadata and chunks with Chunk ID
    metadata = {"File ID": file_id, "Duration (s)": duration, "Number of Chunks": len(valid_chunks), "Label": label}
    chunk_data = [(f"{file_id}_chunk{i+1}", chunk, label, file_id) for i, chunk in enumerate(valid_chunks)]
    
    return metadata, chunk_data

# Initialize lists for metadata and chunks
file_metadata = []
audio_chunks = []

# Iterate over all files and process
for file_id in tqdm(MLEND_df.index):  # Iterate over file IDs in the CSV
    file_path = base_path + file_id   # Construct full file path
    label = MLEND_df.loc[file_id, 'Story_type']  # Retrieve label from the CSV

    # Split into chunks and collect metadata
    metadata, chunks = split_audio_into_chunks(file_id, file_path, label)
    file_metadata.append(metadata)  # Collect metadata for the file
    audio_chunks.extend(chunks)     # Collect valid chunks

# Convert metadata to a DataFrame for easy viewing
metadata_df = pd.DataFrame(file_metadata)

# Display metadata
print("Summary of Audio Files:")
display(metadata_df)

# Prepare a DataFrame for audio chunks
chunks_df = pd.DataFrame(audio_chunks, columns=["Chunk ID", "Chunk Data", "Label", "File ID"])

# Display chunk summary
print("Summary of Audio Chunks:")
display(chunks_df[["Chunk ID", "Label", "File ID"]])

# Output the total number of valid chunks
print(f"Total valid audio chunks created: {len(audio_chunks)}")


100%|██████████| 100/100 [00:05<00:00, 16.97it/s]

Summary of Audio Files:





Unnamed: 0,File ID,Duration (s),Number of Chunks,Label
0,00001.wav,122.167256,4,deceptive_story
1,00002.wav,125.192018,4,true_story
2,00003.wav,162.984127,5,deceptive_story
3,00004.wav,121.681270,4,deceptive_story
4,00005.wav,134.189751,4,deceptive_story
...,...,...,...,...
95,00096.wav,111.512063,3,deceptive_story
96,00097.wav,185.731224,6,true_story
97,00098.wav,128.252766,4,deceptive_story
98,00099.wav,132.412562,4,true_story


Summary of Audio Chunks:


Unnamed: 0,Chunk ID,Label,File ID
0,00001.wav_chunk1,deceptive_story,00001.wav
1,00001.wav_chunk2,deceptive_story,00001.wav
2,00001.wav_chunk3,deceptive_story,00001.wav
3,00001.wav_chunk4,deceptive_story,00001.wav
4,00002.wav_chunk1,true_story,00002.wav
...,...,...,...
415,00099.wav_chunk4,true_story,00099.wav
416,00100.wav_chunk1,deceptive_story,00100.wav
417,00100.wav_chunk2,deceptive_story,00100.wav
418,00100.wav_chunk3,deceptive_story,00100.wav


Total valid audio chunks created: 420


In [3]:
chunks_df

Unnamed: 0,Chunk ID,Chunk Data,Label,File ID
0,00001.wav_chunk1,"[1.5258789e-05, 1.5258789e-05, 3.0517578e-05, ...",deceptive_story,00001.wav
1,00001.wav_chunk2,"[0.027450562, 0.026519775, 0.025390625, 0.0242...",deceptive_story,00001.wav
2,00001.wav_chunk3,"[-0.00091552734, -0.0011138916, -0.0013122559,...",deceptive_story,00001.wav
3,00001.wav_chunk4,"[6.1035156e-05, 9.1552734e-05, 7.6293945e-05, ...",deceptive_story,00001.wav
4,00002.wav_chunk1,"[0.0008239746, 0.0008239746, 0.00088500977, 0....",true_story,00002.wav
...,...,...,...,...
415,00099.wav_chunk4,"[-3.0517578e-05, -3.0517578e-05, -3.0517578e-0...",true_story,00099.wav
416,00100.wav_chunk1,"[-0.00018310547, -0.00015258789, -6.1035156e-0...",deceptive_story,00100.wav
417,00100.wav_chunk2,"[0.0004272461, 0.00048828125, 0.0005187988, 0....",deceptive_story,00100.wav
418,00100.wav_chunk3,"[6.1035156e-05, 0.0, -6.1035156e-05, -6.103515...",deceptive_story,00100.wav


In [4]:
def extract_features_and_labels(chunks_df, n_mfcc=13, sr=16000, scale_audio=True):
    """
    Extract features (MFCC, Pitch, Energy, ZCR) and associate labels for 30-second audio chunks.

    Args:
        chunks_df (DataFrame): DataFrame containing audio chunks and metadata.
        n_mfcc (int): Number of MFCC coefficients to extract.
        sr (int): Sampling rate for feature extraction.
        scale_audio (bool): Whether to scale audio amplitude.

    Returns:
        np.ndarray: Feature matrix (X) with extracted features for each chunk.
        np.ndarray: Label vector (y) corresponding to each chunk.
    """
    X, y = [], []

    for _, row in tqdm(chunks_df.iterrows(), total=len(chunks_df)):
        chunk = row["Chunk Data"]
        label = row["Label"]

        # Scale the audio if required
        if scale_audio:
            chunk = chunk / np.max(np.abs(chunk)) if np.max(np.abs(chunk)) > 0 else chunk

        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

        # Extract Pitch features
        pitch, _, _ = librosa.pyin(chunk, fmin=80, fmax=450, sr=sr)
        pitch_mean = np.nanmean(pitch) if np.mean(np.isnan(pitch)) < 1 else 0
        pitch_std = np.nanstd(pitch) if np.mean(np.isnan(pitch)) < 1 else 0

        # Compute Energy (RMS)
        rms = np.mean(librosa.feature.rms(y=chunk))

        # Compute Zero-Crossing Rate (ZCR)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=chunk))

        # Combine all features into a single feature vector
        xi = np.concatenate([mfcc, [pitch_mean, pitch_std, rms, zcr]])
        X.append(xi)

        # Append the corresponding label
        y.append(1 if label == 'true_story' else 0)  # Binary encode labels

    return np.array(X), np.array(y)

# Perform feature extraction
X, y = extract_features_and_labels(chunks_df, n_mfcc=13, sr=16000, scale_audio=True)

# Print the shapes of the feature matrix and label vector
print(f"Feature matrix shape: {X.shape}")  # Rows: chunks, Columns: features
print(f"Label vector shape: {y.shape}")  # Labels for each chunk


100%|██████████| 420/420 [12:26<00:00,  1.78s/it]

Feature matrix shape: (420, 17)
Label vector shape: (420,)





In [None]:
from sklearn.model_selection import train_test_split

# Perform data splitting
def split_data(X, y):
    """
    Splits the data into training, validation, and test sets.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Label vector.

    Returns:
        tuple: Split datasets (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    # Split the data into training+validation (80%) and test (20%) sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Further split training+validation into train (75%) and validation (25%) sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


# Perform the split
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Output the shapes of the splits
print(f"Training set size: {X_train.shape}, Labels: {y_train.shape}")
print(f"Validation set size: {X_val.shape}, Labels: {y_val.shape}")
print(f"Test set size: {X_test.shape}, Labels: {y_test.shape}")


Training set size: (252, 17), Labels: (252,)
Validation set size: (84, 17), Labels: (84,)
Test set size: (84, 17), Labels: (84,)


In [31]:
from collections import Counter

def check_label_distribution(labels, split_name):
    """
    Checks and prints the distribution of labels.

    Args:
        labels (np.ndarray): Array of labels.
        split_name (str): Name of the dataset split (e.g., "Training").
    """
    label_counts = Counter(labels)
    total = sum(label_counts.values())
    print(f"\nLabel distribution in {split_name} set:")
    for label, count in label_counts.items():
        percentage = (count / total) * 100
        print(f"  {label}: {count} ({percentage:.2f}%)")

# Check distributions
check_label_distribution(y_train, "Training")
check_label_distribution(y_val, "Validation")
check_label_distribution(y_test, "Test")


Label distribution in Training set:
  0: 131 (51.98%)
  1: 121 (48.02%)

Label distribution in Validation set:
  0: 44 (52.38%)
  1: 40 (47.62%)

Label distribution in Test set:
  0: 44 (52.38%)
  1: 40 (47.62%)


In [9]:
import numpy as np
import librosa
from tqdm import tqdm

def extract_individual_features_and_labels(audio_chunks, sr=22050, scale_audio=True):
    """
    Extract individual features (MFCC, Pitch, Energy, ZCR) and associate labels for 30-second audio chunks.

    Args:
        audio_chunks (list): List of tuples [(chunk_id, chunk, label, file_id)].
        sr (int): Sampling rate for feature extraction.
        scale_audio (bool): Whether to scale audio amplitude.

    Returns:
        dict: Feature matrices for each feature type (MFCC, Pitch, Energy, ZCR).
        np.ndarray: Label vector corresponding to each chunk.
    """
    mfcc_features, pitch_features, energy_features, zcr_features = [], [], [], []
    labels = []

    for chunk_id, chunk, label, file_id in tqdm(audio_chunks):
        # Scale the audio if required
        if scale_audio:
            chunk = chunk / np.max(np.abs(chunk)) if np.max(np.abs(chunk)) > 0 else chunk

        # Extract MFCC features
        n_mfcc = 13
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

        # Extract Pitch features
        try:
            pitch, _, _ = librosa.pyin(chunk, fmin=80, fmax=450, sr=sr)
            pitch_mean = np.nanmean(pitch) if np.mean(np.isnan(pitch)) < 1 else 0
            pitch_std = np.nanstd(pitch) if np.mean(np.isnan(pitch)) < 1 else 0
        except librosa.util.exceptions.ParameterError:
            pitch_mean, pitch_std = 0, 0

        # Compute Energy (RMS)
        rms = np.mean(librosa.feature.rms(y=chunk))

        # Compute Zero-Crossing Rate (ZCR)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=chunk))

        # Append individual feature vectors
        mfcc_features.append(mfcc)
        pitch_features.append([pitch_mean, pitch_std])
        energy_features.append([rms])
        zcr_features.append([zcr])

        # Append the corresponding label
        labels.append(1 if label == 'deceptive_story' else 0)  # Binary encode labels

    # Convert features and labels to numpy arrays
    return {
        'MFCC': np.array(mfcc_features),
        'Pitch': np.array(pitch_features),
        'Energy': np.array(energy_features),
        'ZCR': np.array(zcr_features)
    }, np.array(labels)


In [10]:
# Extract features and labels separately for each feature type
features, y = extract_individual_features_and_labels(audio_chunks, sr=22050)

# Unpack individual feature matrices
X_mfcc = features['MFCC']
X_pitch = features['Pitch']
X_energy = features['Energy']
X_zcr = features['ZCR']

# Display shapes for debugging
print(f"MFCC Feature Matrix Shape: {X_mfcc.shape}")
print(f"Pitch Feature Matrix Shape: {X_pitch.shape}")
print(f"Energy Feature Matrix Shape: {X_energy.shape}")
print(f"ZCR Feature Matrix Shape: {X_zcr.shape}")
print(f"Label Vector Shape: {y.shape}")


100%|██████████| 420/420 [12:02<00:00,  1.72s/it]

MFCC Feature Matrix Shape: (420, 13)
Pitch Feature Matrix Shape: (420, 2)
Energy Feature Matrix Shape: (420, 1)
ZCR Feature Matrix Shape: (420, 1)
Label Vector Shape: (420,)





In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_nested_train_valid_test(X, y, test_size=0.2, valid_ratio=0.25, random_state=42):
    """
    Splits data into train-validation and test sets, then further splits train-validation into training and validation sets.

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Labels.
        test_size (float): Proportion of data to be used as test set.
        valid_ratio (float): Proportion of the train-validation set to be used for validation.
        random_state (int): Random seed for reproducibility.

    Returns:
        tuple: Train, validation, and test sets with normalized features and labels.
    """
    # Split into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Split train-validation into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=valid_ratio, stratify=y_train_val, random_state=random_state
    )

    # Normalize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test


In [14]:
# Split each feature type into train-validation and test sets
X_mfcc_train, X_mfcc_val, X_mfcc_test, y_train, y_val, y_test = prepare_nested_train_valid_test(X_mfcc, y)
X_pitch_train, X_pitch_val, X_pitch_test, _, _, _ = prepare_nested_train_valid_test(X_pitch, y)
X_energy_train, X_energy_val, X_energy_test, _, _, _ = prepare_nested_train_valid_test(X_energy, y)
X_zcr_train, X_zcr_val, X_zcr_test, _, _, _ = prepare_nested_train_valid_test(X_zcr, y)

# Confirm the sizes of the splits
print(f"MFCC Train Shape: {X_mfcc_train.shape}, Validation Shape: {X_mfcc_val.shape}, Test Shape: {X_mfcc_test.shape}")
print(f"Pitch Train Shape: {X_pitch_train.shape}, Validation Shape: {X_pitch_val.shape}, Test Shape: {X_pitch_test.shape}")
print(f"Energy Train Shape: {X_energy_train.shape}, Validation Shape: {X_energy_val.shape}, Test Shape: {X_energy_test.shape}")
print(f"ZCR Train Shape: {X_zcr_train.shape}, Validation Shape: {X_zcr_val.shape}, Test Shape: {X_zcr_test.shape}")
print(f"Labels Train: {len(y_train)}, Validation: {len(y_val)}, Test: {len(y_test)}")


MFCC Train Shape: (252, 13), Validation Shape: (84, 13), Test Shape: (84, 13)
Pitch Train Shape: (252, 2), Validation Shape: (84, 2), Test Shape: (84, 2)
Energy Train Shape: (252, 1), Validation Shape: (84, 1), Test Shape: (84, 1)
ZCR Train Shape: (252, 1), Validation Shape: (84, 1), Test Shape: (84, 1)
Labels Train: 252, Validation: 84, Test: 84


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

# Function to train and evaluate models on validation set
def train_and_evaluate_on_validation(feature_name, X_train, X_val, y_train, y_val):
    """
    Train and evaluate models using validation set.

    Args:
        feature_name (str): Name of the feature (e.g., 'MFCC').
        X_train (np.ndarray): Training feature set.
        X_val (np.ndarray): Validation feature set.
        y_train (np.ndarray): Training labels.
        y_val (np.ndarray): Validation labels.

    Returns:
        dict: Validation F1 score and accuracy for each model.
    """
    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'SVM': SVC(kernel='linear', random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)  # Train the model
        y_pred = model.predict(X_val)  # Predict on validation set
        f1 = f1_score(y_val, y_pred)  # F1 score
        acc = accuracy_score(y_val, y_pred)  # Accuracy
        results[name] = {'F1 Score': f1, 'Accuracy': acc}

    # Print results
    print(f"\nPerformance on {feature_name} Features (Validation Set):")
    for model, metrics in results.items():
        print(f"  {model}: F1 Score = {metrics['F1 Score']:.2f}, Accuracy = {metrics['Accuracy']:.2f}")

    return results

# Train and evaluate models on each feature type
mfcc_results = train_and_evaluate_on_validation('MFCC', X_mfcc_train, X_mfcc_val, y_train, y_val)
pitch_results = train_and_evaluate_on_validation('Pitch', X_pitch_train, X_pitch_val, y_train, y_val)
energy_results = train_and_evaluate_on_validation('Energy', X_energy_train, X_energy_val, y_train, y_val)
zcr_results = train_and_evaluate_on_validation('ZCR', X_zcr_train, X_zcr_val, y_train, y_val)



Performance on MFCC Features (Validation Set):
  Logistic Regression: F1 Score = 0.46, Accuracy = 0.52
  Decision Tree: F1 Score = 0.68, Accuracy = 0.70
  SVM: F1 Score = 0.57, Accuracy = 0.61

Performance on Pitch Features (Validation Set):
  Logistic Regression: F1 Score = 0.12, Accuracy = 0.50
  Decision Tree: F1 Score = 0.55, Accuracy = 0.56
  SVM: F1 Score = 0.14, Accuracy = 0.55

Performance on Energy Features (Validation Set):
  Logistic Regression: F1 Score = 0.00, Accuracy = 0.52
  Decision Tree: F1 Score = 0.47, Accuracy = 0.46
  SVM: F1 Score = 0.00, Accuracy = 0.52

Performance on ZCR Features (Validation Set):
  Logistic Regression: F1 Score = 0.26, Accuracy = 0.54
  Decision Tree: F1 Score = 0.53, Accuracy = 0.58
  SVM: F1 Score = 0.05, Accuracy = 0.54


In [39]:
# Train and evaluate models on each feature type in the specified order
mfcc_results = train_and_evaluate_on_validation('MFCC', X_mfcc_train, X_mfcc_val, y_train, y_val)
pitch_results = train_and_evaluate_on_validation('Pitch', X_pitch_train, X_pitch_val, y_train, y_val)
energy_results = train_and_evaluate_on_validation('Energy', X_energy_train, X_energy_val, y_train, y_val)
zcr_results = train_and_evaluate_on_validation('ZCR', X_zcr_train, X_zcr_val, y_train, y_val)

In [50]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

def train_and_evaluate_svm(feature_name, X_train, X_val, y_train, y_val):
    """
    Train and evaluate SVM using training and validation sets for both linear and RBF kernels.
    """
    results = {}

    # SVM with Linear Kernel
    linear_model = SVC(kernel='linear', random_state=42)
    linear_model.fit(X_train, y_train)  # Train the model
    y_train_pred_linear = linear_model.predict(X_train)
    y_val_pred_linear = linear_model.predict(X_val)

    # Linear Kernel Metrics
    train_acc_linear = accuracy_score(y_train, y_train_pred_linear)
    val_acc_linear = accuracy_score(y_val, y_val_pred_linear)
    train_f1_linear = f1_score(y_train, y_train_pred_linear)
    val_f1_linear = f1_score(y_val, y_val_pred_linear)

    results['Linear Kernel'] = {
        'Training Accuracy': train_acc_linear,
        'Validation Accuracy': val_acc_linear,
        'Training F1 Score': train_f1_linear,
        'Validation F1 Score': val_f1_linear
    }

    print(f"\nSVM (Linear Kernel) Performance on {feature_name} Features:")
    print(f"  Accuracy -> Training: {train_acc_linear:.2f}, Validation: {val_acc_linear:.2f}")
    print(f"  F1 Score -> Training: {train_f1_linear:.2f}, Validation: {val_f1_linear:.2f}")

    # SVM with RBF Kernel
    rbf_model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
    rbf_model.fit(X_train, y_train)  # Train the model
    y_train_pred_rbf = rbf_model.predict(X_train)
    y_val_pred_rbf = rbf_model.predict(X_val)

    # RBF Kernel Metrics
    train_acc_rbf = accuracy_score(y_train, y_train_pred_rbf)
    val_acc_rbf = accuracy_score(y_val, y_val_pred_rbf)
    train_f1_rbf = f1_score(y_train, y_train_pred_rbf)
    val_f1_rbf = f1_score(y_val, y_val_pred_rbf)

    results['RBF Kernel'] = {
        'Training Accuracy': train_acc_rbf,
        'Validation Accuracy': val_acc_rbf,
        'Training F1 Score': train_f1_rbf,
        'Validation F1 Score': val_f1_rbf
    }

    print(f"\nSVM (RBF Kernel, C=1, Gamma='scale') Performance on {feature_name} Features:")
    print(f"  Accuracy -> Training: {train_acc_rbf:.2f}, Validation: {val_acc_rbf:.2f}")
    print(f"  F1 Score -> Training: {train_f1_rbf:.2f}, Validation: {val_f1_rbf:.2f}")

    return results


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

def train_and_evaluate_logistic_regression(feature_name, X_train, X_val, y_train, y_val):
    """
    Train and evaluate Logistic Regression using training and validation sets.
    """
    model = LogisticRegression(max_iter=1000,random_state=42)
    model.fit(X_train, y_train)  # Train the model
    
    # Predictions for train and validation sets
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Accuracy scores
    train_acc = accuracy_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    
    # F1 scores
    train_f1 = f1_score(y_train, y_train_pred)
    val_f1 = f1_score(y_val, y_val_pred)

    print(f"\nLogistic Regression Performance on {feature_name} Features:")
    print(f"  Accuracy -> Training: {train_acc:.2f}, Validation: {val_acc:.2f}")
    print(f"  F1 Score -> Training: {train_f1:.2f}, Validation: {val_f1:.2f}")

    return {
        'Training Accuracy': train_acc, 
        'Validation Accuracy': val_acc,
        'Training F1 Score': train_f1, 
        'Validation F1 Score': val_f1
    }


In [52]:
from sklearn.tree import DecisionTreeClassifier

def train_and_evaluate_decision_tree(feature_name, X_train, X_val, y_train, y_val):
    """
    Train and evaluate Decision Tree using training and validation sets.
    """
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)  # Train the model
    
    # Predictions for train and validation sets
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Accuracy scores
    train_acc = accuracy_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    
    # F1 scores
    train_f1 = f1_score(y_train, y_train_pred)
    val_f1 = f1_score(y_val, y_val_pred)

    print(f"\nDecision Tree Performance on {feature_name} Features:")
    print(f"  Accuracy -> Training: {train_acc:.2f}, Validation: {val_acc:.2f}")
    print(f"  F1 Score -> Training: {train_f1:.2f}, Validation: {val_f1:.2f}")

    return {
        'Training Accuracy': train_acc, 
        'Validation Accuracy': val_acc,
        'Training F1 Score': train_f1, 
        'Validation F1 Score': val_f1
    }


In [59]:
# Evaluate SVM for each feature
print("\nEvaluating SVM...")
mfcc_svm_results = train_and_evaluate_svm('MFCC', X_mfcc_train, X_mfcc_val, y_train, y_val)
pitch_svm_results = train_and_evaluate_svm('Pitch', X_pitch_train, X_pitch_val, y_train, y_val)
energy_svm_results = train_and_evaluate_svm('Energy', X_energy_train, X_energy_val, y_train, y_val)
zcr_svm_results = train_and_evaluate_svm('ZCR', X_zcr_train, X_zcr_val, y_train, y_val)

# Logistic Regression
print("\nEvaluating Logistic Regression...")
mfcc_lr_results = train_and_evaluate_logistic_regression('MFCC', X_mfcc_train, X_mfcc_val, y_train, y_val)
pitch_lr_results = train_and_evaluate_logistic_regression('Pitch', X_pitch_train, X_pitch_val, y_train, y_val)
energy_lr_results = train_and_evaluate_logistic_regression('Energy', X_energy_train, X_energy_val, y_train, y_val)
zcr_lr_results = train_and_evaluate_logistic_regression('ZCR', X_zcr_train, X_zcr_val, y_train, y_val)

# Decision Tree
print("\nEvaluating Decision Tree...")
mfcc_dt_results = train_and_evaluate_decision_tree('MFCC', X_mfcc_train, X_mfcc_val, y_train, y_val)
pitch_dt_results = train_and_evaluate_decision_tree('Pitch', X_pitch_train, X_pitch_val, y_train, y_val)
energy_dt_results = train_and_evaluate_decision_tree('Energy', X_energy_train, X_energy_val, y_train, y_val)
zcr_dt_results = train_and_evaluate_decision_tree('ZCR', X_zcr_train, X_zcr_val, y_train, y_val)


Evaluating SVM...

SVM (Linear Kernel) Performance on MFCC Features:
  Accuracy -> Training: 0.66, Validation: 0.61
  F1 Score -> Training: 0.65, Validation: 0.57

SVM (RBF Kernel, C=1, Gamma='scale') Performance on MFCC Features:
  Accuracy -> Training: 0.94, Validation: 0.73
  F1 Score -> Training: 0.93, Validation: 0.69

SVM (Linear Kernel) Performance on Pitch Features:
  Accuracy -> Training: 0.53, Validation: 0.55
  F1 Score -> Training: 0.12, Validation: 0.14

SVM (RBF Kernel, C=1, Gamma='scale') Performance on Pitch Features:
  Accuracy -> Training: 0.66, Validation: 0.40
  F1 Score -> Training: 0.57, Validation: 0.17

SVM (Linear Kernel) Performance on Energy Features:
  Accuracy -> Training: 0.52, Validation: 0.52
  F1 Score -> Training: 0.00, Validation: 0.00

SVM (RBF Kernel, C=1, Gamma='scale') Performance on Energy Features:
  Accuracy -> Training: 0.55, Validation: 0.51
  F1 Score -> Training: 0.16, Validation: 0.05

SVM (Linear Kernel) Performance on ZCR Features:
  Ac

In [None]:
# import pandas as pd

# # Collect results for all features
# all_results = {
#     'MFCC': mfcc_results,
#     'Pitch': pitch_results,
#     'Energy': energy_results,
#     'ZCR': zcr_results
# }

# # Prepare data for a multi-level column structure
# data = []
# models = list(all_results['MFCC'].keys())  # Assume all feature results have the same models

# for model in models:
#     row = []
#     for feature, results in all_results.items():
#         metrics = results[model]
#         row.extend([metrics['Training Accuracy'], metrics['Validation Accuracy']])
#     data.append(row)

# # Define multi-level column names
# columns = pd.MultiIndex.from_product(
#     [all_results.keys(), ['Training Accuracy', 'Validation Accuracy']],
#     names=['Feature', 'Metric']
# )

# # Create the DataFrame
# summary_df = pd.DataFrame(data, index=models, columns=columns)

# # Display the summary table
# print("\nTraining and Validation Results Across Features:")
# display(summary_df)


In [35]:
# # Prepare data for F1 scores
# f1_data = []

# for model in models:  # Use the same models list
#     row = []
#     for feature, results in all_results.items():
#         metrics = results[model]
#         row.extend([metrics['Training F1 Score'], metrics['Validation F1 Score']])
#     f1_data.append(row)

# # Define multi-level column names for F1 Scores
# f1_columns = pd.MultiIndex.from_product(
#     [all_results.keys(), ['Training F1 Score', 'Validation F1 Score']],
#     names=['Feature', 'Metric']
# )

# # Create the DataFrame for F1 Scores
# f1_summary_df = pd.DataFrame(f1_data, index=models, columns=f1_columns)

# # Display the F1 score summary table
# print("\nTraining and Validation F1 Scores Across Features:")
# display(f1_summary_df)


In [64]:
import pandas as pd

# Collect results for all features
all_results = {
    'MFCC': mfcc_results,
    'Pitch': pitch_results,
    'Energy': energy_results,
    'ZCR': zcr_results
}

# Prepare data for a multi-level column structure
data = []
models = list(all_results['MFCC'].keys())

# Prepare data for combined table
combined_data = []

for model in models:
    row = []
    for feature, results in all_results.items():
        metrics = results[model]
        # Append Accuracy and F1 Score metrics (Training and Validation)
        row.extend([
            metrics['Training Accuracy'], metrics['Validation Accuracy'],  # Accuracy
            metrics['Training F1 Score'], metrics['Validation F1 Score']   # F1 Score
        ])
    combined_data.append(row)

# Define multi-level column names
combined_columns = pd.MultiIndex.from_product(
    [all_results.keys(), ['Accuracy', 'F1 Score'], ['Training', 'Validation']],
    names=['Feature', 'Metric', 'Set']
)

# Create the DataFrame
combined_summary_df = pd.DataFrame(combined_data, index=models, columns=combined_columns)

# Display the combined summary table
print("\nCombined Training and Validation Metrics (Accuracy and F1 Scores) Across Features:")
display(combined_summary_df)



Combined Training and Validation Metrics (Accuracy and F1 Scores) Across Features:


Feature,MFCC,MFCC,MFCC,MFCC,Pitch,Pitch,Pitch,Pitch,Energy,Energy,Energy,Energy,ZCR,ZCR,ZCR,ZCR
Metric,Accuracy,Accuracy,F1 Score,F1 Score,Accuracy,Accuracy,F1 Score,F1 Score,Accuracy,Accuracy,F1 Score,F1 Score,Accuracy,Accuracy,F1 Score,F1 Score
Set,Training,Validation,Training,Validation,Training,Validation,Training,Validation,Training,Validation,Training,Validation,Training,Validation,Training,Validation
SVM (Linear Kernel),0.65873,0.607143,0.653226,0.571429,0.527778,0.547619,0.118519,0.136364,0.519841,0.52381,0.0,0.0,0.519841,0.535714,0.047244,0.04878
SVM (RBF Kernel),0.936508,0.72619,0.933333,0.693333,0.662698,0.404762,0.572864,0.166667,0.547619,0.511905,0.161765,0.046512,0.575397,0.619048,0.40884,0.428571
Logistic Regression,0.638889,0.52381,0.619247,0.459459,0.563492,0.5,0.294872,0.125,0.52381,0.52381,0.016393,0.0,0.547619,0.535714,0.366667,0.264151
Decision Tree,0.81746,0.630952,0.839161,0.673684,0.742063,0.547619,0.73251,0.5,0.638889,0.5,0.616034,0.487805,0.626984,0.607143,0.591304,0.507463


In [79]:
# Find the best model for each feature based on a combination of Accuracy and F1 Score
best_models = {}
for feature in all_results.keys():
    # Extract validation Accuracy and F1 Scores for all models for the current feature
    validation_accuracy = combined_summary_df[feature, 'Accuracy', 'Validation']
    validation_f1_scores = combined_summary_df[feature, 'F1 Score', 'Validation']

    # Calculate a combined score (e.g., average of Accuracy and F1 Score)
    combined_scores = (validation_accuracy + validation_f1_scores) / 2

    # Find the model with the highest combined score
    best_model = combined_scores.idxmax()
    best_combined_score = combined_scores.max()
    best_accuracy = validation_accuracy[best_model]
    best_f1_score = validation_f1_scores[best_model]

    # Store results
    best_models[feature] = {
        'Model': best_model,
        'Validation Accuracy': best_accuracy,
        'Validation F1 Score': best_f1_score,
        'Combined Score': best_combined_score
    }

# Display the best models for each feature
print("\nBest Models for Each Feature (Considering Accuracy and F1 Score):")
for feature, info in best_models.items():
    print(f"{feature}: Best Model = {info['Model']}, "
          f"Validation Accuracy = {info['Validation Accuracy']:.2f}, "
          f"Validation F1 Score = {info['Validation F1 Score']:.2f}, "
          f"Combined Score = {info['Combined Score']:.2f}")



Best Models for Each Feature (Considering Accuracy and F1 Score):
MFCC: Best Model = SVM (RBF Kernel), Validation Accuracy = 0.73, Validation F1 Score = 0.69, Combined Score = 0.71
Pitch: Best Model = Decision Tree, Validation Accuracy = 0.55, Validation F1 Score = 0.50, Combined Score = 0.52
Energy: Best Model = Decision Tree, Validation Accuracy = 0.50, Validation F1 Score = 0.49, Combined Score = 0.49
ZCR: Best Model = Decision Tree, Validation Accuracy = 0.61, Validation F1 Score = 0.51, Combined Score = 0.56


In [88]:
# Train and store models for each feature
models_mfcc = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000).fit(X_mfcc_train, y_train),
    'Decision Tree': DecisionTreeClassifier(random_state=42).fit(X_mfcc_train, y_train),
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42).fit(X_mfcc_train, y_train),
    'SVM (RBF Kernel)': SVC(kernel='rbf', C=1, gamma='scale', random_state=42).fit(X_mfcc_train, y_train)
}

models_pitch = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000).fit(X_pitch_train, y_train),
    'Decision Tree': DecisionTreeClassifier(random_state=42).fit(X_pitch_train, y_train),
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42).fit(X_pitch_train, y_train),
    'SVM (RBF Kernel)': SVC(kernel='rbf', C=1, gamma='scale', random_state=42).fit(X_pitch_train, y_train)
}

models_energy = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000).fit(X_energy_train, y_train),
    'Decision Tree': DecisionTreeClassifier(random_state=42).fit(X_energy_train, y_train),
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42).fit(X_energy_train, y_train),
    'SVM (RBF Kernel)': SVC(kernel='rbf', C=1, gamma='scale', random_state=42).fit(X_energy_train, y_train)
}

models_zcr = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000).fit(X_zcr_train, y_train),
    'Decision Tree': DecisionTreeClassifier(random_state=42).fit(X_zcr_train, y_train),
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42).fit(X_zcr_train, y_train),
    'SVM (RBF Kernel)': SVC(kernel='rbf', C=1, gamma='scale', random_state=42).fit(X_zcr_train, y_train)
}


In [90]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_models(models, X_val, y_val, feature_name):
    """
    Evaluate all models for a specific feature on the validation set.

    Args:
        models (dict): Dictionary of trained models for the feature.
        X_val (np.ndarray): Validation features for the feature.
        y_val (np.ndarray): Validation labels.
        feature_name (str): Name of the feature being evaluated.

    Returns:
        dict: Performance metrics (Accuracy and F1 Score) for all models.
    """
    results = {}
    print(f"\nEvaluating Models for {feature_name} Feature:")
    for model_name, model in models.items():
        y_val_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred)
        results[model_name] = {
            'Validation Accuracy': acc,
            'Validation F1 Score': f1
        }
        print(f"  {model_name}: Accuracy = {acc:.2f}, F1 Score = {f1:.2f}")
    return results

# Evaluate all models for each feature
mfcc_results = evaluate_models(models_mfcc, X_mfcc_val, y_val, 'MFCC')
pitch_results = evaluate_models(models_pitch, X_pitch_val, y_val, 'Pitch')
energy_results = evaluate_models(models_energy, X_energy_val, y_val, 'Energy')
zcr_results = evaluate_models(models_zcr, X_zcr_val, y_val, 'ZCR')



Evaluating Models for MFCC Feature:
  Logistic Regression: Accuracy = 0.52, F1 Score = 0.46
  Decision Tree: Accuracy = 0.70, F1 Score = 0.68
  SVM (Linear Kernel): Accuracy = 0.61, F1 Score = 0.57
  SVM (RBF Kernel): Accuracy = 0.73, F1 Score = 0.69

Evaluating Models for Pitch Feature:
  Logistic Regression: Accuracy = 0.50, F1 Score = 0.12
  Decision Tree: Accuracy = 0.56, F1 Score = 0.55
  SVM (Linear Kernel): Accuracy = 0.55, F1 Score = 0.14
  SVM (RBF Kernel): Accuracy = 0.40, F1 Score = 0.17

Evaluating Models for Energy Feature:
  Logistic Regression: Accuracy = 0.52, F1 Score = 0.00
  Decision Tree: Accuracy = 0.46, F1 Score = 0.47
  SVM (Linear Kernel): Accuracy = 0.52, F1 Score = 0.00
  SVM (RBF Kernel): Accuracy = 0.51, F1 Score = 0.05

Evaluating Models for ZCR Feature:
  Logistic Regression: Accuracy = 0.54, F1 Score = 0.26
  Decision Tree: Accuracy = 0.58, F1 Score = 0.53
  SVM (Linear Kernel): Accuracy = 0.54, F1 Score = 0.05
  SVM (RBF Kernel): Accuracy = 0.62, F1 Sco

In [91]:
# Function to find the best model based on Validation F1 Score
def find_best_model(results, feature_name):
    """
    Find the best model for a specific feature based on Validation F1 Score.

    Args:
        results (dict): Evaluation results for all models of a feature.
        feature_name (str): Name of the feature.

    Returns:
        dict: Best model details (name, metrics).
    """
    best_model_name = None
    best_metrics = None
    best_f1 = 0

    for model_name, metrics in results.items():
        if metrics['Validation F1 Score'] > best_f1:
            best_model_name = model_name
            best_metrics = metrics
            best_f1 = metrics['Validation F1 Score']

    print(f"\nBest Model for {feature_name} Feature: {best_model_name} "
          f"(Accuracy = {best_metrics['Validation Accuracy']:.2f}, "
          f"F1 Score = {best_metrics['Validation F1 Score']:.2f})")
    return {
        'Model Name': best_model_name,
        'Metrics': best_metrics
    }

# Identify the best model for each feature
best_models = {
    'MFCC': find_best_model(mfcc_results, 'MFCC'),
    'Pitch': find_best_model(pitch_results, 'Pitch'),
    'Energy': find_best_model(energy_results, 'Energy'),
    'ZCR': find_best_model(zcr_results, 'ZCR')
}



Best Model for MFCC Feature: SVM (RBF Kernel) (Accuracy = 0.73, F1 Score = 0.69)

Best Model for Pitch Feature: Decision Tree (Accuracy = 0.56, F1 Score = 0.55)

Best Model for Energy Feature: Decision Tree (Accuracy = 0.46, F1 Score = 0.47)

Best Model for ZCR Feature: Decision Tree (Accuracy = 0.58, F1 Score = 0.53)


In [92]:
import numpy as np

# Collect predictions from the best models for validation set
mfcc_preds = models_mfcc[best_models['MFCC']['Model Name']].predict(X_mfcc_val)
pitch_preds = models_pitch[best_models['Pitch']['Model Name']].predict(X_pitch_val)
energy_preds = models_energy[best_models['Energy']['Model Name']].predict(X_energy_val)
zcr_preds = models_zcr[best_models['ZCR']['Model Name']].predict(X_zcr_val)

# Combine predictions (Majority Voting)
all_preds = np.array([mfcc_preds, pitch_preds, energy_preds, zcr_preds])
ensemble_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_preds)

# Evaluate Ensemble
ensemble_accuracy = accuracy_score(y_val, ensemble_preds)
ensemble_f1 = f1_score(y_val, ensemble_preds)

print("\nEnsemble Performance (Majority Voting):")
print(f"  Accuracy: {ensemble_accuracy:.2f}")
print(f"  F1 Score: {ensemble_f1:.2f}")



Ensemble Performance (Majority Voting):
  Accuracy: 0.58
  F1 Score: 0.44


In [83]:
# Define models dictionary with actual trained models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000,random_state=42),  # Replace with your trained Logistic Regression model
    'Decision Tree':DecisionTreeClassifier(random_state=42),  # Replace with your trained Decision Tree model
    'SVM (Linear Kernel)': SVC(kernel='linear', random_state=42),  # Replace with your trained SVM (Linear Kernel) model
    'SVM (RBF Kernel)': SVC(kernel='rbf', C=1, gamma='scale', random_state=42) # Replace with your trained SVM (RBF Kernel) model
}

# Find the best model for each feature based on combined metrics
best_models = {}
for feature, results in all_results.items():
    best_model_name = None
    best_model_object = None
    best_combined_score = 0

    for model_name, metrics in results.items():
        # Calculate combined score (average of Accuracy and F1 Score)
        combined_score = (metrics['Validation Accuracy'] + metrics['Validation F1 Score']) / 2

        if combined_score > best_combined_score:
            best_combined_score = combined_score
            best_model_name = model_name
            best_model_object = models[model_name]  # Retrieve the trained model from the dictionary

    # Save the best model details
    best_models[feature] = {
        'Model Name': best_model_name,
        'Model': best_model_object,  # Store the actual trained model object
        'Validation Accuracy': results[best_model_name]['Validation Accuracy'],
        'Validation F1 Score': results[best_model_name]['Validation F1 Score'],
        'Combined Score': best_combined_score
    }

# Display the best models for each feature
print("\nBest Models for Each Feature (Considering Accuracy and F1 Score):")
for feature, info in best_models.items():
    print(f"{feature}: Best Model = {info['Model Name']}, "
          f"Validation Accuracy = {info['Validation Accuracy']:.2f}, "
          f"Validation F1 Score = {info['Validation F1 Score']:.2f}, "
          f"Combined Score = {info['Combined Score']:.2f}")



Best Models for Each Feature (Considering Accuracy and F1 Score):
MFCC: Best Model = SVM (RBF Kernel), Validation Accuracy = 0.73, Validation F1 Score = 0.69, Combined Score = 0.71
Pitch: Best Model = Decision Tree, Validation Accuracy = 0.55, Validation F1 Score = 0.50, Combined Score = 0.52
Energy: Best Model = Decision Tree, Validation Accuracy = 0.50, Validation F1 Score = 0.49, Combined Score = 0.49
ZCR: Best Model = Decision Tree, Validation Accuracy = 0.61, Validation F1 Score = 0.51, Combined Score = 0.56


In [65]:
from sklearn.ensemble import VotingClassifier

def majority_voting_ensemble(X_train, X_val, y_train, y_val, models):
    """
    Perform majority voting ensemble using VotingClassifier.

    Args:
        X_train (np.ndarray): Training feature set.
        X_val (np.ndarray): Validation feature set.
        y_train (np.ndarray): Training labels.
        y_val (np.ndarray): Validation labels.
        models (list of tuples): List of (name, model) pairs.

    Returns:
        dict: Accuracy and F1 Score of the ensemble on validation data.
    """
    # Create a VotingClassifier
    ensemble_model = VotingClassifier(estimators=models, voting='hard')
    ensemble_model.fit(X_train, y_train)  # Train the ensemble model

    # Predict on validation set
    y_val_pred = ensemble_model.predict(X_val)

    # Compute metrics
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)

    print("\nMajority Voting Ensemble Performance:")
    print(f"  Accuracy: {val_acc:.2f}")
    print(f"  F1 Score: {val_f1:.2f}")

    return {
        'Validation Accuracy': val_acc,
        'Validation F1 Score': val_f1
    }


In [67]:
# Prepare individual models for the ensemble
models = [
    ('lr', LogisticRegression(max_iter=1000,random_state=42)),  # Best logistic regression model
    ('dt', DecisionTreeClassifier(random_state=42)),  # Best decision tree model
    ('svm_linear', SVC(kernel='linear', random_state=42)),  # Best SVM (linear kernel)
    ('svm_rbf', SVC(kernel='rbf', C=1, gamma='scale', random_state=42))  # Best SVM (RBF kernel)
]

# Perform the ensemble
ensemble_results = majority_voting_ensemble(X_train, X_val, y_train, y_val, models)



Majority Voting Ensemble Performance:
  Accuracy: 0.49
  F1 Score: 0.43


In [70]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def feature_level_ensemble(predictions, y_val):
    """
    Perform feature-level ensemble using majority voting.

    Args:
        predictions (list of np.ndarray): List of predictions from models trained on different features.
        y_val (np.ndarray): Ground truth labels for the validation set.

    Returns:
        dict: Accuracy and F1 Score of the ensemble on validation data.
    """
    # Stack predictions and compute majority vote for each sample
    predictions_array = np.array(predictions)
    ensemble_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions_array)

    # Compute metrics
    val_acc = accuracy_score(y_val, ensemble_preds)
    val_f1 = f1_score(y_val, ensemble_preds)

    print("\nFeature-Level Ensemble Performance:")
    print(f"  Accuracy: {val_acc:.2f}")
    print(f"  F1 Score: {val_f1:.2f}")

    return {
        'Validation Accuracy': val_acc,
        'Validation F1 Score': val_f1
    }
