In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from tqdm import tqdm

# Path to the base directory
base_path = './MLEnd/deception/MLEndDD_stories_small/'

# Load the CSV file and ensure labels are accessible
MLEND_df = pd.read_csv('./MLEnd/deception/MLEndDD_story_attributes_small.csv').set_index('filename')

# Create a list of full file paths using the CSV index
files = [base_path + file for file in MLEND_df.index]

# Check the number of files and preview the dataset
print(f"We have {len(files)} audio files in the dataset.")
print("Preview of the dataset:")
display(MLEND_df.head())


We have 100 audio files in the dataset.
Preview of the dataset:


Unnamed: 0_level_0,Language,Story_type
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
00001.wav,Hindi,deceptive_story
00002.wav,English,true_story
00003.wav,English,deceptive_story
00004.wav,Bengali,deceptive_story
00005.wav,English,deceptive_story


In [2]:
import librosa

def split_audio_into_chunks(file_id, file_path, label, chunk_duration=30, sr=None):
    """
    Splits an audio file into 30-second chunks and discards chunks shorter than 30 seconds.

    Args:
        file_id (str): The file ID (original file name).
        file_path (str): Path to the audio file.
        label (str): Label for the audio file (e.g., 'true_story' or 'deceptive_story').
        chunk_duration (int): Duration of each chunk in seconds (default: 30).
        sr (int or None): Sampling rate. If None, uses the original rate.

    Returns:
        list: A list of tuples [(chunk, label, file_id)].
    """
    y, sr = librosa.load(file_path, sr=sr)  # Load audio
    chunk_size = chunk_duration * sr       # Calculate chunk size in samples
    chunks = [y[i:i + int(chunk_size)] for i in range(0, len(y), int(chunk_size))]

    # Discard chunks shorter than the desired length
    valid_chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]

    # Return a list of (chunk, label, file_id)
    return [(chunk, label, file_id) for chunk in valid_chunks]

# Iterate over all files and split them into chunks
audio_chunks = []
for file_id in tqdm(MLEND_df.index):  # Iterate over file IDs in the CSV
    file_path = base_path + file_id   # Construct full file path
    label = MLEND_df.loc[file_id, 'Story_type']  # Retrieve label from the CSV
    audio_chunks.extend(split_audio_into_chunks(file_id, file_path, label))

# Output the total number of valid chunks
print(f"Total valid audio chunks created: {len(audio_chunks)}")


100%|██████████| 100/100 [00:05<00:00, 19.45it/s]

Total valid audio chunks created: 420





In [14]:
import librosa
import pandas as pd
from tqdm import tqdm

def split_audio_into_chunks(file_id, file_path, label, chunk_duration=30, sr=None):
    """
    Splits an audio file into 30-second chunks and discards chunks shorter than 30 seconds.

    Args:
        file_id (str): The file ID (original file name).
        file_path (str): Path to the audio file.
        label (str): Label for the audio file (e.g., 'true_story' or 'deceptive_story').
        chunk_duration (int): Duration of each chunk in seconds (default: 30).
        sr (int or None): Sampling rate. If None, uses the original rate.

    Returns:
        tuple: File metadata (duration, number of valid chunks, label) and a list of chunk data with IDs.
    """
    y, sr = librosa.load(file_path, sr=sr)  # Load audio
    duration = len(y) / sr  # Calculate total duration in seconds
    chunk_size = chunk_duration * sr       # Calculate chunk size in samples
    chunks = [y[i:i + int(chunk_size)] for i in range(0, len(y), int(chunk_size))]

    # Discard chunks shorter than the desired length
    valid_chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]

    # Prepare metadata and chunks with Chunk ID
    metadata = {"File ID": file_id, "Duration (s)": duration, "Number of Chunks": len(valid_chunks), "Label": label}
    chunk_data = [(f"{file_id}_chunk{i+1}", chunk, label, file_id) for i, chunk in enumerate(valid_chunks)]
    
    return metadata, chunk_data

# Initialize lists for metadata and chunks
file_metadata = []
audio_chunks = []

# Iterate over all files and process
for file_id in tqdm(MLEND_df.index):  # Iterate over file IDs in the CSV
    file_path = base_path + file_id   # Construct full file path
    label = MLEND_df.loc[file_id, 'Story_type']  # Retrieve label from the CSV

    # Split into chunks and collect metadata
    metadata, chunks = split_audio_into_chunks(file_id, file_path, label)
    file_metadata.append(metadata)  # Collect metadata for the file
    audio_chunks.extend(chunks)     # Collect valid chunks

# Convert metadata to a DataFrame for easy viewing
metadata_df = pd.DataFrame(file_metadata)

# Display metadata
print("Summary of Audio Files:")
display(metadata_df)

# Prepare a DataFrame for audio chunks
chunks_df = pd.DataFrame(audio_chunks, columns=["Chunk ID", "Chunk Data", "Label", "File ID"])

# Display chunk summary
print("Summary of Audio Chunks:")
display(chunks_df[["Chunk ID", "Label", "File ID"]])

# Output the total number of valid chunks
print(f"Total valid audio chunks created: {len(audio_chunks)}")


100%|██████████| 100/100 [00:04<00:00, 21.64it/s]

Summary of Audio Files:





Unnamed: 0,File ID,Duration (s),Number of Chunks,Label
0,00001.wav,122.167256,4,deceptive_story
1,00002.wav,125.192018,4,true_story
2,00003.wav,162.984127,5,deceptive_story
3,00004.wav,121.681270,4,deceptive_story
4,00005.wav,134.189751,4,deceptive_story
...,...,...,...,...
95,00096.wav,111.512063,3,deceptive_story
96,00097.wav,185.731224,6,true_story
97,00098.wav,128.252766,4,deceptive_story
98,00099.wav,132.412562,4,true_story


Summary of Audio Chunks:


Unnamed: 0,Chunk ID,Label,File ID
0,00001.wav_chunk1,deceptive_story,00001.wav
1,00001.wav_chunk2,deceptive_story,00001.wav
2,00001.wav_chunk3,deceptive_story,00001.wav
3,00001.wav_chunk4,deceptive_story,00001.wav
4,00002.wav_chunk1,true_story,00002.wav
...,...,...,...
415,00099.wav_chunk4,true_story,00099.wav
416,00100.wav_chunk1,deceptive_story,00100.wav
417,00100.wav_chunk2,deceptive_story,00100.wav
418,00100.wav_chunk3,deceptive_story,00100.wav


Total valid audio chunks created: 420


In [None]:
audio_chunks[0]     #[(f"{file_id}_chunk{i+1}", chunk, label, file_id)

('00001.wav_chunk1',
 array([1.5258789e-05, 1.5258789e-05, 3.0517578e-05, ..., 2.8564453e-02,
        2.8488159e-02, 2.8121948e-02], dtype=float32),
 'deceptive_story',
 '00001.wav')

In [20]:
from collections import Counter
# Extract labels from the chunks
chunk_labels = [chunk[2] for chunk in audio_chunks]  # Extract the 'label' field from the tuples

# Count the occurrences of each label
label_counts = Counter(chunk_labels)

# Display the label distribution
print("Label Distribution in Chunks:")
for label, count in label_counts.items():
    print(f"{label}: {count} chunks")


Label Distribution in Chunks:
deceptive_story: 201 chunks
true_story: 219 chunks


show each recording has how many chunks and their corresponding label

- code required


Your dataset has the following label distribution:

True Stories: 219 chunks.
Deceptive Stories: 201 chunks.
This results in the following proportions:

True Stories: 
219
420
≈
52.14
%
420
219
​
 ≈52.14%
Deceptive Stories: 
201
420
≈
47.86
%
420
201
​
 ≈47.86%
Is This Balanced?
In machine learning, a dataset is generally considered balanced if:

The class proportions are roughly equal (e.g., both classes are close to 50% in binary classification).

In [22]:
import numpy as np
import librosa
from tqdm import tqdm

def extract_features_and_labels(audio_chunks, sr=16000, scale_audio=True):
    """
    Extract features (MFCC, Pitch, Energy, ZCR) and associate labels for 30-second audio chunks.

    Args:
        audio_chunks (list): List of tuples [(chunk, label, file_id)].
        n_mfcc (int): Number of MFCC coefficients to extract.
        sr (int): Sampling rate for feature extraction.
        scale_audio (bool): Whether to scale audio amplitude.

    Returns:
        np.ndarray: Feature matrix (X) with extracted features for each chunk.
        np.ndarray: Label vector (y) corresponding to each chunk.
    """
    X, y = [], []

    for chunk_id, chunk, label, file_id in tqdm(audio_chunks):
        # Scale the audio if required
        if scale_audio:
            chunk = chunk / np.max(np.abs(chunk)) if np.max(np.abs(chunk)) > 0 else chunk

        # Extract MFCC features
        n_mfcc =13
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

        # Extract Pitch features
        pitch, _, _ = librosa.pyin(chunk, fmin=80, fmax=450, sr=sr)
        pitch_mean = np.nanmean(pitch) if np.mean(np.isnan(pitch)) < 1 else 0
        pitch_std = np.nanstd(pitch) if np.mean(np.isnan(pitch)) < 1 else 0

        # Compute Energy (RMS)
        rms = np.mean(librosa.feature.rms(y=chunk))

        # Compute Zero-Crossing Rate (ZCR)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=chunk))

        # Combine all features into a single feature vector
        xi = np.concatenate([mfcc, [pitch_mean, pitch_std, rms, zcr]])
        X.append(xi)

        # Append the corresponding label
        y.append(1 if label == 'true_story' else 0)  # Binary encode labels

    return np.array(X), np.array(y)

In [None]:
# Ensure feature extraction is performed
X, y = extract_features_and_labels(audio_chunks, sr=16000, scale_audio=True)

# Check the shapes of X and y
print(f"Feature matrix shape: {X.shape}")
print(f"Label vector shape: {y.shape}")

#12.30s

100%|██████████| 420/420 [12:30<00:00,  1.79s/it]

Feature matrix shape: (420, 17)
Label vector shape: (420,)





In [26]:
import pandas as pd

# Convert feature matrix (X) into a DataFrame for better readability
feature_columns = [f"MFCC_{i+1}" for i in range(13)] + ["Pitch_Mean", "Pitch_Std", "RMS", "ZCR"]
features_df = pd.DataFrame(X, columns=feature_columns)

# Add corresponding labels for better context
features_df["Label"] = y
features_df["File_ID"] = chunks_df['Chunk ID']

# Display the first few rows of the DataFrame
print("First 10 rows of the extracted features:")
display(features_df.head(10))


First 10 rows of the extracted features:


Unnamed: 0,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,MFCC_10,MFCC_11,MFCC_12,MFCC_13,Pitch_Mean,Pitch_Std,RMS,ZCR,Label,File_ID
0,-397.277435,115.308823,26.619108,4.842953,15.125876,13.251455,8.787902,7.114931,5.688897,3.441538,-0.148061,4.820008,14.553017,89.916438,11.904371,0.067561,0.052729,0,00001.wav_chunk1
1,-341.413879,112.532097,26.87512,3.859308,10.727917,9.089574,8.194948,8.354401,6.138742,3.457012,0.086589,4.308666,13.157206,100.921285,57.813181,0.097667,0.055318,0,00001.wav_chunk2
2,-355.009125,119.337051,24.410446,6.237376,14.495246,9.676167,8.971926,7.552096,3.964255,3.650847,0.749403,4.240197,13.81474,100.859701,53.138592,0.105106,0.058541,0,00001.wav_chunk3
3,-388.253235,116.238152,26.251518,5.487333,13.262886,9.602457,6.639931,5.021115,4.648613,5.378964,0.879893,3.370997,13.841707,98.482368,38.953589,0.066512,0.051922,0,00001.wav_chunk4
4,-341.225281,118.12944,21.087967,23.655165,2.626698,27.793694,16.04888,7.064847,5.248309,-0.175199,13.49232,5.19523,-5.62233,131.835363,69.975678,0.052946,0.069831,1,00002.wav_chunk1
5,-352.747955,123.446228,22.154636,22.628866,3.410473,31.359163,17.365978,5.906202,3.478702,-0.525651,12.964216,3.917714,-7.604085,113.318635,52.48419,0.05243,0.067662,1,00002.wav_chunk2
6,-335.256531,114.930779,21.377239,21.028791,7.294496,27.854549,18.035589,6.776508,3.678071,1.138009,12.520448,1.734315,-6.671849,119.723847,61.242451,0.057172,0.070031,1,00002.wav_chunk3
7,-321.387543,120.846916,23.287342,22.254333,4.051879,27.580341,17.535589,7.562186,5.637784,2.195768,14.759595,1.976602,-8.822467,116.887366,53.195553,0.068936,0.061652,1,00002.wav_chunk4
8,-342.061371,119.052788,33.504265,10.626637,22.60763,32.303413,15.722194,8.065565,-1.798357,-2.87368,-4.494293,-7.010339,-7.906294,84.149052,7.553484,0.118167,0.060897,0,00003.wav_chunk1
9,-332.177429,119.742607,34.443279,15.258172,22.072105,27.921467,14.051162,8.111733,-1.800246,-6.144019,-1.627588,-4.922258,-10.025845,111.379344,81.207328,0.117293,0.065956,0,00003.wav_chunk2


In [27]:
from sklearn.model_selection import train_test_split

# Split into train-validation and test sets (80% train-validation, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Further split the train-validation set into training and validation sets (75% train, 25% validation of train-validation)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42
)

# Output shapes
print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")
print(f"Test set size: {X_test.shape}")


Training set size: (252, 17)
Validation set size: (84, 17)
Test set size: (84, 17)


If you have 420 chunks:
Training set size: 60%×420=252
Validation set size: 20%×420=84
Test set size: 20%×420=84

Final Distribution:
After the second split, the proportions of the original dataset are:
Training Set: 80%×0.75=60%
Validation Set: 80%×0.25=20%
Test Set: 20%
This matches the common practice of splitting datasets into 60% training, 20% validation, and 20% testing.

Why Not Split Directly into 60-20-20?
If you split directly into 60% training, 20% validation, and 20% testing, you lose flexibility to use cross-validation or other techniques on the training-validation set.
By splitting into 80% training-validation and 20% testing first, you can adjust the train-validation ratio later without impacting the test set.

Stratifying y during train-validation-test splits ensures that the proportion of classes (e.g., true_story and deceptive_story) is maintained across all subsets of the data. Here’s why this is important:

1. Maintain Class Balance
If the dataset is imbalanced (e.g., 70% true_story and 30% deceptive_story), random splitting without stratification might lead to subsets with unequal distributions, such as:

Training set: 80% true_story and 20% deceptive_story
Validation set: 90% true_story and 10% deceptive_story
Test set: 50% true_story and 50% deceptive_story
This imbalance can skew the model's performance metrics because the model might:

Overfit to the majority class in the training set.
Face challenges in evaluating minority class performance in the validation or test set.
By stratifying y, the class proportions are preserved in all subsets.

2. Reliable Model Evaluation
Stratification ensures the validation and test sets are representative of the entire dataset, leading to:
More reliable evaluation of model performance.
Fair comparison of models during hyperparameter tuning.
3. Reduce Variance in Results
Without stratification, results may vary significantly with different random splits because the class distributions can vary across subsets.
Stratification minimizes this variance, making the splits more stable and reproducible.

Conclusion
Stratifying y helps maintain class balance across subsets, making the model training and evaluation more consistent and reliable, especially in datasets with imbalanced class distributions.


In [28]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_normalized = scaler.fit_transform(X_train)

# Use the scaler to transform validation and test data
X_val_normalized = scaler.transform(X_val)
X_test_normalized = scaler.transform(X_test)

# Print the shapes of the normalized datasets
print(f"Normalized Training Data Shape: {X_train_normalized.shape}")
print(f"Normalized Validation Data Shape: {X_val_normalized.shape}")
print(f"Normalized Test Data Shape: {X_test_normalized.shape}")


Normalized Training Data Shape: (252, 17)
Normalized Validation Data Shape: (84, 17)
Normalized Test Data Shape: (84, 17)


In [37]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# SVM with linear kernel
svm_linear = SVC(kernel='linear', C=1, random_state=42, probability=True)
svm_linear.fit(X_train_normalized, y_train)
y_val_pred_linear = svm_linear.predict(X_val_normalized)
linear_val_acc = accuracy_score(y_val, y_val_pred_linear)
linear_val_f1 = f1_score(y_val, y_val_pred_linear)
print(f"SVM (Linear Kernel) Validation Accuracy: {linear_val_acc:.4f}, F1 Score: {linear_val_f1:.4f}")

# SVM with rbf kernel
svm_rbf = SVC(C=1, gamma='scale', random_state=42, probability=True)
svm_rbf.fit(X_train_normalized, y_train)
y_val_pred_rbf = svm_rbf.predict(X_val_normalized)
rbf_val_acc = accuracy_score(y_val, y_val_pred_rbf)
rbf_val_f1 = f1_score(y_val, y_val_pred_rbf)
print(f"SVM (RBF Kernel) Validation Accuracy: {rbf_val_acc:.4f}, F1 Score: {rbf_val_f1:.4f}")


SVM (Linear Kernel) Validation Accuracy: 0.6310, F1 Score: 0.6667
SVM (RBF Kernel) Validation Accuracy: 0.8452, F1 Score: 0.8506


In [68]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Initialize the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SVM with linear kernel
svm_linear = SVC(kernel='linear', C=1, probability=True, random_state=42)
linear_acc_scores = []
linear_f1_scores = []

# Cross-validation for linear kernel
for train_idx, val_idx in cv.split(X_train_normalized, y_train):
    X_train_fold, X_val_fold = X_train_normalized[train_idx], X_train_normalized[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    svm_linear.fit(X_train_fold, y_train_fold)
    y_val_pred = svm_linear.predict(X_val_fold)
    linear_acc_scores.append(accuracy_score(y_val_fold, y_val_pred))
    linear_f1_scores.append(f1_score(y_val_fold, y_val_pred))

svm_linear_mean_acc = sum(linear_acc_scores) / len(linear_acc_scores)
svm_linear_mean_f1 = sum(linear_f1_scores) / len (linear_f1_scores)


# SVM with RBF kernel
svm_rbf = SVC(C=1, gamma='scale', probability=True, random_state=42)
rbf_acc_scores = []
rbf_f1_scores = []

# Cross-validation for RBF kernel
for train_idx, val_idx in cv.split(X_train_normalized, y_train):
    X_train_fold, X_val_fold = X_train_normalized[train_idx], X_train_normalized[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    svm_rbf.fit(X_train_fold, y_train_fold)
    y_val_pred = svm_rbf.predict(X_val_fold)
    rbf_acc_scores.append(accuracy_score(y_val_fold, y_val_pred))
    rbf_f1_scores.append(f1_score(y_val_fold, y_val_pred))

svm_rbf_mean_acc = sum (rbf_acc_scores) / len(rbf_acc_scores)
svm_rbf_mean_f1 = sum (rbf_f1_scores) / len (rbf_f1_scores) 

# Print the results
print("SVM (Linear Kernel) Cross-Validation Accuracy Scores:", linear_acc_scores)
print("SVM (Linear Kernel) Mean Accuracy:", svm_linear_mean_acc)
print("SVM (Linear Kernel) Cross-Validation F1 Scores:", linear_f1_scores)
print("SVM (Linear Kernel) Mean F1 Score:", svm_linear_mean_f1)

print("\nSVM (RBF Kernel) Cross-Validation Accuracy Scores:", rbf_acc_scores)
print("SVM (RBF Kernel) Mean Accuracy:", svm_rbf_mean_acc)
print("SVM (RBF Kernel) Cross-Validation F1 Scores:", rbf_f1_scores)
print("SVM (RBF Kernel) Mean F1 Score:", svm_rbf_mean_f1)


SVM (Linear Kernel) Cross-Validation Accuracy Scores: [0.6862745098039216, 0.5882352941176471, 0.66, 0.54, 0.62]
SVM (Linear Kernel) Mean Accuracy: 0.6189019607843138
SVM (Linear Kernel) Cross-Validation F1 Scores: [0.7241379310344828, 0.6181818181818182, 0.7017543859649122, 0.5660377358490566, 0.6415094339622641]
SVM (Linear Kernel) Mean F1 Score: 0.6503242609985067

SVM (RBF Kernel) Cross-Validation Accuracy Scores: [0.8431372549019608, 0.803921568627451, 0.86, 0.74, 0.72]
SVM (RBF Kernel) Mean Accuracy: 0.7934117647058823
SVM (RBF Kernel) Cross-Validation F1 Scores: [0.8571428571428571, 0.8214285714285714, 0.8679245283018868, 0.7636363636363637, 0.7586206896551724]
SVM (RBF Kernel) Mean F1 Score: 0.8137506020329702


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
log_reg.fit(X_train_normalized, y_train)

# Evaluate on the training set
y_train_pred = log_reg.predict(X_train_normalized)
train_acc = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
print(f"Logistic Regression Training Accuracy: {train_acc:.4f}")
print(f"Logistic Regression Training F1 Score: {train_f1:.4f}")

# Evaluate on the validation set
y_val_pred = log_reg.predict(X_val_normalized)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
print(f"Logistic Regression Validation Accuracy: {val_acc:.4f}")
print(f"Logistic Regression Validation F1 Score: {val_f1:.4f}")


Logistic Regression Training Accuracy: 0.6548
Logistic Regression Training F1 Score: 0.6836
Logistic Regression Validation Accuracy: 0.5952
Logistic Regression Validation F1 Score: 0.6304


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

# Initialize model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_reg_acc_scores = []
log_reg_f1_scores = []

for train_idx, val_idx in cv.split(X_train_normalized, y_train):
    X_train_fold, X_val_fold = X_train_normalized[train_idx], X_train_normalized[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    log_reg.fit(X_train_fold, y_train_fold)
    y_val_pred = log_reg.predict(X_val_fold)
    log_reg_acc_scores.append(accuracy_score(y_val_fold, y_val_pred))
    log_reg_f1_scores.append(f1_score(y_val_fold, y_val_pred))

# Print results
print(f"Logistic Regression Cross-Validation Accuracy Scores: {log_reg_acc_scores}")
print(f"Mean Accuracy: {sum(log_reg_acc_scores) / len(log_reg_acc_scores):.4f}")
print(f"Logistic Regression Cross-Validation F1 Scores: {log_reg_f1_scores}")
print(f"Mean F1 Score: {sum(log_reg_f1_scores) / len(log_reg_f1_scores):.4f}")


Logistic Regression Cross-Validation Accuracy Scores: [0.6666666666666666, 0.6274509803921569, 0.62, 0.54, 0.6]
Mean Accuracy: 0.6108
Logistic Regression Cross-Validation F1 Scores: [0.7017543859649122, 0.6545454545454545, 0.6666666666666666, 0.5818181818181818, 0.6153846153846154]
Mean F1 Score: 0.6440


In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# Train the model
dt_model.fit(X_train_normalized, y_train)

# Evaluate on the training set
y_train_pred = dt_model.predict(X_train_normalized)
train_acc = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
print(f"Decision Tree Training Accuracy: {train_acc:.4f}")
print(f"Decision Tree Training F1 Score: {train_f1:.4f}")

# Evaluate on the validation set
y_val_pred = dt_model.predict(X_val_normalized)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
print(f"Decision Tree Validation Accuracy: {val_acc:.4f}")
print(f"Decision Tree Validation F1 Score: {val_f1:.4f}")


Decision Tree Training Accuracy: 0.9841
Decision Tree Training F1 Score: 0.9847
Decision Tree Validation Accuracy: 0.7857
Decision Tree Validation F1 Score: 0.7907


In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define parameter grid
param_grid = {'max_depth': [2, 3, 5, 10, 15, 20, None]}

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform Grid Search
grid_search.fit(X_train_normalized, y_train)

# Best depth and score
print(f"Best max_depth: {grid_search.best_params_['max_depth']}")
print(f"Best Accuracy Score: {grid_search.best_score_}")


Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best max_depth: 10
Best Accuracy Score: 0.7222745098039216


In [59]:
from sklearn.metrics import accuracy_score, f1_score

for depth in [2, 3, 5, 10, 15, 20, None]:
    dt_model = DecisionTreeClassifier(random_state=42, max_depth=depth)
    dt_model.fit(X_train_normalized, y_train)
    y_val_pred = dt_model.predict(X_val_normalized)
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    print(f"Max Depth: {depth}, Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")


Max Depth: 2, Validation Accuracy: 0.6429, F1 Score: 0.6154
Max Depth: 3, Validation Accuracy: 0.6548, F1 Score: 0.5915
Max Depth: 5, Validation Accuracy: 0.7262, F1 Score: 0.7089
Max Depth: 10, Validation Accuracy: 0.7857, F1 Score: 0.7907
Max Depth: 15, Validation Accuracy: 0.7619, F1 Score: 0.7561
Max Depth: 20, Validation Accuracy: 0.7619, F1 Score: 0.7561
Max Depth: None, Validation Accuracy: 0.7619, F1 Score: 0.7561


In [64]:
from sklearn.tree import DecisionTreeClassifier

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)  # Adjust max_depth as needed

# Cross-validation
dt_acc_scores = []
dt_f1_scores = []

for train_idx, val_idx in cv.split(X_train_normalized, y_train):
    X_train_fold, X_val_fold = X_train_normalized[train_idx], X_train_normalized[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    dt_model.fit(X_train_fold, y_train_fold)
    y_val_pred = dt_model.predict(X_val_fold)
    dt_acc_scores.append(accuracy_score(y_val_fold, y_val_pred))
    dt_f1_scores.append(f1_score(y_val_fold, y_val_pred))

# Print results
print(f"Decision Tree Cross-Validation Accuracy Scores: {dt_acc_scores}")
print(f"Mean Accuracy: {sum(dt_acc_scores) / len(dt_acc_scores):.4f}")
print(f"Decision Tree Cross-Validation F1 Scores: {dt_f1_scores}")
print(f"Mean F1 Score: {sum(dt_f1_scores) / len(dt_f1_scores):.4f}")


Decision Tree Cross-Validation Accuracy Scores: [0.6666666666666666, 0.6470588235294118, 0.72, 0.64, 0.66]
Mean Accuracy: 0.6667
Decision Tree Cross-Validation F1 Scores: [0.6222222222222222, 0.64, 0.72, 0.5714285714285714, 0.6792452830188679]
Mean F1 Score: 0.6466


In [62]:
models = {
    "SVM (RBF Kernel)": svm_rbf,
    "Logistic Regression": log_reg,
    "Decision Tree": dt_model
}

for name, model in models.items():
    model.fit(X_train_normalized, y_train)
    y_val_pred = model.predict(X_val_normalized)
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")


SVM (RBF Kernel) Validation Accuracy: 0.8452, F1 Score: 0.8506
Logistic Regression Validation Accuracy: 0.5952, F1 Score: 0.6304
Decision Tree Validation Accuracy: 0.7857, F1 Score: 0.7907


In [69]:
# Results dictionary for easy comparison
model_results = {
    "SVM (Linear Kernel)": {
        "Mean Accuracy": svm_linear_mean_acc,
        "Mean F1 Score": svm_linear_mean_f1
    },
    "SVM (RBF Kernel)": {
        "Mean Accuracy": svm_rbf_mean_acc,
        "Mean F1 Score": svm_rbf_mean_f1
    },
    "Logistic Regression": {
        "Mean Accuracy": sum(log_reg_acc_scores) / len(log_reg_acc_scores),
        "Mean F1 Score": sum(log_reg_f1_scores) / len(log_reg_f1_scores)
    },
    "Decision Tree": {
        "Mean Accuracy": sum(dt_acc_scores) / len(dt_acc_scores),
        "Mean F1 Score": sum(dt_f1_scores) / len(dt_f1_scores)
    }
}

# Print results
print("Model Performance Comparison:")
for model, metrics in model_results.items():
    print(f"\n{model}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")


Model Performance Comparison:

SVM (Linear Kernel):
  Mean Accuracy: 0.6189
  Mean F1 Score: 0.6503

SVM (RBF Kernel):
  Mean Accuracy: 0.7934
  Mean F1 Score: 0.8138

Logistic Regression:
  Mean Accuracy: 0.6108
  Mean F1 Score: 0.6440

Decision Tree:
  Mean Accuracy: 0.6667
  Mean F1 Score: 0.6466


In [74]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Initialize individual models
svm_rbf = SVC(kernel='rbf', C=1, gamma='scale', probability=True, random_state=42)
decision_tree = DecisionTreeClassifier(max_depth=10, random_state=42)

# Train individual models
svm_rbf.fit(X_train_normalized, y_train)
decision_tree.fit(X_train_normalized, y_train)

# Get predictions from individual models
svm_preds = svm_rbf.predict(X_val_normalized)
dt_preds = decision_tree.predict(X_val_normalized)

# Combine predictions into an array
all_preds = np.array([svm_preds, dt_preds])

# Perform majority voting
majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_preds)

# Evaluate the majority voting ensemble
val_acc_majority = accuracy_score(y_val, majority_votes)
val_f1_majority = f1_score(y_val, majority_votes)

print(f"Majority Voting Validation Accuracy: {val_acc_majority:.4f}")
print(f"Majority Voting Validation F1 Score: {val_f1_majority:.4f}")


Majority Voting Validation Accuracy: 0.8214
Majority Voting Validation F1 Score: 0.8101


In [73]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Majority voting for the training set
svm_train_preds = svm_rbf.predict(X_train_normalized)
dt_train_preds = decision_tree.predict(X_train_normalized)

# Combine predictions
all_train_preds = np.array([svm_train_preds, dt_train_preds])

# Perform majority voting for training
majority_votes_train = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_train_preds)

# Evaluate on training data
train_acc_majority = accuracy_score(y_train, majority_votes_train)
train_f1_majority = f1_score(y_train, majority_votes_train)

print(f"Majority Voting Training Accuracy: {train_acc_majority:.4f}")
print(f"Majority Voting Training F1 Score: {train_f1_majority:.4f}")

# Majority voting for the validation set
svm_val_preds = svm_rbf.predict(X_val_normalized)
dt_val_preds = decision_tree.predict(X_val_normalized)

# Combine predictions
all_val_preds = np.array([svm_val_preds, dt_val_preds])

# Perform majority voting for validation
majority_votes_val = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_val_preds)

# Evaluate on validation data
val_acc_majority = accuracy_score(y_val, majority_votes_val)
val_f1_majority = f1_score(y_val, majority_votes_val)

print(f"Majority Voting Validation Accuracy: {val_acc_majority:.4f}")
print(f"Majority Voting Validation F1 Score: {val_f1_majority:.4f}")


Majority Voting Training Accuracy: 0.9683
Majority Voting Training F1 Score: 0.9685
Majority Voting Validation Accuracy: 0.8214
Majority Voting Validation F1 Score: 0.8101


In [78]:
# Train SVM (RBF Kernel)
svm_rbf.fit(X_train_normalized, y_train)

# Train Decision Tree
decision_tree.fit(X_train_normalized, y_train)

# Train Logistic Regression
log_reg.fit(X_train_normalized, y_train)


In [79]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Predictions on the training set
svm_train_preds = svm_rbf.predict(X_train_normalized)
dt_train_preds = decision_tree.predict(X_train_normalized)
lr_train_preds = log_reg.predict(X_train_normalized)

# Combine training predictions into an array
all_train_preds = np.array([svm_train_preds, dt_train_preds, lr_train_preds])

# Perform majority voting for training
majority_votes_train = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_train_preds)

# Evaluate majority voting ensemble on training data
train_acc_majority = accuracy_score(y_train, majority_votes_train)
train_f1_majority = f1_score(y_train, majority_votes_train)

print(f"Majority Voting Training Accuracy: {train_acc_majority:.4f}")
print(f"Majority Voting Training F1 Score: {train_f1_majority:.4f}")

# Predictions on the validation set
svm_val_preds = svm_rbf.predict(X_val_normalized)
dt_val_preds = decision_tree.predict(X_val_normalized)
lr_val_preds = log_reg.predict(X_val_normalized)

# Combine validation predictions into an array
all_val_preds = np.array([svm_val_preds, dt_val_preds, lr_val_preds])

# Perform majority voting for validation
majority_votes_val = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=all_val_preds)

# Evaluate majority voting ensemble on validation data
val_acc_majority = accuracy_score(y_val, majority_votes_val)
val_f1_majority = f1_score(y_val, majority_votes_val)

print(f"Majority Voting Validation Accuracy: {val_acc_majority:.4f}")
print(f"Majority Voting Validation F1 Score: {val_f1_majority:.4f}")


Majority Voting Training Accuracy: 0.9444
Majority Voting Training F1 Score: 0.9474
Majority Voting Validation Accuracy: 0.7976
Majority Voting Validation F1 Score: 0.8046


In [81]:
from sklearn.ensemble import VotingClassifier

# Weighted Voting Classifier
ensemble_weighted = VotingClassifier(
    estimators=[
        ('SVM_RBF', svm_rbf),
        ('Decision_Tree', decision_tree),
        ('Logistic_Regression', log_reg)
    ],
    voting='soft',  # Use probabilities for soft voting
    weights=[0.5, 0.4, 0.1]  # Assign weights based on individual performance
)

# Train the ensemble model
ensemble_weighted.fit(X_train_normalized, y_train)

# Evaluate on the validation set
y_val_pred_weighted = ensemble_weighted.predict(X_val_normalized)
val_acc_weighted = accuracy_score(y_val, y_val_pred_weighted)
val_f1_weighted = f1_score(y_val, y_val_pred_weighted)

print(f"Weighted Voting Validation Accuracy: {val_acc_weighted:.4f}")
print(f"Weighted Voting Validation F1 Score: {val_f1_weighted:.4f}")


Weighted Voting Validation Accuracy: 0.7857
Weighted Voting Validation F1 Score: 0.7955
