In [93]:
import pickle
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler

In [169]:
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# open feature files that were produced by this project.
with open("../../data/extracted_features/mfccs/mfccs_dict_13.pickle", "rb") as file:
    mfccs_dict = pickle.load(file)

# Show the two-level layout of the dictionary: reader id -> chapter id.
for reader, chapters in mfccs_dict.items():
    print(f"reader: {reader}")
    for chapter in chapters:
        print(f"    chapter: {chapter}")

reader: 19
    chapter: 198
    chapter: 227
reader: 26
    chapter: 495
    chapter: 496
reader: 27
    chapter: 123349
    chapter: 124992
reader: 32
    chapter: 21625
    chapter: 21631
    chapter: 21634
    chapter: 4137
reader: 39
    chapter: 121914
    chapter: 121915
    chapter: 121916
reader: 40
    chapter: 121026
    chapter: 222
reader: 60
    chapter: 121082
reader: 78
    chapter: 368
    chapter: 369
reader: 83
    chapter: 11691
    chapter: 3054
    chapter: 9960
reader: 87
    chapter: 121553


In [187]:
def cluster_mfcc(mfcc_list, n_clusters=2):
    """
    Fit a standardiser and a K-means model on the pooled MFCC frames of several speakers.

    The per-speaker arrays are joined along the column axis in list order and
    transposed so that every row is one frame. The frames are standardised
    with a freshly fitted StandardScaler and then clustered with K-means
    (random_state=42).

    Parameters:
        mfcc_list (list of np.ndarray): One array per speaker; the arrays are
            joined with np.hstack, so they must agree in their number of rows
            (presumably the MFCC coefficients, 13 in this notebook).
        n_clusters (int): The number of clusters for K-means.

    Returns:
        tuple: (labels, kmeans, scaler) where
            labels is the fitted model's ``labels_`` array, one entry per
                frame in the same order as the joined input,
            kmeans is the fitted KMeans estimator,
            scaler is the fitted StandardScaler (needed to transform new
                data before calling ``kmeans.predict``).
    """
    # Join all speakers column-wise, then flip so rows are frames and
    # columns are features, which is the layout scikit-learn expects.
    frames = np.hstack(mfcc_list).T

    # Standardise the features on the pooled frames.
    scaler = StandardScaler()
    frames_scaled = scaler.fit_transform(frames)

    # Fit K-means with a fixed seed; fit() returns the estimator itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(frames_scaled)

    return kmeans.labels_, kmeans, scaler

def combine_mfcc(speaker, train_fraction=0.8):
    """
    Concatenate all of one speaker's MFCC arrays and split them into train/test parts.

    Every array stored under every chapter key of ``speaker`` is joined along
    the column axis, in dictionary order. The joined array is then cut once by
    position (no shuffling): the leading ``train_fraction`` of the columns is
    the training part, the remaining columns are the test part.

    Parameters:
        speaker (dict): Maps a chapter key to an iterable of 2-D arrays that
            all have the same number of rows (presumably MFCC matrices of
            shape (n_coefficients, n_frames)).
        train_fraction (float): Share of the columns, between 0 and 1, that
            goes into the training part. Defaults to 0.8.

    Returns:
        tuple of np.ndarray: (train, test), both with the same number of rows
        as the inputs.

    Raises:
        ValueError: If ``train_fraction`` is outside [0, 1] or ``speaker``
            holds no arrays at all.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # Flatten chapter -> arrays into one list, keeping chapter order.
    segments = [
        segment
        for chapter_segments in speaker.values()
        for segment in chapter_segments
    ]
    if not segments:
        # np.hstack would raise ValueError here as well, with a less helpful message.
        raise ValueError("speaker contains no MFCC arrays to combine")

    all_data = np.hstack(segments)
    split_index = int(train_fraction * all_data.shape[1])

    train = all_data[:, :split_index]
    test = all_data[:, split_index:]
    return train, test


def calculate_percentages(labels, mfcc_list, n_clusters=10):
    """
    Print, for every speaker, the share of its frames assigned to each cluster label.

    ``labels`` is read as the concatenation of per-speaker label runs, in the
    same order as ``mfcc_list``: the first ``mfcc_list[0].shape[1]`` entries
    belong to speaker 1, the next ``mfcc_list[1].shape[1]`` to speaker 2, and
    so on. Labels outside ``range(n_clusters)`` are not counted, but they
    still count towards the speaker's total, so the printed percentages can
    sum to less than 100.

    Parameters:
        labels (sequence of int): One cluster label per frame, covering at
            least all frames of all speakers.
        mfcc_list (list of np.ndarray): One array per speaker; only
            ``shape[1]`` (the number of frames) of each array is used.
        n_clusters (int): Number of labels to report (0 .. n_clusters - 1).

    Returns:
        None. The percentages are printed.

    Raises:
        IndexError: If ``labels`` has fewer entries than the speakers have
            frames in total.
    """
    labels = np.asarray(labels)

    # Total samples per speaker
    total_samples = [mfcc.shape[1] for mfcc in mfcc_list]

    if labels.shape[0] < sum(total_samples):
        raise IndexError(
            f"labels has {labels.shape[0]} entries but the speakers have "
            f"{sum(total_samples)} frames in total"
        )

    # Walk through labels with a running offset instead of re-summing the
    # preceding speakers' sizes for every single frame.
    offset = 0
    for speaker_index, n_frames in enumerate(total_samples):
        speaker_labels = labels[offset:offset + n_frames]
        offset += n_frames

        print(f"\nSpeaker {speaker_index + 1} Percentages:")
        for label in range(n_clusters):
            count = int(np.count_nonzero(speaker_labels == label))
            percent = (count / n_frames) * 100 if n_frames > 0 else 0
            print(f"  Percent in Label {label}: {percent:.2f}%")

In [191]:

# Readers used in this experiment; their order defines the "Speaker N"
# numbering printed by calculate_percentages.
SPEAKER_IDS = ["19", "26", "27", "32", "39", "40", "60", "78", "83", "87"]

# Single source of truth for the cluster count, so the fitted model and the
# percentage report cannot drift apart.
N_CLUSTERS = 10

# One (train, test) pair per speaker, in SPEAKER_IDS order.
speaker_splits = [combine_mfcc(mfccs_dict[speaker_id]) for speaker_id in SPEAKER_IDS]

# Keep the numbered names: other cells refer to them directly.
(
    (train_mfcc_1, test_mfcc_1),
    (train_mfcc_2, test_mfcc_2),
    (train_mfcc_3, test_mfcc_3),
    (train_mfcc_4, test_mfcc_4),
    (train_mfcc_5, test_mfcc_5),
    (train_mfcc_6, test_mfcc_6),
    (train_mfcc_7, test_mfcc_7),
    (train_mfcc_8, test_mfcc_8),
    (train_mfcc_9, test_mfcc_9),
    (train_mfcc_10, test_mfcc_10),
) = speaker_splits

print(train_mfcc_1.shape, test_mfcc_1.shape)

# List of combined training MFCCs, one entry per speaker
mfcc_list_train = [train for train, _ in speaker_splits]

labels, kmeans, scaler = cluster_mfcc(mfcc_list_train, n_clusters=N_CLUSTERS)

calculate_percentages(labels, mfcc_list_train, n_clusters=N_CLUSTERS)


(13, 37822) (13, 9456)

Speaker 1 Percentages:
  Percent in Label 0: 3.94%
  Percent in Label 1: 9.55%
  Percent in Label 2: 3.43%
  Percent in Label 3: 14.17%
  Percent in Label 4: 22.19%
  Percent in Label 5: 20.86%
  Percent in Label 6: 2.95%
  Percent in Label 7: 5.78%
  Percent in Label 8: 8.94%
  Percent in Label 9: 8.18%

Speaker 2 Percentages:
  Percent in Label 0: 0.41%
  Percent in Label 1: 8.97%
  Percent in Label 2: 8.57%
  Percent in Label 3: 10.34%
  Percent in Label 4: 26.65%
  Percent in Label 5: 0.80%
  Percent in Label 6: 7.26%
  Percent in Label 7: 10.50%
  Percent in Label 8: 10.74%
  Percent in Label 9: 15.76%

Speaker 3 Percentages:
  Percent in Label 0: 1.51%
  Percent in Label 1: 8.03%
  Percent in Label 2: 8.64%
  Percent in Label 3: 14.87%
  Percent in Label 4: 26.10%
  Percent in Label 5: 2.73%
  Percent in Label 6: 9.96%
  Percent in Label 7: 4.59%
  Percent in Label 8: 9.85%
  Percent in Label 9: 13.71%

Speaker 4 Percentages:
  Percent in Label 0: 8.35%
  

In [193]:
def calculate_label_percentages(labels):
    """
    Return the share of each distinct label in ``labels`` as a percentage.

    Parameters:
        labels (array-like): The labels to summarise.

    Returns:
        dict: Maps every distinct value found in ``labels`` to its share of
        ``len(labels)``, expressed in percent. Values that never occur are
        not included.
    """
    distinct_values, occurrences = np.unique(labels, return_counts=True)
    n_total = len(labels)

    shares = {}
    for value, n_hits in zip(distinct_values, occurrences):
        shares[value] = (n_hits / n_total) * 100
    return shares

# Assign the held-out frames of the second speaker (reader "26") to the
# clusters learned on the training data. The frames go through the same
# fitted scaler first, and are transposed so that each row is one frame.
new_data_normalized = scaler.transform(test_mfcc_2.T)
predicted_labels = kmeans.predict(new_data_normalized)

label_percentages = calculate_label_percentages(predicted_labels)

print("Predicted cluster labels for new data:", predicted_labels)
print("Percentage of each label:")
for label in label_percentages:
    percentage = label_percentages[label]
    print(f"  Label {label}: {percentage:.2f}%")

Predicted cluster labels for new data: [7 7 7 ... 4 4 4]
Percentage of each label:
  Label 0: 0.31%
  Label 1: 8.25%
  Label 2: 9.78%
  Label 3: 12.73%
  Label 4: 28.74%
  Label 5: 1.44%
  Label 6: 7.84%
  Label 7: 6.78%
  Label 8: 9.65%
  Label 9: 14.47%


In [None]:
# NOTE(review): the commented-out prints below use combined_mfcc_N names that
# no visible cell defines; they look like leftovers from an earlier version.
# print(combined_mfcc_1.shape)
# print(combined_mfcc_2.shape)
# print(combined_mfcc_3.shape)
# print(combined_mfcc_4.shape)
# print(combined_mfcc_5.shape)
# print(combined_mfcc_6.shape)
# print(combined_mfcc_7.shape)
# print(combined_mfcc_8.shape)
# print(combined_mfcc_9.shape)
# print(combined_mfcc_10.shape)


# Collect the per-speaker training MFCCs into one list, in speaker order
# (nothing is stacked at this point).
mfcc_list_train = [
    train_mfcc_1,
    train_mfcc_2,
    train_mfcc_3,
    train_mfcc_4,
    train_mfcc_5,
    train_mfcc_6,
    train_mfcc_7,
    train_mfcc_8,
    train_mfcc_9,
    train_mfcc_10,
]
