In [100]:
import pickle
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.metrics import confusion_matrix, f1_score

In [93]:
with open("../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_v2.pickle", "rb") as file:
    mfcc_stats_dict = pickle.load(file)

for reader in mfcc_stats_dict.keys():
    print(f"reader: {reader} | # samples {len(mfcc_stats_dict[reader])}")
    # for mfcc, file_id in mfcc_stats_dict[reader]:
    #     print(f"\t 1st 2 mfcc: {mfcc} | # features: {mfcc.shape[0]} | file ID: {file_id}")

reader: 19 | # samples 23
reader: 201 | # samples 79
reader: 26 | # samples 50
reader: 27 | # samples 60
reader: 311 | # samples 105
reader: 40 | # samples 36
reader: 87 | # samples 100


In [113]:
def get_lowest_data(mfcc_dict):
    number_of_samples = []
    for reader in mfcc_dict.keys():
        number_of_samples.append(len(mfcc_dict[reader]))
    
    return min(number_of_samples)

def partition_data(reader, max, partition_percent=.80):
    mfccs = [mfcc for mfcc, _ in reader]
    mfccs = mfccs[0:max]

    partition_index = round(len(mfccs) * partition_percent)
    
    train = mfccs[0:partition_index]
    test = mfccs[partition_index: len(mfccs)]

    return train, test

def separate_labels(labels, speaker_keys):
    separated_labels = []
    current_index = 0
    for key in speaker_keys:
        separated_labels.append(labels[current_index:current_index+key])
        current_index += key
    return(separated_labels)

def calculate_percentage(data):
    percentages = []
    for sublist in data:
        total_count = len(sublist)
        if total_count == 0:
            percentages.append({})
            continue
        
        count_dict = {}
        for num in sublist:
            count_dict[num] = count_dict.get(num, 0) + 1
        
        percentage_dict = {num: (count / total_count) * 100 for num, count in count_dict.items()}
        percentages.append(percentage_dict)
    
    return percentages

def format_percentages(percentages):
    for i, percentage_dict in enumerate(percentages):
        if not percentage_dict:
            print(f"Sublist {i + 1}: No data")
            continue
        
        # Sort by percentage in descending order
        sorted_percentages = sorted(percentage_dict.items(), key=lambda x: x[1], reverse=True)
        
        # Format and print each sublist
        formatted_str = f"Speaker {i + 1}:\n\t" + ", ".join(
            f"{num}: {percent:.2f}%" for num, percent in sorted_percentages
        )
        print(formatted_str)

def create_truth_list(samples, label_order):
    """
    label_order: 
        0 for 1st half 0 and 2nd half 1
        1 for 1st half 1 and 2nd half 0
    """
    if label_order == 0:
        first_half  = [0 for _ in range(0, samples//2)]
        second_half = [1 for _ in range(0, samples//2)]        
    elif label_order == 1:
        first_half  = [1 for _ in range(0, samples//2)]
        second_half = [0 for _ in range(0, samples//2)]
    else:
        print("use 0 or 1")
    
    first_half.extend(second_half)
    return first_half


In [None]:
# mfccs_train_1 = [mfcc for mfcc, _ in mfcc_stats_dict["19"]]

# mfccs_train_1 = [mfcc for mfcc, _ in mfcc_stats_dict["26"]]
mfccs_train_2 = [mfcc for mfcc, _ in mfcc_stats_dict["201"]]
mfccs_train_3 = [mfcc for mfcc, _ in mfcc_stats_dict["311"]]

training_label_key = [len(mfccs_train_2), len(mfccs_train_3)]

# print(mfccs_train_1[0].shape)

# stacked = np.vstack([mfccs_train_1, mfccs_train_2, mfccs_train_3])
stacked = np.vstack([mfccs_train_2, mfccs_train_3])
print(stacked.shape)

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(stacked)
labels = kmeans.labels_
separated_labels_train = separate_labels(labels, training_label_key)
format_percentages(calculate_percentage(separated_labels_train))
# print(labels)

In [None]:
####################
# different gender # 
# 87 woman 201 man #
####################

max_number_of_sample_87_201 = 79   # for 87 and 201
print(f"max number of sampls 87 & 201: {max_number_of_sample_87_201}")

mfcc_train_87, mfcc_test_87 = partition_data(mfcc_stats_dict["87"], max_number_of_sample_87_201, partition_percent=.8)
mfcc_train_201, mfcc_test_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_87_201, partition_percent=.8)

training_label_key_87_201 = [len(mfcc_train_87), len(mfcc_train_201)]

training_87_201 = np.vstack([mfcc_train_87, mfcc_train_201])

kmeans_87_201 = KMeans(n_clusters=2, random_state=42)
kmeans_87_201.fit(training_87_201)
labels_87_201 = kmeans_87_201.labels_

separated_labels_train_87_201 = separate_labels(labels_87_201, training_label_key_87_201)
format_percentages(calculate_percentage(separated_labels_train_87_201))

###################
#   same gender   #
# 201 man 311 man #
###################

max_number_of_sample_201_311 = 79 # for 201 and 311 
print(f"\nmax number of sampls 201 & 311 : {max_number_of_sample_201_311}")

mfcc_train_201, mfcc_test_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_201_311, partition_percent=.8)
mfcc_train_311, mfcc_test_311 = partition_data(mfcc_stats_dict["311"], max_number_of_sample_201_311, partition_percent=.8)

training_label_key_201_311 = [len(mfcc_train_201), len(mfcc_train_311)]

training_201_311 = np.vstack([mfcc_train_201, mfcc_train_311])

kmeans_201_311 = KMeans(n_clusters=2, random_state=42)
kmeans_201_311.fit(training_201_311)
labels_201_311 = kmeans_201_311.labels_

separated_labels_train_201_311 = separate_labels(labels_201_311, training_label_key_201_311)
format_percentages(calculate_percentage(separated_labels_train_201_311))

max number of sampls 87 & 201: 79
Speaker 1:
	0: 95.24%, 1: 4.76%
Speaker 2:
	1: 80.95%, 0: 19.05%

max number of sampls 201 & 311 : 79
Speaker 1:
	1: 50.79%, 0: 49.21%
Speaker 2:
	1: 77.78%, 0: 22.22%


In [119]:
testing_label_key_87_201 = [len(mfcc_test_87), len(mfcc_test_201)]
testing_87_201 = np.vstack([mfcc_test_87, mfcc_test_201])

labels_prediction_87_201 = kmeans_87_201.predict(testing_87_201)

separated_labels_test_87_201 = separate_labels(labels_prediction_87_201, testing_label_key_87_201)
print("87 and 201")
format_percentages(calculate_percentage(separated_labels_test_87_201))


testing_label_key_201_311 = [len(mfcc_test_201), len(mfcc_test_311)]
testing_201_311 = np.vstack([mfcc_test_201, mfcc_test_311])

labels_prediction_201_311 = kmeans_201_311.predict(testing_201_311)

separated_labels_test_201_311 = separate_labels(labels_prediction_201_311, testing_label_key_201_311)
print("\n201 and 311")
format_percentages(calculate_percentage(separated_labels_test_201_311))


87 and 201
Speaker 1:
	0: 100.00%
Speaker 2:
	1: 68.75%, 0: 31.25%

201 and 311
Speaker 1:
	0: 62.50%, 1: 37.50%
Speaker 2:
	1: 87.50%, 0: 12.50%


In [121]:
ground_truth_87_201 = create_truth_list(len(labels_prediction_87_201), 0)

tn_87_201, fp_87_201, fn_87_201, tp_87_201 = confusion_matrix(ground_truth_87_201, labels_prediction_87_201).ravel()
f1_87_201 = f1_score(ground_truth_87_201, labels_prediction_87_201)

print("87 and 201")
print("True Positives (TP):", tp_87_201)
print("True Negatives (TN):", tn_87_201)
print("False Positives (FP):", fp_87_201)
print("False Negatives (FN):", fn_87_201)

print(f"F1-Score: {f1_87_201}")

ground_truth_201_311 = create_truth_list(len(labels_prediction_201_311), 0)

tn_201_311, fp_201_311, fn_201_311, tp_201_311 = confusion_matrix(ground_truth_201_311, labels_prediction_201_311).ravel()
f1_201_311 = f1_score(ground_truth_201_311, labels_prediction_201_311)

print("\n201 and 311")
print("True Positives (TP):", tp_201_311)
print("True Negatives (TN):", tn_201_311)
print("False Positives (FP):", fp_201_311)
print("False Negatives (FN):", fn_201_311)

print(f"F1-Score: {f1_201_311}")


87 and 201
True Positives (TP): 11
True Negatives (TN): 16
False Positives (FP): 0
False Negatives (FN): 5
F1-Score: 0.8148148148148148

201 and 311
True Positives (TP): 14
True Negatives (TN): 10
False Positives (FP): 6
False Negatives (FN): 2
F1-Score: 0.7777777777777778


### OLD (garb)

In [346]:
with open("../../data/extracted_features/mfccs_stats/mfcc_stats.pickle", "rb") as file:
    mfccs_dict = pickle.load(file)

for reader in mfccs_dict.keys():
    print(f"reader: {reader}")
    for chapter in mfccs_dict[reader].keys():
        print(f"    chapter: {chapter}")

reader: 19
    chapter: 198
    chapter: 227
reader: 26
    chapter: 495
    chapter: 496
reader: 27
    chapter: 123349
    chapter: 124992
reader: 32
    chapter: 21625
    chapter: 21631
    chapter: 21634
    chapter: 4137
reader: 39
    chapter: 121914
    chapter: 121915
    chapter: 121916
reader: 40
    chapter: 121026
    chapter: 222
reader: 60
    chapter: 121082
reader: 78
    chapter: 368
    chapter: 369
reader: 83
    chapter: 11691
    chapter: 3054
    chapter: 9960
reader: 87
    chapter: 121553


In [416]:
def partition_data(speaker, partition_percent=.80):
    chapters = []
    for chapter in speaker.keys():
        chapters.extend(speaker[chapter])

    # chapters = chapters[0:max]

    partition_index = round(len(chapters) * partition_percent)
    train = chapters[0:partition_index] 
    test =  chapters[partition_index : len(chapters)]
    
    return train, test

def get_lowest_data(speakers):
    sample_numbers = []
    for speaker in speakers:
        data_points = 0
        for chapter in speaker.keys():
            data_points += len(speaker[chapter])
        sample_numbers.append(data_points)
    return min(sample_numbers)

def shorten_lists(test_samples):
    shorten_to = min([len(sample) for sample in test_samples])
    return [sample[0:shorten_to] for sample in test_samples], shorten_to

def separate_labels(labels, speaker_keys):
    separated_labels = []
    current_index = 0
    for key in speaker_keys:
        separated_labels.append(labels[current_index:current_index+key])
        current_index += key
    return(separated_labels)

def calculate_percentage(data):
    percentages = []
    for sublist in data:
        total_count = len(sublist)
        if total_count == 0:
            percentages.append({})
            continue
        
        count_dict = {}
        for num in sublist:
            count_dict[num] = count_dict.get(num, 0) + 1
        
        percentage_dict = {num: (count / total_count) * 100 for num, count in count_dict.items()}
        percentages.append(percentage_dict)
    
    return percentages

def format_percentages(percentages):
    for i, percentage_dict in enumerate(percentages):
        if not percentage_dict:
            print(f"Sublist {i + 1}: No data")
            continue
        
        # Sort by percentage in descending order
        sorted_percentages = sorted(percentage_dict.items(), key=lambda x: x[1], reverse=True)
        
        # Format and print each sublist
        formatted_str = f"Speaker {i + 1}:\n\t" + ", ".join(
            f"{num}: {percent:.2f}%" for num, percent in sorted_percentages
        )
        print(formatted_str)

In [417]:
# min_data_points = get_lowest_data(mfccs_dict["19"],mfccs_dict["26"], mfccs_dict["27"], mfccs_dict["32"],
#                                   mfccs_dict["39"], mfccs_dict["40"], mfccs_dict["60"], mfccs_dict["78"], 
#                                   mfccs_dict["83"], mfccs_dict["87"])

train_mfcc_1, test_mfcc_1 = partition_data(mfccs_dict["19"])
train_mfcc_2, test_mfcc_2 = partition_data(mfccs_dict["26"])
train_mfcc_3, test_mfcc_3 = partition_data(mfccs_dict["27"])
train_mfcc_4, test_mfcc_4 = partition_data(mfccs_dict["32"])
train_mfcc_5, test_mfcc_5 = partition_data(mfccs_dict["39"])
train_mfcc_6, test_mfcc_6 = partition_data(mfccs_dict["40"])
train_mfcc_7, test_mfcc_7 = partition_data(mfccs_dict["60"])
train_mfcc_8, test_mfcc_8 = partition_data(mfccs_dict["78"])
train_mfcc_9, test_mfcc_9 = partition_data(mfccs_dict["83"])
train_mfcc_10, test_mfcc_10 = partition_data(mfccs_dict["87"])

training_speaker_key = [len(train_mfcc_1), len(train_mfcc_2), len(train_mfcc_3), len(train_mfcc_4), len(train_mfcc_5), len(train_mfcc_6),
                        len(train_mfcc_7), len(train_mfcc_8), len(train_mfcc_9), len(train_mfcc_10)]

# print(training_speaker_key)

training = np.vstack([train_mfcc_1, train_mfcc_2, train_mfcc_3, train_mfcc_4, train_mfcc_5, train_mfcc_6,
                     train_mfcc_6, train_mfcc_8, train_mfcc_9, train_mfcc_10])


kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(training)
labels = kmeans.labels_

separated_labels_train = separate_labels(labels, training_speaker_key)

separated_labels_check = [len(sep_list) for sep_list in separated_labels_train]
# print(separated_labels_check)


format_percentages(calculate_percentage(separated_labels_train))

Speaker 1:
	8: 100.00%
Speaker 2:
	2: 96.81%, 4: 3.19%
Speaker 3:
	4: 96.36%, 0: 2.73%, 6: 0.91%
Speaker 4:
	6: 61.70%, 0: 37.23%, 3: 1.06%
Speaker 5:
	0: 51.02%, 6: 48.98%
Speaker 6:
	1: 50.55%, 5: 26.37%, 7: 23.08%
Speaker 7:
	1: 42.31%, 5: 30.77%, 7: 26.92%
Speaker 8:
	4: 81.91%, 1: 13.83%, 6: 4.26%
Speaker 9:
	9: 85.71%, 4: 11.22%, 0: 1.02%, 6: 1.02%, 7: 1.02%
Speaker 10:
	3: 83.72%, 9: 11.63%, 5: 2.33%, 7: 1.16%, 6: 1.16%


In [418]:
test_samples, shorten_to = shorten_lists([test_mfcc_1, test_mfcc_2, test_mfcc_3, test_mfcc_4, test_mfcc_5, test_mfcc_6, test_mfcc_7,
                                          test_mfcc_8, test_mfcc_9, test_mfcc_10])

testing = np.vstack(test_samples)
predicted_labels = kmeans.predict(testing)

testing_speaker_key = [shorten_to for _ in range(0,len(test_samples))]
separated_labels_test = separate_labels(predicted_labels, testing_speaker_key)

format_percentages(calculate_percentage(separated_labels_test))

Speaker 1:
	8: 100.00%
Speaker 2:
	2: 94.74%, 4: 5.26%
Speaker 3:
	4: 89.47%, 6: 10.53%
Speaker 4:
	0: 100.00%
Speaker 5:
	0: 94.74%, 6: 5.26%
Speaker 6:
	1: 100.00%
Speaker 7:
	4: 89.47%, 7: 5.26%, 5: 5.26%
Speaker 8:
	4: 89.47%, 6: 10.53%
Speaker 9:
	9: 89.47%, 5: 5.26%, 7: 5.26%
Speaker 10:
	3: 100.00%
