In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

# Build a bag-of-audio-words (BoAW) representation from a CSV of MFCC vectors.
#
# Pipeline: load CSV -> standardize every column -> learn a k-means codebook
# -> assign each row to its nearest centroid -> one-hot encode -> save as CSV.

# Cohort to process. Change this single value ('HC' or 'MDD') instead of
# commenting lines in and out; the input path, output path and status message
# are all derived from it.
group = 'HC'

# Load the CSV file
csv_file_path = f'concatenated_mfccs_{group}.csv'
mfcc_data = pd.read_csv(csv_file_path)

# Normalize the MFCCs (zero mean / unit variance per column).
# NOTE(review): every column of the CSV is scaled and clustered; if the file
# carries a label or identifier column it is treated as a feature too —
# TODO confirm the CSV schema.
scaler = StandardScaler()
mfcc_data_normalized = scaler.fit_transform(mfcc_data)

# Set the number of clusters (audio words)
num_clusters = 100  # This can be adjusted based on your data and requirements

# Apply k-means clustering to create the audio words vocabulary.
# random_state is fixed so the learned codebook is reproducible across runs.
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, batch_size=100)
kmeans.fit(mfcc_data_normalized)

# Assign each MFCC vector to the nearest cluster center (audio word)
word_assignments = kmeans.predict(mfcc_data_normalized)

# One-hot encode the assigned audio word of every input row.
# NOTE(review): this yields one row per row of the input CSV, not one
# aggregated histogram per audio file; summing rows per recording would need
# a file identifier, which is not used here.
boaw_features = np.zeros((len(mfcc_data), num_clusters))
# Integer-array indexing sets entry (i, word_assignments[i]) for all rows at
# once, replacing the per-row Python loop.
boaw_features[np.arange(len(word_assignments)), word_assignments] = 1

# Convert to DataFrame for easier handling
boaw_df = pd.DataFrame(boaw_features)

# Save the BoAW presentations to a new CSV file.
# NOTE(review): the file name says "1000" although num_clusters is 100, and
# says "with_labels" although no label column is added here; the name is kept
# unchanged so downstream consumers keep finding the file.
boaw_df.to_csv(f'concatenated_{group}_1000_BoAW_MFCC_with_labels.csv', index=False)

print(f"{group}_BoAW presentations generated and saved successfully.")
