In [2]:
# load packages
import librosa
import os
import pandas as pd
import numpy

In [3]:
# Function to apply Bark scale
def apply_bark_scale(power_spectrum):
    # Perform Bark scale transformation on the power spectrum
    # You need to implement the specific Bark scale transformation
    # For simplicity, let's use a linear transformation as a placeholder
    bark_scale_spectrum = np.sqrt(power_spectrum)
    return bark_scale_spectrum

# Function for critical-band analysis


def apply_critical_band_analysis(bark_scale_spectrum):
    """
    Apply critical-band analysis to the Bark scale spectrum.

    Parameters:
    - bark_scale_spectrum: numpy array, the input Bark scale spectrum.

    Returns:
    - critical_band_result: numpy array, the result after critical-band analysis.
    """
    omega_values = np.arange(-1.3, 2.6, 0.1)
    critical_band_result = np.zeros_like(bark_scale_spectrum)

    # Apply the critical-band curve to the Bark scale spectrum
    for i in range(len(omega_values) - 1):
        mask = (omega_values[i] <= bark_scale_spectrum) & (bark_scale_spectrum <= omega_values[i+1])
        if omega_values[i] < -0.5:
            critical_band_result[mask] = 10**(2.5 * (bark_scale_spectrum[mask] + 0.5))
        elif -0.5 < omega_values[i] < 0.5:
            critical_band_result[mask] = 1
        elif 0.5 <= omega_values[i] <= 2.5:
            critical_band_result[mask] = 10**(-1.0 * (bark_scale_spectrum[mask] - 0.5))

    return critical_band_result


# Function for equal-loudness preemphasis
def equal_loudness_preemphasis(bark_scale_spectrum):
    # Implement equal-loudness preemphasis
    # You might need to adjust the parameters according to your needs
    # For simplicity, let's use a linear transformation as a placeholder
    return bark_scale_spectrum

# Function for power-law intensity transformation
def power_law_intensity_transformation(bark_scale_spectrum):
    # Apply power-law transformation (y = x^(1/3))
    return bark_scale_spectrum**(1/3)


In [5]:
def rasta_plp_feature_extraction(signal, sr):
    # Perform RASTA-PLP feature extraction
    # Add the RASTA-PLP steps here
    
    # Set frame size to 20 ms
    frame_size_ms = 20
    frame_size_samples = int((frame_size_ms / 1000) * sr)

    # Set hop length to half of the frame size (50% overlap)
    hop_length = frame_size_samples // 2

    # Manually perform framing and windowing
    num_frames = 1 + (len(signal) - frame_size_samples) // hop_length
    frames = np.stack([signal[i * hop_length:i * hop_length + frame_size_samples] * np.hamming(frame_size_samples) for i in range(num_frames)])

    # Continue with the remaining RASTA-PLP steps
    power_spectrum = np.abs(np.fft.fft(frames, axis=0))**2
    bark_scale_spectrum = apply_bark_scale(power_spectrum)
    critical_band_result = apply_critical_band_analysis(bark_scale_spectrum)
    preemphasis_result = equal_loudness_preemphasis(critical_band_result)
    intensity_transformed = power_law_intensity_transformation(preemphasis_result)
    
    # For simplicity, let's use the mean as a summary statistic for each feature
    rasta_plp_features = np.mean(intensity_transformed, axis=1)
    
    return rasta_plp_features






In [None]:
import os
import librosa
import pandas as pd
import numpy as np



# Directory containing the .wav files
directory = "D:/OneDrive - TVS Motor Company Ltd/Desktop/demo/vox_indian/archive/vox1_indian/content/combined_wav/"

# Create an empty DataFrame
columns = ["Names"] + [f"MFCC_{i+1}" for i in range(13)] + ["Chroma", "SpectralContrast", "Tonnetz", "ZeroCrossingRate", "RMSEnergy"] + [f"LogMelFilterbank_{i+1}" for i in range(30)] + ["RASTA-PLP"]
df_combined = pd.DataFrame(columns=columns)

# Iterate through each .wav file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(directory, filename)
        audio, sr = librosa.load(file_path, sr=44100)
        sr = int(sr)
        
        # Calculate the total number of clips
        clip_duration = 5
        clip_samples = int(sr * clip_duration)
        total_clips = len(audio) // clip_samples

        # Create a temporary DataFrame for the current file
        df_temp = pd.DataFrame(columns=columns)
        df_temp["Names"] = [filename.split("_")[0]] * total_clips

        # Split the audio into clips and extract features for each clip
        for i in range(total_clips):
            clip_start = i * clip_samples
            clip_end = (i + 1) * clip_samples
            clip = audio[clip_start:clip_end]

            # Extract features similar to the Arunanshu code
            mfccs = librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=13, hop_length=512, n_fft=2048)
            mfccs_flattened = mfccs.mean(axis=1)

            chroma = librosa.feature.chroma_stft(y=clip, sr=sr)
            contrast = librosa.feature.spectral_contrast(y=clip, sr=sr)
            tonnetz = librosa.feature.tonnetz(y=clip, sr=sr)
            zero_crossings = librosa.feature.zero_crossing_rate(y=clip)
            rms_energy = librosa.feature.rms(y=clip)
            mel_filterbank_energies = librosa.feature.melspectrogram(y=clip, sr=sr, n_mels=30)
            log_mel_filterbank_energies = librosa.power_to_db(mel_filterbank_energies)
            log_mel_filterbank_energies_flattened = log_mel_filterbank_energies.mean(axis=1).tolist()
            rasta_plp_features = rasta_plp_feature_extraction(clip, sr)

            # Concatenate features into a single row
            row_values = [filename.split("_")[0]] + mfccs_flattened.tolist() + [chroma.mean(), contrast.mean(), tonnetz.mean(), zero_crossings.mean(), rms_energy.mean()] + log_mel_filterbank_energies_flattened + [np.mean(rasta_plp_features.tolist())]

            # Append the row to the temporary DataFrame
            df_temp = df_temp._append(pd.Series(row_values, index=df_temp.columns), ignore_index=True)

        # Append the temporary DataFrame to the main DataFrame
        df_combined = pd.concat([df_combined, df_temp], ignore_index=True)

# Display the resulting DataFrame
print(df_combined)


In [10]:
df_combined_clean=df_combined.dropna()

In [11]:
df_combined_clean

Unnamed: 0,Names,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,...,LogMelFilterbank_22,LogMelFilterbank_23,LogMelFilterbank_24,LogMelFilterbank_25,LogMelFilterbank_26,LogMelFilterbank_27,LogMelFilterbank_28,LogMelFilterbank_29,LogMelFilterbank_30,RASTA-PLP
163,id10002,-333.988403,198.076279,-21.218788,31.866308,7.915572,-14.415905,18.010479,-15.305111,-9.232506,...,-32.353176,-40.266621,-59.854279,-60.529491,-60.537827,-60.545662,-60.554153,-60.563507,-60.571526,0.809151
164,id10002,-359.817291,204.077286,-3.572416,29.303961,11.496339,-9.439104,14.578316,-12.501418,-7.912405,...,-36.916328,-45.259090,-61.761429,-61.924465,-61.930775,-61.936615,-61.941788,-61.946014,-61.948929,0.818801
165,id10002,-374.040192,197.717300,-2.335147,27.765970,14.599997,-9.144268,9.745966,-6.746390,0.530692,...,-39.183556,-46.259701,-62.149433,-62.894840,-62.901131,-62.906990,-62.912178,-62.916428,-62.919350,0.878388
166,id10002,-379.510223,216.345810,22.191475,13.962840,9.699755,-6.566788,1.154472,-6.295723,0.043818,...,-45.123650,-51.940281,-63.282825,-63.520550,-63.524811,-63.528793,-63.532310,-63.535183,-63.537155,0.868064
167,id10002,-384.640442,212.123734,22.858784,12.835580,4.571457,-11.933136,-0.317249,-6.872586,-1.533192,...,-44.610836,-53.167858,-63.198948,-63.559238,-63.561558,-63.563625,-63.565418,-63.566868,-63.567867,0.857904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14005,id11209,-337.465973,166.129166,-11.218781,7.758927,8.866342,-10.791190,-4.712016,-15.092561,-9.327009,...,-37.204773,-41.145096,-55.968815,-56.125526,-56.129459,-56.133148,-56.136448,-56.139156,-56.141033,0.756343
14006,id11209,-345.268066,168.236496,-0.471698,9.516051,11.663962,-3.608157,1.494523,-7.250832,1.646722,...,-38.658726,-43.648914,-55.811321,-55.921947,-55.924099,-55.926048,-55.927757,-55.929157,-55.930122,0.844080
14007,id11209,-366.277344,193.106232,-20.849163,15.253913,15.721986,-4.178594,8.161749,-11.672540,0.931113,...,-39.670300,-44.668858,-62.872074,-63.017632,-63.021942,-63.025833,-63.029259,-63.032055,-63.033981,0.914634
14008,id11209,-351.518524,194.097702,-22.035595,9.975117,13.649301,-3.786163,6.877864,-11.939669,0.976078,...,-38.544174,-43.622482,-62.183296,-62.638187,-62.640377,-62.642380,-62.644131,-62.645565,-62.646553,0.916438


In [13]:
df_combined_clean=df_combined_clean.reset_index(drop=True)
df_combined_clean

Unnamed: 0,Names,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,...,LogMelFilterbank_22,LogMelFilterbank_23,LogMelFilterbank_24,LogMelFilterbank_25,LogMelFilterbank_26,LogMelFilterbank_27,LogMelFilterbank_28,LogMelFilterbank_29,LogMelFilterbank_30,RASTA-PLP
0,id10002,-333.988403,198.076279,-21.218788,31.866308,7.915572,-14.415905,18.010479,-15.305111,-9.232506,...,-32.353176,-40.266621,-59.854279,-60.529491,-60.537827,-60.545662,-60.554153,-60.563507,-60.571526,0.809151
1,id10002,-359.817291,204.077286,-3.572416,29.303961,11.496339,-9.439104,14.578316,-12.501418,-7.912405,...,-36.916328,-45.259090,-61.761429,-61.924465,-61.930775,-61.936615,-61.941788,-61.946014,-61.948929,0.818801
2,id10002,-374.040192,197.717300,-2.335147,27.765970,14.599997,-9.144268,9.745966,-6.746390,0.530692,...,-39.183556,-46.259701,-62.149433,-62.894840,-62.901131,-62.906990,-62.912178,-62.916428,-62.919350,0.878388
3,id10002,-379.510223,216.345810,22.191475,13.962840,9.699755,-6.566788,1.154472,-6.295723,0.043818,...,-45.123650,-51.940281,-63.282825,-63.520550,-63.524811,-63.528793,-63.532310,-63.535183,-63.537155,0.868064
4,id10002,-384.640442,212.123734,22.858784,12.835580,4.571457,-11.933136,-0.317249,-6.872586,-1.533192,...,-44.610836,-53.167858,-63.198948,-63.559238,-63.561558,-63.563625,-63.565418,-63.566868,-63.567867,0.857904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7000,id11209,-337.465973,166.129166,-11.218781,7.758927,8.866342,-10.791190,-4.712016,-15.092561,-9.327009,...,-37.204773,-41.145096,-55.968815,-56.125526,-56.129459,-56.133148,-56.136448,-56.139156,-56.141033,0.756343
7001,id11209,-345.268066,168.236496,-0.471698,9.516051,11.663962,-3.608157,1.494523,-7.250832,1.646722,...,-38.658726,-43.648914,-55.811321,-55.921947,-55.924099,-55.926048,-55.927757,-55.929157,-55.930122,0.844080
7002,id11209,-366.277344,193.106232,-20.849163,15.253913,15.721986,-4.178594,8.161749,-11.672540,0.931113,...,-39.670300,-44.668858,-62.872074,-63.017632,-63.021942,-63.025833,-63.029259,-63.032055,-63.033981,0.914634
7003,id11209,-351.518524,194.097702,-22.035595,9.975117,13.649301,-3.786163,6.877864,-11.939669,0.976078,...,-38.544174,-43.622482,-62.183296,-62.638187,-62.640377,-62.642380,-62.644131,-62.645565,-62.646553,0.916438


In [None]:
csv_file_path = 'D:/OneDrive - TVS Motor Company Ltd/Desktop/demo/vox_indian/archive/vox1_indian/content/combined_wav.csv'

# Use to_csv() to create the CSV file
df_combined_clean.to_csv(csv_file_path, index=False)

In [6]:
import pandas as pd

# Specify the path to your CSV file
file_path = 'D:/OneDrive - TVS Motor Company Ltd/Desktop/demo/vox_indian/archive/vox1_indian/content/combined_wav.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame


In [7]:
df

Unnamed: 0,Names,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,...,LogMelFilterbank_22,LogMelFilterbank_23,LogMelFilterbank_24,LogMelFilterbank_25,LogMelFilterbank_26,LogMelFilterbank_27,LogMelFilterbank_28,LogMelFilterbank_29,LogMelFilterbank_30,RASTA-PLP
0,id10002,-333.988403,198.076279,-21.218788,31.866308,7.915572,-14.415905,18.010479,-15.305111,-9.232506,...,-32.353176,-40.266621,-59.854279,-60.529491,-60.537827,-60.545662,-60.554153,-60.563507,-60.571526,0.809151
1,id10002,-359.817291,204.077286,-3.572416,29.303961,11.496339,-9.439104,14.578316,-12.501418,-7.912405,...,-36.916328,-45.259090,-61.761429,-61.924465,-61.930775,-61.936615,-61.941788,-61.946014,-61.948929,0.818801
2,id10002,-374.040192,197.717300,-2.335147,27.765970,14.599997,-9.144268,9.745966,-6.746390,0.530692,...,-39.183556,-46.259701,-62.149433,-62.894840,-62.901131,-62.906990,-62.912178,-62.916428,-62.919350,0.878388
3,id10002,-379.510223,216.345810,22.191475,13.962840,9.699755,-6.566788,1.154472,-6.295723,0.043818,...,-45.123650,-51.940281,-63.282825,-63.520550,-63.524811,-63.528793,-63.532310,-63.535183,-63.537155,0.868064
4,id10002,-384.640442,212.123734,22.858784,12.835580,4.571457,-11.933136,-0.317249,-6.872586,-1.533192,...,-44.610836,-53.167858,-63.198948,-63.559238,-63.561558,-63.563625,-63.565418,-63.566868,-63.567867,0.857904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7000,id11209,-337.465973,166.129166,-11.218781,7.758927,8.866342,-10.791190,-4.712016,-15.092561,-9.327009,...,-37.204773,-41.145096,-55.968815,-56.125526,-56.129459,-56.133148,-56.136448,-56.139156,-56.141033,0.756343
7001,id11209,-345.268066,168.236496,-0.471698,9.516051,11.663962,-3.608157,1.494523,-7.250832,1.646722,...,-38.658726,-43.648914,-55.811321,-55.921947,-55.924099,-55.926048,-55.927757,-55.929157,-55.930122,0.844080
7002,id11209,-366.277344,193.106232,-20.849163,15.253913,15.721986,-4.178594,8.161749,-11.672540,0.931113,...,-39.670300,-44.668858,-62.872074,-63.017632,-63.021942,-63.025833,-63.029259,-63.032055,-63.033981,0.914634
7003,id11209,-351.518524,194.097702,-22.035595,9.975117,13.649301,-3.786163,6.877864,-11.939669,0.976078,...,-38.544174,-43.622482,-62.183296,-62.638187,-62.640377,-62.642380,-62.644131,-62.645565,-62.646553,0.916438


In [8]:
# Import label encoder 
from sklearn import preprocessing 

# label_encoder object knows 
# how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

# Encode labels in column 'species'. 
df['Names']= label_encoder.fit_transform(df['Names']) 

df['Names'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [9]:
X=df.iloc[:,1:]
y=df.iloc[:,0]

In [10]:
from sklearn.model_selection import train_test_split

# Assuming X and y are your feature and target variable DataFrames
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
y_train.value_counts()

Names
14    476
8     435
23    389
6     326
21    312
4     309
3     303
13    274
16    272
12    270
20    252
19    238
15    229
9     209
10    190
18    187
5     155
1     148
2     139
0     132
17    117
11    103
7      94
22     45
Name: count, dtype: int64

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Apply PCA for dimensionality reduction to 2D
n_components = 30
pca = PCA(n_components=n_components)

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train_normalized)

# Transform the test data using the same PCA model
X_test_pca = pca.transform(X_test_normalized)

# Plot the 2D representation for training data
# plt.figure(figsize=(10, 6))
# for label in range(24):  # Assuming four classes
#     plt.scatter(X_train_pca[y_train == label, 0], X_train_pca[y_train == label, 1], label=f'Class {label}')

# plt.title('PCA - Training Data')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.legend()
# plt.show()

# # Plot the 2D representation for test data
# plt.figure(figsize=(10, 6))
# for label in range(24):  # Assuming four classes
#     plt.scatter(X_test_pca[y_test == label, 0], X_test_pca[y_test == label, 1], label=f'Class {label}')

# plt.title('PCA - Test Data')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.legend()
# plt.show()

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}
grid_search = GridSearchCV(SVC(decision_function_shape='ovr', random_state=42), param_grid, cv=5)
grid_search.fit(X_train_pca, y_train)

best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_pca)
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display more detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9350463954318344
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        31
           1       0.96      0.94      0.95        50
           2       0.96      0.93      0.95        46
           3       0.94      0.95      0.94        62
           4       0.95      1.00      0.97        76
           5       0.98      0.86      0.92        51
           6       0.97      0.94      0.95        95
           7       0.83      0.77      0.80        13
           8       0.91      0.97      0.94        93
           9       0.91      0.98      0.94        52
          10       0.98      0.98      0.98        45
          11       1.00      0.74      0.85        31
          12       0.88      0.95      0.91        60
          13       0.98      1.00      0.99        65
          14       0.97      0.94      0.95       129
          15       0.86      0.96      0.91        53
          16       0.94     

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Assuming X_train_pca, X_test_pca, y_train, and y_test are available

# Create a Logistic Regression model
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)

# Train the model on the training data
logreg.fit(X_train_pca, y_train)

# Make predictions on the test data
y_pred = logreg.predict(X_test_pca)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display more detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8258386866523911
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93        31
           1       0.80      0.74      0.77        50
           2       0.83      0.83      0.83        46
           3       0.94      1.00      0.97        62
           4       0.86      0.86      0.86        76
           5       0.76      0.63      0.69        51
           6       0.94      0.93      0.93        95
           7       0.86      0.92      0.89        13
           8       0.78      0.81      0.79        93
           9       0.81      0.85      0.83        52
          10       0.82      0.80      0.81        45
          11       0.91      0.68      0.78        31
          12       0.72      0.83      0.78        60
          13       0.89      0.91      0.90        65
          14       0.78      0.81      0.79       129
          15       0.89      0.89      0.89        53
          16       0.78     

In [15]:
X_train_pca.shape

(5604, 30)

In [16]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
# Assuming X_train_pca, X_test_pca, y_train, and y_test are available

#Create a GMM classifier
param_grid = {'n_components': [4, 8, 12], 'covariance_type': ['full', 'tied', 'diag'], 'tol': [1e-3, 1e-4]}
grid_search = GridSearchCV(GaussianMixture(init_params='k-means++', random_state=42), param_grid, cv=5)
grid_search.fit(X_train_pca, y_train)
best_gmm = grid_search.best_estimator_
# n_components = 4  # Number of components (classes)
# gmm_classifier = GaussianMixture(n_components=n_components, covariance_type='spherical', init_params='k-means++', random_state=42)

# # Fit the model on the training data
# gmm_classifier.fit(X_train_pca)

# Make predictions on the test data
y_pred = best_gmm.predict(X_test_pca)

# # Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# # Display more detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
#best_gmm

Accuracy: 0.03283369022127052
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        31
           1       0.00      0.00      0.00        50
           2       0.14      0.41      0.21        46
           3       0.13      0.34      0.19        62
           4       0.00      0.00      0.00        76
           5       0.00      0.00      0.00        51
           6       0.00      0.00      0.00        95
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        93
           9       0.00      0.00      0.00        52
          10       0.03      0.13      0.05        45
          11       0.00      0.00      0.00        31
          12       0.00      0.00      0.00        60
          13       0.00      0.00      0.00        65
          14       0.00      0.00      0.00       129
          15       0.00      0.00      0.00        53
          16       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

# Assuming you have X_train_pca, X_test_pca, y_train, y_test available

# Function to train GMM for each speaker
def train_gmm_for_speakers(X_train_pca, y_train):
    models = {}
    for speaker_label in y_train.unique():
        # Select data for the current speaker
        X_speaker = X_train_pca[y_train == speaker_label]
        
        # Train Gaussian Mixture Model
        

        # param_grid = {'n_components': [4, 8, 12], 'covariance_type': ['full', 'tied', 'diag'], 'tol': [1e-3, 1e-4]}
        # grid_search = GridSearchCV(GaussianMixture(init_params='k-means++', random_state=42), param_grid, cv=5)
        # grid_search.fit(X_speaker, speaker_label)

        # best_gmm = grid_search.best_estimator_
        gmm = GaussianMixture(n_components=4,init_params='k-means++',random_state=42)  # You can adjust n_components as needed
        gmm.fit(X_speaker)
        
        # Save the model for the current speaker
        models[speaker_label] = gmm
    return models

# Function to predict speaker labels using GMM
def predict_speaker_labels(models, X_test_pca):
    predictions = []
    for _, model in models.items():
        # Calculate log likelihood for each speaker model
        log_likelihood =np.abs (model.score_samples(X_test_pca))
        predictions.append(log_likelihood)
    
    # Assign the speaker label with the maximum log likelihood
    # print(predictions[0][0])
    # print(predictions[1][0])
    # print(predictions[2][0])
    # print(predictions[3][0])
    predicted_labels = np.argmax((predictions), axis=0)
    #print(predicted_labels)
    return predicted_labels

# Train GMM models for each speaker
gmm_models = train_gmm_for_speakers(X_train_pca, y_train)

# Predict speaker labels for test data
predicted_labels = predict_speaker_labels(gmm_models, X_test_pca)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, predicted_labels))

Accuracy: 6.71%
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        31
           1       0.00      0.00      0.00        50
           2       0.00      0.00      0.00        46
           3       0.00      0.00      0.00        62
           4       0.00      0.00      0.00        76
           5       0.00      0.00      0.00        51
           6       0.00      0.00      0.00        95
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        93
           9       0.00      0.00      0.00        52
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00        31
          12       0.00      0.00      0.00        60
          13       0.00      0.00      0.00        65
          14       0.00      0.00      0.00       129
          15       0.00      0.00      0.00        53
          16       0.00      0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.mixture import GaussianMixture
import numpy as np
import pandas as pd
# Assuming you have X_train_pca, X_test_pca, y_train, y_test available
def train_ubm(X_train_pca):
    ubm = GaussianMixture(n_components=4,init_params='random',random_state=42)  # You can adjust n_components as needed
    ubm.fit(X_train_pca)
    return ubm

def adapt_speaker_model(speaker_data, ubm):
    # Bayesian adaptation
    weights = ubm.weights_
    means = ubm.means_
    covariances = ubm.covariances_
    #print(means)

    # Reshape the means array to ensure it is treated as a 1D array
    supervector = means.flatten()

    # Adapt the speaker model
    adapted_model = GaussianMixture(n_components=4,init_params='random',random_state=42)
    adapted_model.weights_ = weights
    adapted_model.means_ = np.array([supervector])  # Use the reshaped supervector
   
    adapted_model.covariances_ = covariances

    # Train the adapted model with speaker data
    adapted_model.fit(speaker_data)
    

    return adapted_model

def predict_speaker_labels(models, X_test_pca):
    predictions = []
    for model in models:
        # Calculate log likelihood for each speaker model
        log_likelihood = np.abs(model.score_samples(X_test_pca))
        predictions.append(log_likelihood)

    # Assign the speaker label with the maximum log likelihood
    predicted_labels = np.argmax(predictions, axis=0)
    return predicted_labels

# Train the UBM
ubm = train_ubm(X_train_pca)

# Adapt individual speaker models
speaker_models = []
for speaker_label in y_train.unique():
    speaker_data = X_train_pca[y_train == speaker_label]
    adapted_model = adapt_speaker_model(speaker_data, ubm)
    speaker_models.append(adapted_model)

# Predict speaker labels for test data
predicted_labels = predict_speaker_labels(speaker_models, X_test_pca)
# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 6.64%


In [19]:
y_test.shape

(1401,)

Best for GMM supervector linear kernel

In [20]:
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import accuracy_score

# Assuming you have X_train_pca, X_test_pca, y_train, y_test available

# Function to train GMM for each speaker
def train_gmm_for_speakers(X_train_pca, y_train):
    models = {}
    for speaker_label in np.unique(y_train):
        # Select data for the current speaker
        X_speaker = X_train_pca[y_train == speaker_label]
        
        # Train Gaussian Mixture Model
        gmm = GaussianMixture(n_components=1, covariance_type='full')  # You can adjust n_components as needed
        gmm.fit(X_speaker)
        
        # Save the model for the current speaker
        models[speaker_label] = gmm
    return models

# Function to compute the supervector linear kernel
def supervector_linear_kernel(models, X_a, X_b):
    kernel_matrix = np.zeros((X_a.shape[0], X_b.shape[0]))
    for label, model in models.items():
        # Extract relevant parameters from the trained GMM
        weights = model.weights_
        means = model.means_
        covariances_inv = np.linalg.inv(model.covariances_[0])  # Assuming one component per GMM
        
        # Compute the kernel matrix using supervector linear kernel
        kernel_matrix += weights[0] * linear_kernel(X_a @ covariances_inv @ means.T, X_b @ covariances_inv @ means.T)
    
    return kernel_matrix

# Train GMM models for each speaker
gmm_models = train_gmm_for_speakers(X_train_pca, y_train)

# Compute the kernel matrix for training and testing data
train_kernel_matrix = supervector_linear_kernel(gmm_models, X_train_pca, X_train_pca)
test_kernel_matrix = supervector_linear_kernel(gmm_models, X_test_pca, X_train_pca)

# Train your classifier (e.g., SVM) using the kernel matrix

# Here, you can use any classifier that supports kernel matrices
# For example, using SVM as a classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='precomputed')
svm_classifier.fit(train_kernel_matrix, y_train)

# Predict speaker labels for test data
predicted_labels = svm_classifier.predict(test_kernel_matrix)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 84.94%


In [24]:
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import accuracy_score

# Assuming you have X_train_pca, X_test_pca, y_train, y_test available

# Function to train GMM for each speaker
def train_gmm_for_speakers(X_train_pca, y_train, n_components=1):
    models = {}
    for speaker_label in np.unique(y_train):
        # Select data for the current speaker
        X_speaker = X_train_pca[y_train == speaker_label]

        # Train Gaussian Mixture Model
        gmm = GaussianMixture(n_components=n_components, covariance_type='full')
        gmm.fit(X_speaker)

        # Save the model for the current speaker
        models[speaker_label] = gmm
    return models

# Function to compute the supervector linear kernel
# Function to compute the supervector linear kernel
def supervector_linear_kernel(models, X_a, X_b):
    kernel_matrix = np.zeros((X_a.shape[0], X_b.shape[0]))
    for label, model in models.items():
        # Extract relevant parameters from the trained GMM
        weights = model.weights_
        means = model.means_
        covariances = model.covariances_

        # Compute the kernel matrix using supervector linear kernel
        for i in range(len(weights)):
            covariances_inv = np.linalg.inv(covariances[i])
            kernel_matrix += weights[i] * np.outer(X_a @ covariances_inv @ means[i].T, X_b @ covariances_inv @ means[i].T)

    return kernel_matrix

# Train GMM models for each speaker
gmm_models = train_gmm_for_speakers(X_train_pca, y_train, n_components=3)  # Adjust n_components as needed

# Compute the kernel matrix for training and testing data
train_kernel_matrix = supervector_linear_kernel(gmm_models, X_train_pca, X_train_pca)
test_kernel_matrix = supervector_linear_kernel(gmm_models, X_test_pca, X_train_pca)

# Train your classifier (e.g., SVM) using the kernel matrix

# Here, you can use any classifier that supports kernel matrices
# For example, using SVM as a classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='precomputed')
svm_classifier.fit(train_kernel_matrix, y_train)

# Predict speaker labels for test data
predicted_labels = svm_classifier.predict(test_kernel_matrix)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
