<a href="https://colab.research.google.com/github/Baias-Antonio/My_work/blob/main/testing_HibridModel_newdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
import os

import torch
import torch.nn as nn
import torchaudio
from google.colab import drive
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import (
    HubertConfig,
    HubertForSequenceClassification,
    HubertModel,
    HubertPreTrainedModel,
    Wav2Vec2Processor,
)
# Mount Google Drive at /content/drive so the checkpoint and audio files
# referenced by the '/content/drive/My Drive/...' paths below are readable.
drive.mount('/content/drive')
# Define the LSTM model
class LSTMModel(nn.Module):
    """Bidirectional LSTM followed by a linear layer over mean-pooled outputs.

    Args:
        input_size: Feature dimension of each time step fed to the LSTM.
        hidden_size: Hidden size of the LSTM in each direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward outputs are concatenated, hence the factor 2.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Return logits for a batch-first sequence tensor ``x``."""
        sequence_features, _state = self.lstm(x)
        pooled = sequence_features.mean(dim=1)  # average over the time axis
        return self.fc(pooled)

# Define the unlabeled dataset class
class UnlabeledAudioDataset(Dataset):
    """Dataset of unlabeled audio files prepared as fixed-length model inputs.

    Each item is loaded with torchaudio, down-mixed to mono, resampled to
    ``target_sample_rate`` and passed through ``processor``, which pads or
    truncates it to ``max_duration_s`` seconds.

    Args:
        file_paths: Sequence of audio file paths.
        processor: Callable feature extractor (a ``Wav2Vec2Processor`` in this
            file) whose result exposes an ``input_values`` tensor.
        target_sample_rate: Sampling rate in Hz the audio is resampled to and
            that is reported to ``processor``. Defaults to 16000.
        max_duration_s: Clip length in seconds used for padding/truncation.
            Defaults to 6.
    """

    def __init__(self, file_paths, processor, target_sample_rate=16000, max_duration_s=6):
        self.file_paths = file_paths
        self.processor = processor
        self.target_sample_rate = target_sample_rate
        self.max_duration_s = max_duration_s

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return the processed ``input_values`` tensor for item ``idx``.

        Returns ``None`` when the file contains no samples, so callers must
        skip ``None`` items before feeding them to a model.
        """
        file_path = self.file_paths[idx]
        waveform, sample_rate = torchaudio.load(file_path, normalize=True)

        # Empty clip: nothing to process, so bail out before any further work.
        if waveform.shape[1] == 0:
            return None

        # Down-mix multi-channel audio to a single (mono) channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Only build a resampler when the file is not already at the target rate.
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            waveform = resampler(waveform)

        # Pad or truncate every clip to the same number of samples.
        max_length = int(self.max_duration_s * self.target_sample_rate)
        processed = self.processor(
            waveform[0].numpy(),
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
            sampling_rate=self.target_sample_rate,
        )
        # Drop the batch dimension added by return_tensors="pt".
        return processed.input_values.squeeze(0)

# Path on the mounted Drive to the saved weights (.pth); the file is read with
# torch.load and applied to the model via load_state_dict further down.
model_load_path = '/content/drive/My Drive/best_modeltransformerlstm0608.pth'
class HubertWithLSTMForSpeechClassification(HubertPreTrainedModel):
    """HuBERT encoder whose last hidden states are classified by ``LSTMModel``.

    Args:
        config: Hubert configuration; ``config.hidden_size`` is used as the
            input size of the LSTM head.
        lstm_hidden_size: Hidden size per direction of the LSTM head.
        num_classes: Number of logits produced by the head.
    """

    def __init__(self, config, lstm_hidden_size=256, num_classes=8):
        super().__init__(config)
        # Attribute names are part of the checkpoint's state_dict keys
        # ("hubert.*", "lstm_model.*") and must not be renamed.
        self.hubert = HubertModel(config)
        self.lstm_model = LSTMModel(
            input_size=config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=1,
            num_classes=num_classes,
        )
        self.init_weights()

    def forward(self, x):
        """Run ``x`` through the encoder and return the LSTM head's logits."""
        hidden_states = self.hubert(x).last_hidden_state
        return self.lstm_model(hidden_states)

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained checkpoint that supplies the config, the processor and the
# initial encoder weights.
pretrained_model_name = "facebook/hubert-large-ls960-ft"

# NOTE(review): config and processor are loaded from the same pre-trained
# checkpoint; this is assumed to match what was used when the saved weights
# were produced -- confirm against the training notebook.
config = HubertConfig.from_pretrained(pretrained_model_name)
processor = Wav2Vec2Processor.from_pretrained(pretrained_model_name)

# Build the model once, then overwrite its weights with the saved state dict.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
loaded_model = HubertWithLSTMForSpeechClassification.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name,
    config=config,
)
loaded_model.load_state_dict(torch.load(model_load_path, map_location=device))
loaded_model.to(device)
loaded_model.eval()

# Alias so any code referring to this name keeps working without
# instantiating (and downloading) the large model a second time.
fine_tuned_model_with_lstm = loaded_model

# Define the directory containing unlabeled audio files
unlabeled_audio_dir = '/content/drive/My Drive/Audio_Speech_Actors_01-24/Actor_07'

# Get paths to unlabeled audio files; sorted because os.listdir returns
# entries in arbitrary order and predictions are reported positionally.
unlabeled_audio_paths = sorted(
    os.path.join(unlabeled_audio_dir, file)
    for file in os.listdir(unlabeled_audio_dir)
    if file.endswith(".wav")
)

# Create dataset for unlabeled data
unlabeled_dataset = UnlabeledAudioDataset(unlabeled_audio_paths, processor)

# Make predictions on unlabeled data. predicted_paths[i] is the file that
# produced predictions[i]; the two can be shorter than unlabeled_audio_paths
# when empty files are skipped.
predictions = []
predicted_paths = []
with torch.no_grad():
    progress = tqdm(unlabeled_dataset, desc='Making Predictions on Unlabeled Data')
    for file_path, inputs in zip(unlabeled_audio_paths, progress):
        if inputs is None:
            # The dataset yields None for audio files without samples.
            continue
        # Add the batch dimension and move the input to the model's device.
        outputs = loaded_model(inputs.unsqueeze(0).to(device))
        _, predicted_label = torch.max(outputs, 1)
        predictions.append(predicted_label.item())
        predicted_paths.append(file_path)

# Print or save predictions
print(predictions)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of HubertWithLSTMForSpeechClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lstm_model.fc.bias', 'lstm_model.fc.weight', 'lstm_model.lstm.bias_hh_l0', 'lstm_model.lstm.bias_hh_l0_reverse', 'lstm_model.lstm.bias_ih_l0', 'lstm_model.lstm.bias_ih_l0_reverse', 'lstm_model.lstm.weight_hh_l0', 'lstm_model.lstm.weight_hh_l0_reverse', 'lstm_model.lstm.weight_ih_l0', 'lstm_model.lstm.weight_ih_l0_reverse']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of HubertWithLSTMForSpeechClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_c

[7, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 6, 4, 1, 2, 3, 3, 3, 2, 3, 3, 3, 4, 5, 5, 5, 4, 2, 4, 7, 4, 4, 7, 4, 5, 4, 7, 6, 7, 6, 7, 7, 6, 6, 7, 6, 6, 7, 6, 6, 7, 7]



