In [1]:
import sys
import os
import subprocess

# Locate the per-user site-packages directory of the interpreter running this
# kernel. On Windows it lives at %APPDATA%/Python/PythonXY/site-packages, where
# XY is the interpreter's major/minor version, so the folder name is derived
# from sys.version_info instead of being pinned to one Python release.
# APPDATA is not set on non-Windows hosts; in that case the step is skipped
# rather than crashing inside os.path.join(None, ...).
appdata_dir = os.getenv('APPDATA')
user_site_packages = None
if appdata_dir:
    python_tag = f"Python{sys.version_info.major}{sys.version_info.minor}"
    user_site_packages = os.path.join(appdata_dir, 'Python', python_tag, 'site-packages')

# Add the path to sys.path if it exists and is not already there
if user_site_packages and os.path.exists(user_site_packages) and user_site_packages not in sys.path:
    sys.path.append(user_site_packages)

# Now, run the installation for any packages that might still be missing.
# sys.executable guarantees pip targets the same interpreter as the kernel;
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", "torch", "torchaudio", "transformers[torch]", "scikit-learn", "seaborn", "matplotlib", "tqdm", "pandas", "ipywidgets"])

0

In [2]:
import os
import torch
import torchaudio
import numpy as np
import pickle
from tqdm.notebook import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [3]:
# --- Configuration ---
# Dataset locations (raw strings so the Windows backslashes are kept literally).
TRAIN_PATH = r"E:\Emotion voice recognition\dataset\train_sorted"
TEST_PATH = r"E:\Emotion voice recognition\dataset\test_sorted"

# Pretrained checkpoint identifier passed to the `from_pretrained` loaders.
MODEL_NAME = "facebook/wav2vec2-base"

# Optimisation hyper-parameters for the classifier.
NUM_EPOCHS = 10  # the classifier is fast to train, so more epochs are fine
BATCH_SIZE = 8
LEARNING_RATE = 5e-4  # can be a bit higher for the classifier

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


### 1. Prepare Labels and Datasets

In [4]:
# --- Label Encoding ---
print("Fitting LabelEncoder...")

# Every sub-directory name of a split folder is taken as one class label.
train_labels = [
    entry
    for entry in os.listdir(TRAIN_PATH)
    if os.path.isdir(os.path.join(TRAIN_PATH, entry))
]
test_labels = [
    entry
    for entry in os.listdir(TEST_PATH)
    if os.path.isdir(os.path.join(TEST_PATH, entry))
]

# Union of both splits, de-duplicated and put in a stable order.
all_labels = sorted(set(train_labels).union(test_labels))

# fit() returns the encoder itself, so construction and fitting can be chained.
label_encoder = LabelEncoder().fit(all_labels)
print(f"LabelEncoder fitted on classes: {label_encoder.classes_}")

NUM_LABELS = len(label_encoder.classes_)

# --- Dataset Class ---
class EmotionDataset(Dataset):
    """Audio clips read from a ``root_folder/<label>/<clip>.wav`` folder tree.

    Each item is an ``(input_values, label)`` pair: ``input_values`` is the
    tensor produced by ``processor`` for one clip (batch dimension removed) and
    ``label`` is a ``torch.long`` scalar holding the encoded class id.

    Args:
        root_folder: Directory whose immediate sub-directories are named after
            the labels and contain the audio files.
        processor: Callable invoked as
            ``processor(waveform, sampling_rate=..., return_tensors="pt")``
            whose result exposes ``.input_values``.
        label_encoder: Object whose ``transform`` maps the collected folder
            names to integer ids (raises inside ``transform`` if a folder name
            is unknown to it).
    """

    # Sampling rate every clip is converted to before it reaches the processor.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, root_folder, processor, label_encoder):
        self.audio_paths = []
        self.labels = []
        self.processor = processor
        self.label_encoder = label_encoder

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order identical between runs and machines.
        for label in sorted(os.listdir(root_folder)):
            label_folder = os.path.join(root_folder, label)
            if not os.path.isdir(label_folder):
                continue
            for file in sorted(os.listdir(label_folder)):
                # Case-insensitive match so clips saved as ".WAV" are not
                # silently left out of the dataset.
                if file.lower().endswith(".wav"):
                    self.audio_paths.append(os.path.join(label_folder, file))
                    self.labels.append(label)

        self.encoded_labels = self.label_encoder.transform(self.labels)

    def __len__(self):
        """Return the number of audio files found under ``root_folder``."""
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(input_values, encoded_label_tensor)``."""
        path = self.audio_paths[idx]
        encoded_label = self.encoded_labels[idx]
        waveform, sr = torchaudio.load(path)

        if sr != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(sr, self.TARGET_SAMPLE_RATE)
            waveform = resampler(waveform)

        # torchaudio.load yields a (channels, time) tensor. Averaging over the
        # channel axis gives a 1-D mono signal for any channel count; a plain
        # squeeze() would leave stereo clips 2-D, which the processor would
        # not treat as a single utterance and which could not be padded
        # together with mono clips. For mono input the values are unchanged.
        waveform = waveform.mean(dim=0).numpy()
        input_values = self.processor(
            waveform,
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_values
        return input_values.squeeze(0), torch.tensor(encoded_label, dtype=torch.long)

Fitting LabelEncoder...
LabelEncoder fitted on classes: ['Angry' 'Disgust' 'Fear' 'Happy' 'Neutral' 'Sad']


### 2. Define the Model with a Frozen Base

In [5]:
class Wav2Vec2Classifier(nn.Module):
    """Frozen wav2vec2 encoder followed by a small trainable classification head.

    The encoder loaded from ``MODEL_NAME`` has ``requires_grad=False`` on all
    of its parameters and is always kept in evaluation mode; only
    ``self.classifier`` is meant to be optimised.

    Args:
        num_labels: Number of output classes (size of the final linear layer).
    """

    def __init__(self, num_labels):
        super().__init__()
        # Load the base wav2vec2 model
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(MODEL_NAME)

        # Freeze the base model's parameters
        for param in self.wav2vec2.parameters():
            param.requires_grad = False

        # Define the classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.wav2vec2.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_labels)
        )

    def train(self, mode=True):
        """Set the training mode of the head while keeping the base in eval mode.

        ``nn.Module.train`` recurses into every sub-module, which would also
        enable the training-only layers (e.g. dropout) of the frozen base and
        make the extracted embeddings noisy. The base is therefore put back
        into eval mode after every mode switch.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.wav2vec2.eval()
        return self

    def forward(self, x):
        """Return class logits for a batch of processed inputs ``x``.

        The base output's ``last_hidden_state`` is averaged over dim 1 and the
        result is fed to the classifier head.
        """
        # Pass input through the frozen base model to get embeddings
        # No gradient calculation is needed for this part
        with torch.no_grad():
            outputs = self.wav2vec2(x).last_hidden_state

        # Average the embeddings across the sequence length
        # NOTE(review): the mean runs over all positions, including any that
        # come from batch padding — confirm this is acceptable for the data.
        mean_embeds = outputs.mean(dim=1)

        # Pass the averaged embeddings through the classifier
        logits = self.classifier(mean_embeds)
        return logits

# Build the classifier, then relocate its parameters and buffers to DEVICE.
# nn.Module.to() moves the module in place and returns the same object.
model = Wav2Vec2Classifier(NUM_LABELS)
model = model.to(DEVICE)

# Note: partially trained weights, if any were saved, are loaded in the next step.



### 3. Load Data and (Optional) Partially Trained Weights

In [6]:
# Processor matching the pretrained checkpoint; it is handed to both datasets.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# --- Optional: resume from a partially trained base model ---
# If a checkpoint was saved earlier, uncomment the following lines:
# partially_trained_state_dict = torch.load('path_to_your_saved_model.pth')
# model.wav2vec2.load_state_dict(partially_trained_state_dict)
# print("Successfully loaded partially trained weights into the model's base.")

# Build the train split first, then the test split, with identical settings.
train_dataset, test_dataset = [
    EmotionDataset(split_root, processor, label_encoder)
    for split_root in (TRAIN_PATH, TEST_PATH)
]

def collate_fn(batch):
    """Merge ``(input_values, label)`` samples into one padded batch.

    Args:
        batch: Sequence of ``(input_values, label)`` pairs, where
            ``input_values`` is a 1-D tensor whose length may differ per
            sample and ``label`` is a scalar tensor.

    Returns:
        ``(padded_inputs, labels)``: inputs right-padded to the longest sample
        in the batch with shape ``(batch, max_len)``, and labels stacked into
        a 1-D tensor.
    """
    input_values = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    # Pad with the feature extractor's padding value, i.e. the value meant for
    # audio samples. The tokenizer's pad_token_id is a vocabulary index for
    # text tokens and only matches by coincidence when that index is 0.
    padded_inputs = pad_sequence(
        input_values,
        batch_first=True,
        padding_value=processor.feature_extractor.padding_value,
    )
    labels = torch.stack(labels)
    return padded_inputs, labels

# Training batches are reshuffled each epoch; test batches keep dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

### 4. Train ONLY the Classifier Head

In [7]:
# Only the classifier head's parameters are handed to the optimizer, so the
# base model's weights are never updated.
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    # model.train() switches EVERY sub-module to training mode, including the
    # frozen base. Put the base back into eval mode so its training-only
    # layers (e.g. dropout) stay off and the head sees stable embeddings.
    model.train()
    model.wav2vec2.eval()

    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()

        logits = model(inputs)
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call copies the value to the host.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1} - Average Classifier Loss: {avg_loss:.4f}")

Epoch 1/10:   0%|          | 0/816 [00:00<?, ?it/s]



Epoch 1 - Average Classifier Loss: 1.4693


Epoch 2/10:   0%|          | 0/816 [00:00<?, ?it/s]

Epoch 2 - Average Classifier Loss: 1.3489


Epoch 3/10:   0%|          | 0/816 [00:00<?, ?it/s]

Epoch 3 - Average Classifier Loss: 1.3075


Epoch 4/10:   0%|          | 0/816 [00:00<?, ?it/s]

KeyboardInterrupt: 

### 5. Evaluate the Model

In [None]:
def evaluate_model(loader, model, data_name):
    """Run ``model`` over ``loader`` and show a report and confusion matrix.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches; ``labels`` must
            be CPU tensors because they are converted with ``.numpy()``.
        model: Module mapping a batch of inputs to per-class logits.
        data_name: Human-readable split name used in the progress bar, the
            printed report header and the plot title.

    Side effects:
        Leaves ``model`` in evaluation mode, prints the classification report
        and displays a heatmap of the confusion matrix.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Evaluating on {data_name} data"):
            inputs, labels = batch
            inputs = inputs.to(DEVICE)

            logits = model(inputs)
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            y_pred.extend(preds)
            y_true.extend(labels.numpy())

    y_true_decoded = label_encoder.inverse_transform(y_true)
    y_pred_decoded = label_encoder.inverse_transform(y_pred)
    class_names = label_encoder.classes_

    print(f"--- Classification Report for {data_name} Data ---")
    # labels= pins the reported classes to the encoder's full class list.
    # Without it, a class that appears in neither y_true nor y_pred makes the
    # number of observed classes differ from len(target_names) and
    # classification_report raises a ValueError.
    print(classification_report(
        y_true_decoded,
        y_pred_decoded,
        labels=class_names,
        target_names=class_names,
        zero_division=0,
    ))

    cm = confusion_matrix(y_true_decoded, y_pred_decoded, labels=class_names)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {data_name} Data')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

# Report metrics for the batches in test_loader using the model's current weights.
evaluate_model(test_loader, model, "Test")

### 6. Save the Final Model and Label Encoder

In [None]:
# Persist the weights of the whole model (frozen base + trained classifier)
# together with the fitted label encoder needed to decode its predictions.
print("Saving final model and label encoder...")

final_state = model.state_dict()
torch.save(final_state, "wav2vec2_emotion_classifier_frozen_base.pth")

with open("label_encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Model and encoder saved successfully.")