In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
import pyaudio
import librosa
import time
from IPython.display import clear_output

In [6]:
class TransformerModel(nn.Module):
    """Transformer-encoder classifier producing one sigmoid score per label.

    Data flow: linear projection to ``d_model`` -> add a learned offset ->
    ``nn.TransformerEncoder`` -> mean over the sequence axis -> linear head ->
    sigmoid.

    Args:
        input_dim: Size of the last axis of the input tensor.
        num_labels: Number of scores returned per sample.
        d_model: Embedding width used inside the encoder.
        nhead: Number of attention heads in each encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim, num_labels, d_model=128, nhead=8, num_encoder_layers=2, dim_feedforward=512, dropout=0.3):
        super().__init__()

        # Lifts raw features into the encoder's embedding space.
        self.input_proj = nn.Linear(input_dim, d_model)

        # One learned vector of shape (1, d_model), initialised to zeros.
        # forward() expands it across batch and sequence, so every position
        # receives the SAME offset (it does not vary with position).
        self.positional_encoding = nn.Parameter(torch.zeros(1, d_model))

        # Encoder stack; batch_first=True means tensors are (batch, seq, d_model).
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_encoder_layers)

        # Classification head.
        self.fc_out = nn.Linear(d_model, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of inputs.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``, or
                ``(batch, input_dim)``, which is treated as ``seq_len == 1``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with values in ``(0, 1)``.
        """
        # A 2-D input is a batch of single-step sequences.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        n_batch, n_steps, _ = x.size()

        embedded = self.input_proj(x)

        # Broadcast the learned offset to (batch, seq_len, d_model) and add it.
        offset = self.positional_encoding.unsqueeze(0).expand(n_batch, n_steps, -1)
        embedded = embedded + offset

        encoded = self.transformer_encoder(embedded)

        # Collapse the sequence axis by averaging.
        pooled = encoded.mean(dim=1)

        # Independent per-label probabilities (multi-label setting).
        return self.sigmoid(self.fc_out(pooled))


In [7]:
# Load the model
# input_dim must equal the length of the feature vector fed to the model
# (the output of extract_features); num_labels is the number of output scores.
model = TransformerModel(input_dim=170, num_labels=7)
# map_location="cpu": the checkpoint loads even if it was saved from a CUDA
# device and this machine has no GPU.
# weights_only=True: restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code on load.
state_dict = torch.load('model/transformer_model_2.pth', map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
# Inference mode: disables dropout inside the encoder layers.
model.eval()

TransformerModel(
  (input_proj): Linear(in_features=170, out_features=128, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=7, bias=True)
  (sigmoid): Sigmoid()
)

In [8]:
import librosa.feature

def extract_zcr(audio):
    """Return the zero-crossing rate of ``audio`` averaged across frames.

    Calls ``librosa.feature.zero_crossing_rate`` and averages its result over
    the trailing axis (taken as axis 0 of the transpose).
    """
    per_frame = librosa.feature.zero_crossing_rate(y=audio)
    return per_frame.T.mean(axis=0)

def extract_chroma(audio, sr):
    """Return chroma features averaged across frames.

    Note: despite its name, ``audio`` is forwarded to librosa as ``S`` (a
    spectrogram), not as a waveform; ``extract_features`` passes
    ``np.abs(librosa.stft(data))`` here.

    Args:
        audio: Spectrogram handed to ``librosa.feature.chroma_stft`` as ``S``.
        sr: Sample rate of the underlying signal.
    """
    per_frame = librosa.feature.chroma_stft(S=audio, sr=sr)
    return per_frame.T.mean(axis=0)

def extract_mfccs(audio, sr):
    """Return MFCCs of ``audio`` averaged across frames.

    Args:
        audio: Waveform passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sample rate of ``audio``.
    """
    per_frame = librosa.feature.mfcc(y=audio, sr=sr)
    return per_frame.T.mean(axis=0)

def extract_spectral_contrast(audio, sr):
    """Return spectral contrast of ``audio`` averaged across frames.

    Args:
        audio: Waveform passed to ``librosa.feature.spectral_contrast`` as ``y``.
        sr: Sample rate of ``audio``.
    """
    per_frame = librosa.feature.spectral_contrast(y=audio, sr=sr)
    return per_frame.T.mean(axis=0)

def extract_spectral_rolloff(audio, sr):
    """Return the spectral roll-off of ``audio`` averaged across frames.

    Args:
        audio: Waveform passed to ``librosa.feature.spectral_rolloff`` as ``y``.
        sr: Sample rate of ``audio``.
    """
    per_frame = librosa.feature.spectral_rolloff(y=audio, sr=sr)
    return per_frame.T.mean(axis=0)

def extract_rmse(audio):
    """Return the RMS energy of ``audio`` averaged across frames.

    Calls ``librosa.feature.rms`` and averages its result over the trailing
    axis (taken as axis 0 of the transpose).
    """
    per_frame = librosa.feature.rms(y=audio)
    return per_frame.T.mean(axis=0)

def extract_mel_spectrogram(audio, sr):
    """Return the mel spectrogram of ``audio`` averaged across frames.

    Args:
        audio: Waveform passed to ``librosa.feature.melspectrogram`` as ``y``.
        sr: Sample rate of ``audio``.
    """
    per_frame = librosa.feature.melspectrogram(y=audio, sr=sr)
    return per_frame.T.mean(axis=0)

def extract_features(data, sample_rate):
    """Build one flat feature vector for an audio signal.

    The frame-averaged features are concatenated in this fixed order:
    zero-crossing rate, chroma (from the STFT magnitude), MFCCs, spectral
    contrast, spectral roll-off, RMS energy, mel spectrogram.

    Args:
        data: Audio waveform.
        sample_rate: Sample rate of ``data``.

    Returns:
        1-D ``numpy`` array holding all features back to back.
    """
    pieces = [np.array([])]  # empty seed array, as in the step-by-step stacking

    pieces.append(extract_zcr(data))

    # Chroma is computed from the magnitude spectrogram, not the raw waveform.
    magnitude = np.abs(librosa.stft(data))
    pieces.append(extract_chroma(magnitude, sample_rate))

    pieces.append(extract_mfccs(data, sample_rate))
    pieces.append(extract_spectral_contrast(data, sample_rate))
    pieces.append(extract_spectral_rolloff(data, sample_rate))
    pieces.append(extract_rmse(data))
    pieces.append(extract_mel_spectrogram(data, sample_rate))

    return np.hstack(pieces)

def get_features(path):
    """Load the audio file at ``path`` and return its feature vector.

    The file is read with ``librosa.load`` using that function's default
    arguments, and the result is passed through ``extract_features``.

    Returns:
        ``numpy`` array of features for the file.
    """
    signal, rate = librosa.load(path)
    return np.array(extract_features(signal, rate))

In [32]:
# Set up audio parameters
FORMAT = pyaudio.paFloat32  # samples are read back below as np.float32
CHANNELS = 1
RATE = 22050  # Sample rate
CHUNK = 4096*2  # Number of frames per buffer

p = pyaudio.PyAudio()
stream = None  # stays None if opening the device fails, so cleanup can skip it

try:
    # Initialize the audio stream (inside the try so PyAudio is always
    # terminated, even when the input device cannot be opened).
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("Listening...")

    while True:
        # Read audio data from the microphone. Feature extraction and the
        # model call can take longer than one buffer, so tolerate input
        # overflow (dropped frames) instead of raising and ending the loop.
        raw = stream.read(CHUNK, exception_on_overflow=False)
        data = np.frombuffer(raw, dtype=np.float32)

        # Extract the full feature vector for this chunk
        features = extract_features(data, RATE)

        # Prepare features for the model
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

        # Get model predictions
        with torch.no_grad():
            probabilities = model(features_tensor)

        # Convert to percentages, one value per label
        percentages = probabilities.numpy().flatten() * 100

        # Display the result; "\r" rewrites the same line on every iteration
        print("Predicted Probabilities: ",
              *["{:>7.2f}%".format(value) for value in percentages],
              end="\r")

except KeyboardInterrupt:
    # Leading newline keeps the last probability line intact instead of
    # partially overwriting it.
    print("\nStopping...")
finally:
    # Close the stream
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Labels (presumably in model output order -- confirm against training code):
# "angry" "disgust" "fear" "happy" "neutral+calm" "sad" "surprised"

Listening...
Stopping...robabilities:     4.94%   20.77%    1.71%    0.72%    1.80%    3.12%   70.04%
