In [None]:
from speech.model import *

import torch

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
# map_location remaps the checkpoint tensors onto `device`, so weights that
# were saved from a CUDA session can still be restored on a CPU-only machine.
model1 = CNN_1().to(device)
model1.load_state_dict(torch.load('outputs/model_CNN_2_1.t', map_location=device))
model1.eval()  # inference mode: disables dropout / batch-norm updates
model2 = CNN_2().to(device)
model2.load_state_dict(torch.load('outputs/model_CNN_2_2.t', map_location=device))
model2.eval()

In [3]:
from IPython.display import Audio

# Render an inline audio player for a sample clip; the path is relative to
# the notebook's working directory. As the last expression of the cell, the
# Audio object is what gets displayed.
# NOTE(review): this previews 3.mp3, while the transcription cells below read
# 1.mp3 — confirm which clip is intended.
Audio("../data/speech/3.mp3")

In [None]:
from faster_whisper import WhisperModel
import torch

# model_size = "large-v3"
model_size = "medium.en"

# Pick the execution backend from what is actually available instead of
# hard-coding CUDA, which raises on CPU-only hosts:
#   GPU -> FP16 weights, CPU -> INT8 quantised weights.
# ("int8_float16" is a further option on GPU when memory is tight.)
if torch.cuda.is_available():
    whisper_device, whisper_compute_type = "cuda", "float16"
else:
    whisper_device, whisper_compute_type = "cpu", "int8"

# The first instantiation downloads the model files (config, tokenizer,
# vocabulary, weights), which is why progress bars appear under this cell.
model = WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)

segments, info = model.transcribe("../data/speech/1.mp3", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# Each segment carries its start/end time in seconds and the decoded text.
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

config.json:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/422k [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

In [1]:
# Transcribe the clip again with the Whisper `model` created in the previous
# cell (this cell does not build the model itself, so that cell must run first).
segments, info = model.transcribe(
    "../data/speech/1.mp3",
    beam_size=5,
)

# Report the language detected for the clip.
language_report = "Detected language '%s' with probability %f" % (
    info.language,
    info.language_probability,
)
print(language_report)

# One output line per segment: "[start -> end] text", times in seconds.
for segment in segments:
    timestamped_line = "[%.2fs -> %.2fs] %s" % (
        segment.start,
        segment.end,
        segment.text,
    )
    print(timestamped_line)

In [None]:
import torch
import pyaudio
import numpy as np
import librosa

# ---- Microphone capture settings -------------------------------------------
SAMPLE_RATE = 22050                        # samples per second (Hz)
DURATION = 1                               # seconds of audio handed to each callback
CHANNELS = 1                               # single input channel
FORMAT = pyaudio.paInt16                   # 16-bit integer samples, decoded as np.int16 in the callback
FRAME_SIZE = int(DURATION * SAMPLE_RATE)   # frames per buffer = one DURATION-long window

# ---- Mel-spectrogram layout fed to the second CNN --------------------------
# The callback reshapes the spectrogram to (-1, 1, N_MELS, TIME_FRAMES), i.e.
# 704 values per sample split into N_MELS rows.
N_MELS = 16
TIME_FRAMES = 704 // N_MELS

# Class labels; looked up with the arg-max index of the averaged model output.
emotions = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# PortAudio session handle; released by p.terminate() once streaming ends.
p = pyaudio.PyAudio()

# Callback function for real-time classification
def callback(in_data, frame_count, time_info, status):
    """Classify one captured audio buffer and print the class percentages.

    PyAudio calls this with each recorded buffer. The buffer is denoised,
    turned into two feature representations, scored by both CNNs, and the
    averaged scores are printed on a single, continuously overwritten line.

    Args:
        in_data: Raw bytes of the recorded buffer (16-bit integer samples).
        frame_count: Number of frames in the buffer (unused).
        time_info: Timing information supplied by PyAudio (unused).
        status: PortAudio status flags for this buffer (unused).

    Returns:
        ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
    """
    # Convert bytes to numpy array
    # NOTE(review): samples stay in the raw int16 value range (no scaling to
    # [-1, 1]) — confirm this matches what noise_reduction / extract_features
    # and the training pipeline expect.
    audio_data = np.frombuffer(in_data, dtype=np.int16).astype(np.float32)
    audio_data = noise_reduction(audio_data, SAMPLE_RATE)

    # Extract features
    extracted_features, mel_spectrogram = extract_features(audio_data, SAMPLE_RATE)

    # Prepare input tensors: two leading singleton axes for the feature
    # vector, and a (-1, 1, N_MELS, TIME_FRAMES) layout for the spectrogram.
    features_tensor = torch.tensor(extracted_features, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    mel_spectrogram_tensor = torch.tensor(
        mel_spectrogram.reshape(-1, 1, N_MELS, TIME_FRAMES), dtype=torch.float32
    )

    # Get predictions (no autograd bookkeeping needed for inference)
    with torch.no_grad():
        output_cnn1 = model1(features_tensor.to(device))
        output_cnn2 = model2(mel_spectrogram_tensor.to(device))

    # Average the two model outputs and express them as percentages.
    # Assumes both models emit per-class probabilities — TODO confirm.
    # A single transfer to the CPU yields plain Python floats for the
    # arg-max lookup and the formatting below.
    percentages = ((output_cnn1 + output_cnn2) / 2 * 100).flatten().cpu().tolist()

    emotion = emotions[int(np.argmax(percentages))]

    # One right-aligned column per emotion label, followed by the winner.
    columns = "".join(
        "{:>7.2f}%".format(value) for value in percentages[:len(emotions)]
    )
    output = "Predicted Probabilities: " + columns + f"  {emotion}"

    # Pad to a fixed width and end with a carriage return so every update
    # overwrites the previous line instead of scrolling.
    print(output.ljust(100), end="\r")

    return (in_data, pyaudio.paContinue)

import time

# Open the audio stream inside the try block so the PyAudio session is
# released even when opening the input device fails.
stream = None
try:
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=FRAME_SIZE,
                    stream_callback=callback)

    # Start the stream
    print("Listening... Press Ctrl+C to stop.")
    # Column header for the percentages printed by the callback.
    print("                         ",
          "angry   ",
          "disgust ",
          "fear  ",
          "happy  ",
          "neutral ",
          "sad  ",
          "surprised",)
    stream.start_stream()

    # The callback does all the work; this loop only keeps the cell alive.
    # Sleeping instead of spinning avoids pinning a CPU core and competing
    # with the callback for the interpreter lock.
    while stream.is_active():
        time.sleep(0.1)
except KeyboardInterrupt:
    print("Stopped.")
finally:
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()
    
# 5, 48, 25, 28,  
