In [2]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
# from pydub import AudioSegment
from PIL import Image
import os
import re
import io
import itertools
import pyaudio
import wave
import time

import torch

from torchvision.transforms import Compose, ToTensor, Grayscale, Resize, Normalize

from torchvision.transforms import (
    Grayscale, ToTensor, Compose, Resize, InterpolationMode, Normalize, Lambda
)
import torch.nn.functional as F


In [3]:


# Audio settings
FORMAT = pyaudio.paInt16
CHANNELS = 1  # Mono recording
RATE = 16000  # Sampling rate in Hz
CHUNK = 1024  # Buffer size
RECORD_SECONDS = 3


def record_audio_to_numpy():
    audio = pyaudio.PyAudio()
    
    # Open the microphone stream
    
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    
    # print("Recording...")
    frames = []
    
    # Read data from the stream
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    
    # print("Recording finished.")
    
    # Stop and close the stream
    stream.stop_stream()
    stream.close()
    audio.terminate()
    
    # Convert recorded frames to numpy array
    audio_array = np.frombuffer(b''.join(frames), dtype=np.int16)
    return audio_array

def numpy_to_fft(samples, sr = 16000):
    n_fft=2048 
    win_length1 = 750 
    hop_length=win_length1//4 

    # Convert to NumPy array and normalize
    samples = samples.astype(np.float32)/float(np.max(samples))

    # Compute spectrogram using STFT
    S = librosa.stft(samples, n_fft=n_fft,win_length = win_length1, hop_length=hop_length)  

    #The number of rows in the STFT matrix D is (1 + n_fft/2).
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    S0 = S_db

    fig, ax = plt.subplots(figsize=(2.56, 2.56), dpi=100)  # 256x256 pixels
    librosa.display.specshow(S_db, sr=sr, n_fft=n_fft, win_length=win_length1, hop_length=hop_length, x_axis="time", y_axis="log", cmap="gray")

    # Remove axes for a clean image
    ax.set_axis_off()
    plt.tight_layout(pad=0)
    # print(plt.ylim())
    plt.ylim([30,6000])
    buf = io.BytesIO()
    # plt.clf()
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0, dpi=100)
    plt.close(fig)
    buf.seek(0)  # Move cursor to the start of the buffer

    # Process the image from memory
    img = Image.open(buf).convert("L")  # Convert to grayscale
    img = img.resize((256, 256))  # Resize to 256x256

    # Convert to NumPy array
    # img_array = np.array(img, dtype=np.uint8)
    # img_array = img_array[np.newaxis, :, :]  # Add channel dimension (1,256,256)
    
    # Close buffer
    buf.close()
    return img#Image.fromarray(img_array.squeeze())


def fft_to_tensor(pil_image):
   
    image = pil_image

    # Rotate 90 degrees (optional, remove if unnecessary)
    image = image.transpose(Image.ROTATE_90)

    # Ensure the image is in RGB mode (some formats might be grayscale)
    if image.mode != 'RGB':
        image = image.convert('RGB')

    transform = Compose([
                            ToTensor(),
                            Grayscale(),
                            Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
                            Normalize(mean=[0.5], std=[0.5])
                        ])
    

    input_tensor = transform(image)  # Shape: (1, 224, 224)
    input_tensor = input_tensor.unsqueeze(0)  # Add batch dimension -> (1, 1, 224, 224)
    if input_tensor.shape[1] == 1:  # Convert grayscale to RGB
        input_tensor = input_tensor.expand(-1, 3, -1, -1)  # Shape: (1, 3, H, W)

    # Apply transformations and return image with label
    return input_tensor

def evaluate(output):
    # Apply softmax to convert logits to probabilities
    probabilities = np.array(F.softmax(output, dim=1).squeeze())  # Shape: (num_classes,)
    threshold = 0.95
    # Class labels
    guess = np.argmax(probabilities)
    class_labels = ['arabic', 'english', 'german', 'mandarin', 'spanish', 'garbage']

    # Display probabilities
    if probabilities[guess]>threshold:
        print(class_labels[guess], round(probabilities[guess]*100, 2),"%")
    else:
        print(f"unsure ({class_labels[guess]}, { round(probabilities[guess]*100, 2)}%")

    

In [None]:
image = None
model_path = r"ENDG511_Final_Project/models/model_language_wgarb.pth"
model = torch.load(model_path, map_location="cpu", weights_only=False)
model.eval()  # Set to evaluation mode

for i in range(15):
    t0 = time.time()
    array = record_audio_to_numpy()
    dt = time.time() - t0
    # print("record time: ", dt)


    t0 = time.time()
    image = numpy_to_fft(array)
    dt = time.time() - t0
    # print("process time: ", dt)
    image.save("spectrogram.png")


    t0 = time.time()
    input_tensor = fft_to_tensor(image)
    with torch.no_grad(): output = model(input_tensor)  # Get raw logits
    dt = time.time() - t0
    # print("inference time: ", dt)

    evaluate(output)

    


german 95.28 %
english 99.93 %
english 99.99 %
german 99.76 %
unsure (english, 88.22%
english 99.99 %
english 99.33 %
unsure (german, 87.1%
english 98.34 %
unsure (english, 90.85%
english 99.93 %
english 99.96 %
english 99.68 %
english 100.0 %
unsure (english, 74.33%


In [None]:
# image = None
# model_path = r"ENDG511_Final_Project/models/model_language_wgarb.pth"
# model = torch.load(model_path, map_location="cpu", weights_only=False)
# model.eval()  # Set to evaluation mode


# temp_file = r"C:\Git_repos\ENDG 511\ENDG511_Final_Project\audio_processing\spectrogram_000.png"
file_path = r"C:\Git_repos\ENDG 511\ENDG511_Final_Project\languages"
for temp_file in os.listdir(file_path):
    # print(temp_file)
    filebad = 0
    with Image.open(file_path+"\\"+temp_file) as img:
        img_array = np.array(img)
        if np.all(img_array == 0):
            file_bad = 1
            print(temp_file, "bad")

    
            

    
    

english_100543000.png bad
english_100544000.png bad
english_100544001.png bad
english_100545000.png bad


In [None]:
# from torch import nn
# from functools import partial
# from torchvision.models import MobileNetV2


model = torch.load(r"C:\Git_repos\ENDG 511\model_language.pth", map_location="cpu", weights_only=False)
model.eval()  # Set to evaluation mode


In [None]:

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    output = model(input_tensor)  # Get raw logits

# Apply softmax to convert logits to probabilities
probabilities = F.softmax(output, dim=1).squeeze()  # Shape: (num_classes,)

# Class labels
class_labels = ['arabic', 'english', 'german', 'mandarin', 'spanish']

# Display probabilities
for label, prob in zip(class_labels, probabilities):
    print(f"{label}: {prob:.4f}")  # Format to 4 decimal places

<class 'torchvision.models.mobilenetv2.MobileNetV2'>
Not a dict


In [33]:
def find_first_unused_garbage_number(folder_path, file_name):
    pattern = re.compile(f"{file_name}(\\d+)\\.png")
    used_numbers = set()

    for filename in os.listdir(folder_path):
        match = pattern.match(filename)
        if match:
            used_numbers.add(int(match.group(1)))

    if not used_numbers:
        return 0  # Return 0 if there are no files

    # Find the first missing number
    for i in range(1, 10001):
        if i not in used_numbers:
            return i

    return None  # If all numbers are used

In [39]:
GARBAGE_FOLDER = r"C:\Git_repos\ENDG 511\ENDG511_Final_Project\audio_processing\sample_dataset\garbage_audio"
GARBAGE_FOLDER = r"C:\Git_repos\ENDG 511\ENDG511_Final_Project\audio_processing\sample_dataset\english_extra"
file_name = "english"
print("saved:    ")
while True:
    array = record_audio_to_numpy()
    image = numpy_to_fft(array)
    number = find_first_unused_garbage_number(GARBAGE_FOLDER, file_name)
    image.save(f'{GARBAGE_FOLDER}\\{file_name}{number}.png')
    image.save(f'temp.png')

    print("\b\b\b", end="")
    print(number, end="")
    

saved:    
131313131313131313141414141414141414141515151515151515151516161616161616161616171717171717171717171818181818181818181819191919191919197

KeyboardInterrupt: 

In [20]:
print("hi"*5, end="")
print("\b1", end = "None")
print(len("121"))

hihihihih1None3
