In [None]:
#%pip install transformers
#%pip install torchaudio
#%pip install sounddevice scipy
#%pip install soundfile
#%pip install pyserial

In [None]:
from transformers import (Wav2Vec2Processor, Wav2Vec2ForCTC)  
import torchaudio  
import torch 

# Function to convert speech file to array
def speech_file_to_array_fn(voice_path, resampling_to=16000):
    """Load an audio file and return one channel of it resampled to ``resampling_to`` Hz.

    Args:
        voice_path: Path of the audio file; passed straight to ``torchaudio.load``.
        resampling_to: Target sampling rate in Hz for the returned waveform.

    Returns:
        A ``(waveform, sampling_rate)`` tuple. ``waveform`` is a NumPy array taken
        from index 0 of the resampled tensor, and ``sampling_rate`` is the rate of
        that returned array (i.e. ``resampling_to``).
    """
    speech_array, source_rate = torchaudio.load(voice_path)

    # Convert from the file's native rate to the requested one
    resampler = torchaudio.transforms.Resample(source_rate, resampling_to)

    # Index 0 keeps a single row of the resampled tensor (the first channel,
    # given torchaudio's default channels-first layout); other rows are dropped.
    waveform = resampler(speech_array)[0].numpy()

    # Report the rate of the array actually returned: the file's native rate
    # no longer describes the samples once they have been resampled.
    return waveform, resampling_to

# Name/identifier of the pre-trained checkpoint that both objects below are loaded from
cp = "bakrianoo/sinai-voice-ar-stt"

# Load the Wav2Vec2 processor first, then the CTC model, from that same checkpoint
processor, model = (
    Wav2Vec2Processor.from_pretrained(cp),
    Wav2Vec2ForCTC.from_pretrained(cp),
)

In [None]:
import time
import sounddevice as sd
import torchaudio
import serial
from scipy.io.wavfile import write

def record_audio(duration=5, filename="output.wav", sample_rate=44100, channels=2):
    """Record audio with ``sounddevice`` and save it as a WAV file.

    Args:
        duration: Recording length in seconds.
        filename: Path of the WAV file to write.
        sample_rate: Sampling rate in Hz, used for both the capture and the file.
        channels: Number of channels to capture. Defaults to 2, the value that
            was previously hard-coded.

    Returns:
        None. The recording is written to ``filename``.
    """
    frame_count = int(duration * sample_rate)

    print("Recording...")
    recording = sd.rec(frame_count, samplerate=sample_rate, channels=channels)
    sd.wait()  # block until the capture started by sd.rec has finished
    print("Recording finished")

    write(filename, sample_rate, recording)


# Recording settings — change the duration, filename, and sample rate as needed
record_duration = 3  # in seconds
output_filename = "my_recording.wav"
sample_rate = 44100  # 44100 is a common sample rate

record_audio(duration=record_duration, filename=output_filename, sample_rate=sample_rate)

# Transcribe the file that was just written. Reusing output_filename (instead of
# repeating the literal) keeps both steps in sync if the filename is changed.
sound_path = output_filename

# Rate the audio is resampled to AND the rate declared to the processor;
# the two must agree, so both are taken from this single value.
model_sample_rate = 16_000

sample, sr = speech_file_to_array_fn(sound_path, resampling_to=model_sample_rate)
inputs = processor([sample], sampling_rate=model_sample_rate, return_tensors="pt", padding=True)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Take the highest-scoring id at every position, then decode the ids to text
predicted_ids = torch.argmax(logits, dim=-1)

transcription_string = processor.batch_decode(predicted_ids)[0]  # first (only) item of the batch

print("Prediction:", transcription_string)

# 'transcription_string' now holds the transcription as a string

# Serial connection settings for the Arduino
arduino_port = 'COM8'
baud_rate = 9600

# The recognised text is sent as the command
command = transcription_string

# The context manager closes the port even if the write below raises,
# so the port is never left open after a failed run.
with serial.Serial(arduino_port, baud_rate) as ser:
    # NOTE(review): presumably gives the board time to become ready after the
    # port is opened — confirm the required delay for the target board.
    time.sleep(2)

    # Newline-terminated UTF-8 bytes
    ser.write(command.encode('utf-8') + b'\n')
    ser.flush()  # wait until the data has been written before the port is closed
    print(f"Sent '{command}' command to Arduino")