In [22]:
import os
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up the Wav2Vec2 model and processor
model_name = "facebook/wav2vec2-large-960h"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Directory containing audio files
audio_dir = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Function to extract distances from transcription
def extract_distances(transcription):
    distances = []
    if "METERS" in transcription.upper():
        words = transcription.split()
        for i, word in enumerate(words):
            if word.upper() == "METERS":
                try:
                    distance = words[i-1]
                    distances.append(distance)
                except IndexError:
                    pass
    return distances

# Function to classify intents based on extracted distances
def classify_intents(distances):
    intents = {
        'move_forward': 0,
        'move_backward': 0,
        'move_up': 0,
        'move_down': 0
    }
    for distance in distances:
        if "FORWARD" in distance.upper():
            intents['move_forward'] += int(distance.split()[0])
        elif "BACKWARD" in distance.upper():
            intents['move_backward'] += int(distance.split()[0])
        elif "UP" in distance.upper():
            intents['move_up'] += int(distance.split()[0])
        elif "DOWN" in distance.upper():
            intents['move_down'] += int(distance.split()[0])
    return intents

# Function to process each audio file
def process_audio_file(file_path):
    try:
        # Load audio and resample if needed (assuming all are already 16kHz)
        audio_input, _ = torchaudio.load(file_path)
        audio_input = audio_input / torch.max(torch.abs(audio_input))
        
        # Transcribe speech to text
        input_values = processor(audio_input.squeeze(0), sampling_rate=16000, return_tensors="pt").input_values.to(device)
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        
        # Extract distances from transcription
        distances = extract_distances(transcription)
        
        # Classify intents based on extracted distances
        intents = classify_intents(distances)
        
        # Print results
        print(f"Audio File: {file_path}")
        print(f"Transcription: {transcription}")
        print(f"Extracted Distances: {distances}")
        print(f"Extracted Intents: {intents}")
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


In [24]:
# Process each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        file_path = os.path.join(audio_dir, filename)
        process_audio_file(file_path)


Audio File: C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2\01 - please go 5 meters forwa 2.wav
Transcription: <s>
Extracted Distances: []
Extracted Intents: {'move_forward': 0, 'move_backward': 0, 'move_up': 0, 'move_down': 0}
Audio File: C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2\02 - move 10 meters backw 1.wav
Transcription: <s>
Extracted Distances: []
Extracted Intents: {'move_forward': 0, 'move_backward': 0, 'move_up': 0, 'move_down': 0}
Audio File: C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2\03 - can you move forward 1.wav
Transcription: <s>
Extracted Distances: []
Extracted Intents: {'move_forward': 0, 'move_backward': 0, 'move_up': 0, 'move_down': 0}
Audio File: C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2\04 - please fly up 4 mete 1.wav
Transcription: <s>
Extracted Distances: []
Extracted Intents: {'move_forward': 0, 'move_backward': 0, 'move_up': 0, 'move_down': 0}


KeyboardInterrupt: 

In [13]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    pattern = re.compile(r'\b(\d+)\s*(meters|metres|mtrs|m|centimeters|cm)\s*(forward|up|backward|down)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    instructions = []
    
    for match in matches:
        distance, unit, direction = match
        distance = int(distance)
        direction = direction.lower()
        unit = unit.lower()

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        for instruction in instructions:
            print(instruction)
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (1).wav
Transcription: please go 5 metres
Instructions:

Processing file: synthesis (2).wav
Transcription: please play a 4 m
Instructions:

Processing file: synthesis (3).wav
Transcription: the three metres forward and Fly Up 3 metres
Instructions:



In [12]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    pattern = re.compile(r'\b(\d+)\s*(meters|metres|mtrs|m|centimeters|cm)\s*(forward|up|backward|down)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    instructions = []
    
    for match in matches:
        distance, unit, direction = match
        distance = int(distance)
        direction = direction.lower()
        unit = unit.lower()

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")

        print()  # Print a blank line for clarity between different files


Processing file: synthesis (1).wav
Transcription: please go 5 metres
Instructions:
No instructions found.

Processing file: synthesis (2).wav
Transcription: please play a 4 m
Instructions:
No instructions found.

Processing file: synthesis (3).wav
Transcription: the three metres forward and Fly Up 3 metres
Instructions:
No instructions found.



In [1]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    pattern = re.compile(r'\b(\d+|\w+)\s*(meters|metres|mtrs|m|centimeters|cm)\s*(forward|up|backward|down)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    instructions = []
    
    for match in matches:
        distance, unit, direction = match
        try:
            distance = int(distance)
        except ValueError:
            distance_words = {
                "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
            }
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower()

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: 01 - please go 5 meters forwa 2.wav
Transcription: please go 5 minutes forward and up by 3 metres
Instructions:
No instructions found.

Processing file: 02 - move 10 meters backw 1.wav
Transcription: 10 M backwards and then fly up to metres
Instructions:
No instructions found.

Processing file: 03 - can you move forward 1.wav
Transcription: can you move forward by 7 metres and down by 1 m
Instructions:
No instructions found.

Processing file: 04 - please fly up 4 mete 1.wav
Transcription: please fire 4 metres and move forward 6 metres
Instructions:
No instructions found.

Processing file: 05 - go 3 meters forward 1.wav
Transcription: 3 metres forward and then up by 2 metres
Instructions:
Move forward by 3 metres

Processing file: 06 - move back 5 meters a 1.wav
Transcription: back 5 metres in fire 3 m
Instructions:
No instructions found.

Processing file: 07 - advance forward 8 me 1.wav
Transcription: stand by 4 metres
Instructions:
No instructions found.

Processing f

In [4]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    # Debugging output
    print("Debug: Transcription for regex matching -", transcription)
    
    # Expanded regex pattern
    pattern = re.compile(
        r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s*'
        r'(minutes|meters|metres|mtrs|m|centimeters|cm)?\s*(forward|up|backward|down)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    # Debugging output
    print("Debug: Regex matches -", matches)

    instructions = []
    distance_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    
    for match in matches:
        distance, unit, direction = match
        if distance.isdigit():
            distance = int(distance)
        else:
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower() if unit else 'meters'

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: 01 - please go 5 meters forwa 2.wav
Transcription: please go 5 minutes forward and up by 3 metres
Debug: Transcription for regex matching - please go 5 minutes forward and up by 3 metres
Debug: Regex matches - [('5', 'minutes', 'forward')]
Instructions:
Move forward by 5 minutes

Processing file: 02 - move 10 meters backw 1.wav
Transcription: 10 M backwards and then fly up to metres
Debug: Transcription for regex matching - 10 M backwards and then fly up to metres
Debug: Regex matches - []
Instructions:
No instructions found.

Processing file: 03 - can you move forward 1.wav
Transcription: can you move forward by 7 metres and down by 1 m
Debug: Transcription for regex matching - can you move forward by 7 metres and down by 1 m
Debug: Regex matches - []
Instructions:
No instructions found.

Processing file: 04 - please fly up 4 mete 1.wav
Transcription: please fire 4 metres and move forward 6 metres
Debug: Transcription for regex matching - please fire 4 metres and move

In [21]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    # Correct common transcription errors
    transcription = transcription.replace("fire", "fly")

    # Debugging output
    print("Debug: Transcription for regex matching -", transcription)
    
    # Expanded regex pattern to capture more variations
    pattern = re.compile(
        r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s*'
        r'(meters|metres||M|mtrs|m|centimeters|cm|minutes)?\s*(forward|up|backward|down)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    # Debugging output
    print("Debug: Regex matches -", matches)

    instructions = []
    distance_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    
    for match in matches:
        if len(match) == 3:
            distance, unit, direction = match
        else:
            distance, direction = match[0], match[1]
            unit = 'meters, M'  # Default unit

        if distance.isdigit():
            distance = int(distance)
        else:
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower() if unit else 'meters , M'

        if direction == "forward":
            instructions.append(f"Move forward  {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward  {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down  {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Debug: Transcription for regex matching - the three metres forward and 53 metres
Debug: Regex matches - [('three', 'metres', 'forward')]
Instructions:
Move forward  3 metres

Processing file: WhatsApp Audio 2024-07-09 at 20.17.03.wav
Transcription: move forward 5 M and fly
Debug: Transcription for regex matching - move forward 5 M and fly
Debug: Regex matches - []
Instructions:
No instructions found.



In [3]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    # Debugging output
    print("Debug: Transcription for regex matching -", repr(transcription))
 
    
    # Expanded regex pattern
    pattern = re.compile(
        r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s*'
        r'(minutes|meters|metres|mtrs|m|centimeters|cm)?\s*(forward|up|backward|down|left|right)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    # Debugging output
    print("Debug: Regex matches -", matches)

    instructions = []
    distance_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    
    for match in matches:
        distance, unit, direction = match
        if distance.isdigit():
            distance = int(distance)
        else:
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower() if unit else 'meters'

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")
        elif direction == "left":
            instructions.append(f"Move left by {distance} {unit}")
        elif direction == "right":
            instructions.append(f"Move right by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: 1720607727799exjkkj64-voicemaker.in-speech.wav
Transcription: apply up by 10 metres
Debug: Transcription for regex matching - 'apply up by 10 metres'
Debug: Regex matches - []
Instructions:
No instructions found.

Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Debug: Transcription for regex matching - 'the three metres forward and 53 metres'
Debug: Regex matches - [('three', 'metres', 'forward')]
Instructions:
Move forward by 3 metres

Processing file: WhatsApp Audio 2024-07-09 at 20.17.03.wav
Transcription: move forward 5 M and fly
Debug: Transcription for regex matching - 'move forward 5 M and fly'
Debug: Regex matches - []
Instructions:
No instructions found.



In [1]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to normalize the transcription
def normalize_transcription(transcription):
    transcription = transcription.lower()
    transcription = re.sub(r'\bapply\b', 'move', transcription)  # Example normalization
    transcription = re.sub(r'\bmeters?\b', 'metres', transcription)  # Normalize units variations
    transcription = re.sub(r'\bm\b', 'metres', transcription)  # Normalize unit shorthand
    return transcription

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    # Normalize the transcription
    transcription = normalize_transcription(transcription)
    
    # Debugging output
    print("Debug: Transcription for regex matching -", repr(transcription))
    
    # Expanded regex pattern
    pattern = re.compile(
        r'\b(\d+|one|two|three|four|five| |six|seven|eight|nine|ten)\s*'
        r'(minutes|metres|mtrs|m|M|centimeters|cm)?\s*(forward|up|backward|down|left|right|fly)\b', re.IGNORECASE)
    matches = pattern.findall(transcription)

    # Debugging output
    print("Debug: Regex matches -", matches)

    instructions = []
    distance_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "5": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    
    for match in matches:
        distance, unit, direction = match
        if distance.isdigit():
            distance = int(distance)
        else:
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower() if unit else 'meters'

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")
        elif direction == "left":
            instructions.append(f"Move left by {distance} {unit}")
        elif direction == "right":
            instructions.append(f"Move right by {distance} {unit}")
        elif direction == "fly":
            instructions.append(f"Fly by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Debug: Transcription for regex matching - 'the three metres forward and 53 metres'
Debug: Regex matches - [('three', 'metres', 'forward')]
Instructions:
Move forward by 3 metres

Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 M
Debug: Transcription for regex matching - 'move forward 5 metres'
Debug: Regex matches - [(' ', '', 'forward')]
Instructions:
Move forward by 0 meters



In [9]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to normalize the transcription
def normalize_transcription(transcription):
    transcription = transcription.lower()
    transcription = re.sub(r'\bm\b', 'metres', transcription)  # Normalize unit shorthand
    transcription = re.sub(r'\bm\s', 'metres ', transcription)  # Normalize unit shorthand with space
    transcription = re.sub(r'\bmeters?\b', 'metres', transcription)  # Normalize units variations
    transcription = re.sub(r'\bapply\b', 'move', transcription)  # Example normalization
    transcription = re.sub(r'\bm\b', 'metres', transcription, flags=re.IGNORECASE)  # Handle uppercase M
    transcription = re.sub(r'(\d)\s*m\b', r'\1 metres', transcription, flags=re.IGNORECASE)  # Handle "5 M" case
    return transcription

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    # Normalize the transcription
    normalized_transcription = normalize_transcription(transcription)
    
    # Debugging output
    print("Debug: Original Transcription -", repr(transcription))
    print("Debug: Normalized Transcription -", repr(normalized_transcription))
    
    # Expanded regex pattern
    pattern = re.compile(
        r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\s*'
        r'(metres?|mtrs?|centimeters?|cm|meters?|m)?\s*(forward|up|backward|down|left|right|fly)\b', re.IGNORECASE)
    
    matches = pattern.findall(normalized_transcription)

    # Debugging output
    print("Debug: Regex matches -", matches)

    instructions = []
    distance_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    
    for match in matches:
        distance, unit, direction = match
        print(f"Debug: Processing match - distance: {distance}, unit: {unit}, direction: {direction}")
        if distance.isdigit():
            distance = int(distance)
        else:
            distance = distance_words.get(distance.lower(), 0)
        
        direction = direction.lower()
        unit = unit.lower() if unit else 'metres'

        if direction == "forward":
            instructions.append(f"Move forward by {distance} {unit}")
        elif direction == "up":
            instructions.append(f"Fly up by {distance} {unit}")
        elif direction == "backward":
            instructions.append(f"Move backward by {distance} {unit}")
        elif direction == "down":
            instructions.append(f"Move down by {distance} {unit}")
        elif direction == "left":
            instructions.append(f"Move left by {distance} {unit}")
        elif direction == "right":
            instructions.append(f"Move right by {distance} {unit}")
        elif direction == "fly":
            instructions.append(f"Fly by {distance} {unit}")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:")
        if instructions:
            for instruction in instructions:
                print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Debug: Original Transcription - 'the three metres forward and 53 metres'
Debug: Normalized Transcription - 'the three metres forward and 53 metres'
Debug: Regex matches - [('three', 'metres', 'forward')]
Debug: Processing match - distance: three, unit: metres, direction: forward
Instructions:
Move forward by 3 metres

Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 M
Debug: Original Transcription - 'move forward 5 M'
Debug: Normalized Transcription - 'move forward 5 metres'
Debug: Regex matches - []
Instructions:
No instructions found.



In [10]:
import os
import speech_recognition as sr

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text.lower()  # Convert to lowercase for consistency
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    instructions = []
    
    # Check for common variations
    if "move forward" in transcription:
        instructions.append("Move forward")
    elif "fly" in transcription and "up" in transcription:
        instructions.append("Fly up")
    elif "move backward" in transcription:
        instructions.append("Move backward")
    elif "move down" in transcription:
        instructions.append("Move down")
    elif "move left" in transcription:
        instructions.append("Move left")
    elif "move right" in transcription:
        instructions.append("Move right")
    else:
        instructions.append("No instructions found.")

    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:", instructions[0])  # Print only the first instruction for simplicity
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Instructions: No instructions found.

Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 m
Instructions: Move forward



In [11]:
import os
import speech_recognition as sr

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text.lower()  # Convert to lowercase for consistency
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    instructions = []
    
    if "move forward" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Move forward by {distance} meters")
    
    elif "fly up" in transcription or "fly" in transcription and "up" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Fly up by {distance} meters")
    
    elif "move backward" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Move backward by {distance} meters")
    
    elif "move down" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Move down by {distance} meters")
    
    elif "move left" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Move left by {distance} meters")
    
    elif "move right" in transcription:
        # Extract distance
        distance = extract_distance(transcription)
        if distance:
            instructions.append(f"Move right by {distance} meters")
    
    else:
        instructions.append("No instructions found.")

    return instructions

# Function to extract distance from transcription
def extract_distance(transcription):
    words = transcription.split()
    for i, word in enumerate(words):
        if word.isdigit():
            return int(word)
        elif word in ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]:
            if i+1 < len(words) and words[i+1] in ["metres", "m", "meters"]:
                return int(convert_word_to_number(word))
    return None

# Function to convert word to number
def convert_word_to_number(word):
    words_to_numbers = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    return words_to_numbers[word]

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        print("Instructions:", instructions[0] if instructions else "No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: synthesis (4).wav
Transcription: the three metres forward and 53 metres
Instructions: No instructions found.

Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 m
Instructions: Move forward by 5 meters



In [15]:
import os
import speech_recognition as sr

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text.lower()  # Convert to lowercase for consistency
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    instructions = []
    
    if "move forward" in transcription:
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Move forward by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "forward"
                }
            })
    
    elif "fly up" in transcription or ("fly" in transcription and "up" in transcription):
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Fly up by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "up"
                }
            })
    
    elif "move backward" in transcription:
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Move backward by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "backward"
                }
            })
    
    elif "move down" in transcription:
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Move down by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "down"
                }
            })
    
    elif "move left" in transcription:
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Move left by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "left"
                }
            })
    
    elif "move right" in transcription:
        distance, unit = extract_distance(transcription)
        if distance:
            instructions.append({
                "instruction": f"Move right by {distance} {unit}",
                "details": {
                    "distance": distance,
                    "unit": unit,
                    "direction": "right"
                }
            })
    
    else:
        instructions.append("No instructions found.")

    return instructions

# Function to extract distance and unit from transcription
def extract_distance(transcription):
    words = transcription.split()
    distance = None
    unit = "metres"
    
    for i, word in enumerate(words):
        if word.isdigit():
            distance = int(word)
        elif word in ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]:
            if i+1 < len(words) and words[i+1] in ["metres", "m", "meters"]:
                distance = int(convert_word_to_number(word))
        elif word in ["metres", "m", "meters"]:
            unit = "metres"
        elif word in ["centimeters", "cm"]:
            unit = "centimeters"
    
    return distance, unit

# Function to convert word to number
def convert_word_to_number(word):
    words_to_numbers = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
    }
    return words_to_numbers[word]

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        if isinstance(instructions, list) and instructions:
            for instruction in instructions:
                if isinstance(instruction, dict):
                    print("Instruction:", instruction["instruction"])
                    print("Details:")
                    print(f"Distance: {instruction['details']['distance']}, "
                          f"Unit: {instruction['details']['unit']}, "
                          f"Direction: {instruction['details']['direction']}")
                    print()
                else:
                    print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: 02 - move 10 meters backw 1.wav
Transcription: 10 m backwards and then fly up to metres
Instruction: Fly up by 10 metres
Details:
Distance: 10, Unit: metres, Direction: up


Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 m
Instruction: Move forward by 5 metres
Details:
Distance: 5, Unit: metres, Direction: forward




In [24]:
import os
import speech_recognition as sr
import re

# Function to transcribe audio using Google Web Speech API
def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)  # Read the entire audio file

    try:
        text = recognizer.recognize_google(audio)
        return text.lower()  # Convert to lowercase for consistency
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Could not request results; check your network connection"

# Function to extract and classify instructions from the transcription
def extract_and_classify_instructions(transcription):
    instructions = []
    
    # Define regex patterns for different types of instructions
    patterns = [
        (r'(\d+)\s*(m(?:eters?)?|centimeters?|cm)?\s*(forward|up|backward|down|left|right|fly)\b', "direct"),
        (r'(forward|up|backward|down|left|right|fly)\s*(\d+)\s*(m(?:eters?)?|centimeters?|cm)?\b', "reverse")
    ]
    
    # Process each pattern
    for pattern, direction_type in patterns:
        matches = re.findall(pattern, transcription)
        
        for match in matches:
            if direction_type == "direct":
                distance, unit, direction = match
            elif direction_type == "reverse":
                direction, distance, unit = match
            
            distance = int(distance) if distance.isdigit() else None
            unit = unit if unit else 'metres'
            
            if direction:
                direction = direction.lower()
                if direction in ['forward', 'up', 'backward', 'down', 'left', 'right', 'fly']:
                    instructions.append({
                        "instruction": f"Move {direction} by {distance} {unit}",
                        "details": {
                            "distance": distance,
                            "unit": unit,
                            "direction": direction
                        }
                    })
    
    if not instructions:
        instructions.append("No valid instructions found.")
    
    return instructions

# Directory path where audio files are located
directory_path = r"C:\Users\tanya\OneDrive\Desktop\pytrch\nkb\New2"

# Process each audio file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".wav"):  # Adjust the extension if your files are in a different format
        audio_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        # Step 1: Transcribe the audio
        transcription = transcribe_audio(audio_path)
        print("Transcription:", transcription)

        # Step 2: Extract and classify instructions
        instructions = extract_and_classify_instructions(transcription)
        
        if isinstance(instructions, list) and instructions:
            for instruction in instructions:
                if isinstance(instruction, dict):
                    print("Instruction:", instruction["instruction"])
                    print("Details:")
                    print(f"Distance: {instruction['details']['distance']}, "
                          f"Unit: {instruction['details']['unit']}, "
                          f"Direction: {instruction['details']['direction']}")
                    print()
                else:
                    print(instruction)
        else:
            print("No instructions found.")
        print()  # Print a blank line for clarity between different files


Processing file: p_24702234_435.wav
Transcription: new forward 5 metres
Instruction: Move forward by 5 metres
Details:
Distance: 5, Unit: metres, Direction: forward


Processing file: p_24702308_517.wav
Transcription: forward 5 metres
Instruction: Move forward by 5 metres
Details:
Distance: 5, Unit: metres, Direction: forward


Processing file: p_24702336_570.wav
Transcription: forward 3 metres
Instruction: Move forward by 3 metres
Details:
Distance: 3, Unit: metres, Direction: forward


Processing file: WhatsApp Audio 2024-07-10 at 16.37.56.wav
Transcription: move forward 5 m
Instruction: Move forward by 5 m
Details:
Distance: 5, Unit: m, Direction: forward


