In [1]:
import whisper
import csv
import os;
import warnings
from tqdm import tqdm
import jiwer 

# Text normalisation pipeline handed to jiwer.wer for both the ground truth and
# the Whisper output, so that contractions (e.g. "he's" vs "he is"), casing,
# spacing and punctuation are not counted as word errors. Steps run in the
# order listed; the last one splits each sentence into its words.
transforms = jiwer.Compose(
    [
        make_step()
        for make_step in (
            jiwer.ExpandCommonEnglishContractions,
            jiwer.RemoveEmptyStrings,
            jiwer.ToLowerCase,
            jiwer.RemoveMultipleSpaces,
            jiwer.Strip,
            jiwer.RemovePunctuation,
            jiwer.ReduceToListOfListOfWords,
        )
    ]
)

# Silence UserWarning and FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Audio file extensions picked up for transcription. This is a real tuple (note
# the trailing comma; a bare (".mp3") is just a str), so further extensions can
# be appended and the value passed straight to str.endswith(). Format
# conversion could also be added, but is not needed for this dataset.
supported_formats = (".mp3",)

def transcribe_with_whisper(audio_dir, model_name="base"):
    """Transcribe every supported audio file in ``audio_dir`` and score each one.

    For each file whose name ends with one of ``supported_formats``, the
    Whisper transcription is compared with the text of the same-named ``.txt``
    file (when present) using word error rate (WER), after ``transforms`` has
    been applied to both texts.

    Args:
        audio_dir: Directory holding the audio files and their ``.txt``
            ground-truth transcripts.
        model_name: Whisper model to load. "base" is faster; "large" gives
            better accuracy.

    Returns:
        A list with one dict per audio file, with keys ``filename``,
        ``transcription``, ``ground_truth`` ("" when the ``.txt`` file is
        missing) and ``accuracy``: ``(1 - WER) * 100`` rounded to two
        decimals, or 0.0 when the file could not be scored.
    """
    transcriptions = []
    files = [f for f in os.listdir(audio_dir) if f.endswith(supported_formats)]
    if not files:
        # Nothing to transcribe, so skip the model load entirely.
        return transcriptions

    # Load the model once up front; loading it again for every file only
    # repeats identical work.
    model = whisper.load_model(model_name)

    for filename in tqdm(files, desc="Transcribing audio files"):
        file_path = os.path.join(audio_dir, filename)

        # Perform transcription. The text is taken here, before the
        # ground-truth lookup, so the record always carries this file's own
        # transcription even when no ground truth exists.
        result = model.transcribe(file_path)
        predicted_text = result["text"]

        # Defaults used when the file cannot be scored.
        ground_truth = ""
        accuracy = 0.0

        # Get ground truth from the corresponding .txt file.
        txt_path = os.path.join(audio_dir, os.path.splitext(filename)[0] + ".txt")
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8") as gt_file:
                ground_truth = gt_file.read()

            # WER divides by the number of reference words, so it is undefined
            # for an empty reference; leave such files at the 0.0 default.
            if ground_truth.strip():
                # NOTE(review): newer jiwer releases appear to name this
                # keyword `reference_transform` - confirm against the
                # installed version before upgrading.
                error = jiwer.wer(
                    ground_truth,
                    predicted_text,
                    truth_transform=transforms,
                    hypothesis_transform=transforms,
                )
                accuracy = (1 - error) * 100

        transcriptions.append({
            "filename": filename,
            "transcription": predicted_text,
            "ground_truth": ground_truth,
            "accuracy": round(accuracy, 2)
        })

    return transcriptions

def save_transcriptions_to_csv(transcriptions, output_file="baseTranscriptions.csv"):
    """Write the per-file transcription records to a UTF-8 CSV file.

    Args:
        transcriptions: Iterable of dicts with the keys ``filename``,
            ``transcription``, ``ground_truth`` and ``accuracy``.
        output_file: Path of the CSV to create; an existing file is overwritten.
    """
    header = ["Filename", "Transcription", "Ground Truth", "Accuracy (%)"]
    record_keys = ("filename", "transcription", "ground_truth", "accuracy")

    # newline="" lets the csv module control line endings itself.
    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_handle:
        sheet = csv.writer(csv_handle)
        sheet.writerow(header)
        sheet.writerows([record[key] for key in record_keys] for record in transcriptions)

# Transcribe the whole dataset folder and echo the raw per-file records.
audio_dir = "dataset/a/"
transcriptions = transcribe_with_whisper(audio_dir)
print("Transcriptions : ", transcriptions)

# Per-file scores, with any negative accuracy floored at 0 before averaging.
adjusted_accuracies = [
    score if score >= 0 else 0
    for score in (entry["accuracy"] for entry in transcriptions)
]

# Report the mean only when at least one file was processed.
if len(adjusted_accuracies) > 0:
    overall_accuracy = sum(adjusted_accuracies) / len(adjusted_accuracies)
    print(f"\nOverall Accuracy (adjusted): {overall_accuracy:.2f}%")

# Persist the per-file results to the function's default output path.
save_transcriptions_to_csv(transcriptions)
print("Transcriptions saved to baseTranscriptions.csv")

Transcribing audio files: 100%|████████████████████████████████████████████████████████| 39/39 [02:56<00:00,  4.52s/it]

Transcriptions :  [{'filename': '103.536_107.319.mp3', 'transcription': " and the doctors taking the wavid's temperature.", 'ground_truth': 'and the doctor is taking the rabbit is temperature', 'accuracy': 66.67}, {'filename': '109.101_110.662.mp3', 'transcription': " And then he's all better.", 'ground_truth': 'and then he is all better', 'accuracy': 100.0}, {'filename': '116.372_119.993.mp3', 'transcription': ' I really enjoyed both.', 'ground_truth': 'the rabbit sees the animal with a balloon and he waves', 'accuracy': 0.0}, {'filename': '122.054_125.555.mp3', 'transcription': ' The rabbit is getting the balloon.', 'ground_truth': 'the rabbit is grabbing the balloon', 'accuracy': 83.33}, {'filename': '127.676_130.156.mp3', 'transcription': ' And now the weather is playing with the balloon.', 'ground_truth': 'and now the rabbit is playing with the balloon', 'accuracy': 88.89}, {'filename': '131.617_134.318.mp3', 'transcription': ' by accident he let go of it.', 'ground_truth': 'and b




In [2]:
import whisper
import csv
import os;
import warnings
from tqdm import tqdm
import jiwer 

# Text normalisation pipeline handed to jiwer.wer for both the ground truth and
# the Whisper output, so that contractions, casing, spacing and punctuation are
# not counted as word errors. Steps run in the order listed; the last one
# splits each sentence into its words.
transforms = jiwer.Compose(
    [
        make_step()
        for make_step in (
            jiwer.ExpandCommonEnglishContractions,
            jiwer.RemoveEmptyStrings,
            jiwer.ToLowerCase,
            jiwer.RemoveMultipleSpaces,
            jiwer.Strip,
            jiwer.RemovePunctuation,
            jiwer.ReduceToListOfListOfWords,
        )
    ]
)

# Silence UserWarning and FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Audio file extensions picked up for transcription. This is a real tuple (note
# the trailing comma; a bare (".mp3") is just a str), so further extensions can
# be appended and the value passed straight to str.endswith(). Format
# conversion could also be added, but is not needed for this dataset.
supported_formats = (".mp3",)

def transcribe_with_whisper(audio_dir, model_name="base"):
    """Transcribe every supported audio file in ``audio_dir`` and score each one.

    For each file whose name ends with one of ``supported_formats``, the
    Whisper transcription is compared with the text of the same-named ``.txt``
    file (when present) using word error rate (WER), after ``transforms`` has
    been applied to both texts.

    Args:
        audio_dir: Directory holding the audio files and their ``.txt``
            ground-truth transcripts.
        model_name: Whisper model to load. "base" is faster; "large" gives
            better accuracy.

    Returns:
        A list with one dict per audio file, with keys ``filename``,
        ``transcription``, ``ground_truth`` ("" when the ``.txt`` file is
        missing) and ``accuracy``: ``(1 - WER) * 100`` rounded to two
        decimals, or 0.0 when the file could not be scored.
    """
    transcriptions = []
    files = [f for f in os.listdir(audio_dir) if f.endswith(supported_formats)]
    if not files:
        # Nothing to transcribe, so skip the model load entirely.
        return transcriptions

    # Load the model once up front; loading it again for every file only
    # repeats identical work.
    model = whisper.load_model(model_name)

    for filename in tqdm(files, desc="Transcribing audio files"):
        file_path = os.path.join(audio_dir, filename)

        # Perform transcription. The text is taken here, before the
        # ground-truth lookup, so the record always carries this file's own
        # transcription even when no ground truth exists.
        result = model.transcribe(file_path)
        predicted_text = result["text"]

        # Defaults used when the file cannot be scored.
        ground_truth = ""
        accuracy = 0.0

        # Get ground truth from the corresponding .txt file.
        txt_path = os.path.join(audio_dir, os.path.splitext(filename)[0] + ".txt")
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8") as gt_file:
                ground_truth = gt_file.read()

            # WER divides by the number of reference words, so it is undefined
            # for an empty reference; leave such files at the 0.0 default.
            if ground_truth.strip():
                # NOTE(review): newer jiwer releases appear to name this
                # keyword `reference_transform` - confirm against the
                # installed version before upgrading.
                error = jiwer.wer(
                    ground_truth,
                    predicted_text,
                    truth_transform=transforms,
                    hypothesis_transform=transforms,
                )
                accuracy = (1 - error) * 100

        transcriptions.append({
            "filename": filename,
            "transcription": predicted_text,
            "ground_truth": ground_truth,
            "accuracy": round(accuracy, 2)
        })

    return transcriptions

def save_transcriptions_to_csv(transcriptions, output_file="baseTranscriptions1.csv"):
    """Write the per-file transcription records to a UTF-8 CSV file.

    Args:
        transcriptions: Iterable of dicts with the keys ``filename``,
            ``transcription``, ``ground_truth`` and ``accuracy``.
        output_file: Path of the CSV to create; an existing file is overwritten.
    """
    header = ["Filename", "Transcription", "Ground Truth", "Accuracy (%)"]
    record_keys = ("filename", "transcription", "ground_truth", "accuracy")

    # newline="" lets the csv module control line endings itself.
    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_handle:
        sheet = csv.writer(csv_handle)
        sheet.writerow(header)
        sheet.writerows([record[key] for key in record_keys] for record in transcriptions)

# Transcribe the whole dataset folder and echo the raw per-file records.
audio_dir = "dataset/b/"
transcriptions = transcribe_with_whisper(audio_dir)
print("Transcriptions : ", transcriptions)

# Per-file scores, with any negative accuracy floored at 0 before averaging.
adjusted_accuracies = [
    score if score >= 0 else 0
    for score in (entry["accuracy"] for entry in transcriptions)
]

# Report the mean only when at least one file was processed.
if len(adjusted_accuracies) > 0:
    overall_accuracy = sum(adjusted_accuracies) / len(adjusted_accuracies)
    print(f"\nOverall Accuracy (adjusted): {overall_accuracy:.2f}%")

# Persist the per-file results to the function's default output path.
save_transcriptions_to_csv(transcriptions)
print("Transcriptions saved to baseTranscriptions1.csv")

Transcribing audio files: 100%|████████████████████████████████████████████████████████| 51/51 [09:08<00:00, 10.76s/it]

Transcriptions :  [{'filename': '104.73_105.551.mp3', 'transcription': ' all done.', 'ground_truth': 'all done', 'accuracy': 100.0}, {'filename': '121.222_121.803.mp3', 'transcription': ' learning. you', 'ground_truth': 'no running', 'accuracy': 0.0}, {'filename': '155.967_183.103.mp3', 'transcription': " The drafts can get her. Not to fall down. The elephant is hurt. In the elephant he's crying in the draft is help in the...", 'ground_truth': 'giraffe is going to get her not to fall down the elephant is hurt and the elephant he is crying and the giraffe is helping', 'accuracy': 60.0}, {'filename': '201.003_203.184.mp3', 'transcription': ' It hurts.', 'ground_truth': 'it hurts', 'accuracy': 100.0}, {'filename': '205.665_206.465.mp3', 'transcription': ' done', 'ground_truth': 'hold still', 'accuracy': 0.0}, {'filename': '206.585_208.246.mp3', 'transcription': " I won't hurt him.", 'ground_truth': 'it will not hurt', 'accuracy': 50.0}, {'filename': '214.989_215.429.mp3', 'transcription':


