# Things to load, import and install before starting

In [None]:
!pip install openai-whisper #Whisper models
!pip install jiwer #WER computation
!pip install codecarbon #carbon footprint of my code

In [2]:
import whisper
from jiwer import wer
import os
import jiwer
import json
import re
from codecarbon import EmissionsTracker
from statistics import mean
import matplotlib.pyplot as plt


In [3]:
# Google Colab only: mount Google Drive so the notebook can access the project files
from google.colab import drive
drive.mount('/content/drive/')
# IPython magic: make the Drive root the working directory of the following cells
%cd /content/drive/My Drive

Mounted at /content/drive/
/content/drive/My Drive


In [None]:
# Load the two Whisper checkpoints compared in this notebook: "tiny" first, then "small"
tiny_model, small_model = whisper.load_model("tiny"), whisper.load_model("small")

100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 125MiB/s]
100%|████████████████████████████████████████| 461M/461M [00:04<00:00, 108MiB/s]


# All functions used for transcription, data processing and WER computation

In [4]:
#transcriptions
def transcribeAllFiles(model_v1, model_v2, path_to_folder: str = '.'):
    """
    Transcribe every .wav file of a folder with two versions of the Whisper model.

    Each file is handed to the models by its full path, so the function works
    with relative folders and leaves the current working directory untouched.

    :param whisper.model.Whisper model_v1 : first model, eg. Whisper-tiny
    :param whisper.model.Whisper model_v2 : second model, eg. Whisper-small
    :param str path_to_folder : folder which directly contains the .wav files,
        defaults to '.' (current folder); None is treated like '.'
    :return: dictionary with the name of the file as key and a tuple
        (model_v1 transcription, model_v2 transcription) as value
    """
    if path_to_folder is None:
        path_to_folder = '.'

    # Resolve the folder once so every audio path stays valid whatever the
    # working directory is
    folder = os.path.abspath(path_to_folder)
    print(folder)

    results = {}

    # Sorted so the processing order (and the progress counter) is reproducible
    wav_files = sorted(name for name in os.listdir(folder) if name.endswith('.wav'))

    for count, name in enumerate(wav_files, start=1):
        # Progress indicator: transcribing a whole folder can take a while
        print(count)
        audio_path = os.path.join(folder, name)

        # Transcription with the first model
        tiny_text = model_v1.transcribe(audio_path)["text"]

        # Transcription with the other model
        small_text = model_v2.transcribe(audio_path)["text"]

        results[name] = (tiny_text, small_text)

    return results

def create_json_file(input_dict, category, file_path):
    """
    Write the transcriptions of one category to a json file.

    The file contains an object with the "category" and a "files" list; every
    entry of the list holds a "filename" and its "tiny" / "small" "transcriptions".

    :param input_dict: Dictionary with filenames as keys and tuples (tiny, small transcriptions) as values.
    :param category: String representing the category.
    :param file_path: Path of the json file to write (opened in 'w' mode).
    :return: None
    """
    entries = [
        {
            "filename": name,
            "transcriptions": {
                "tiny": tiny,
                "small": small
            }
        }
        for name, (tiny, small) in input_dict.items()
    ]
    payload = {"category": category, "files": entries}

    # Serialise before opening, so the file is only touched once the data is valid json
    serialized = json.dumps(payload, indent=4)

    with open(file_path, 'w') as sink:
        sink.write(serialized)

# Function to create a JSON file from the results dictionary
def create_json_file_WER(results, filename):
    """
    Save a results dictionary as an indented json file.

    :param results dict: dictionary to convert to json
    :param filename str: name of the file receiving the json dump (opened in 'w' mode)
    :return: None
    """
    with open(filename, mode='w') as sink:
        json.dump(results, fp=sink, indent=4)

# Display of results for test
def display_transcriptions(results : dict):
    """
    Print, for every file, its name and both transcriptions followed by a separator line.

    :param results dict: name of the file as a key, tuple with tiny and small transcription as value
    """
    separator = "-" * 30
    for name, (tiny, small) in results.items():
        print(
            f"File: {name}",
            f"Tiny Model: {tiny}",
            f"Small Model: {small}",
            separator,
            sep="\n",
        )

#evaluation visualisation --> not used for WER, just for test to understand how jiwer works
def visualisation_evaluation(gold_data, transcriptions_dict):
    """
    Print the jiwer alignment of each transcription against its gold transcription.

    For every file the alignment of the tiny transcription is printed first,
    then the alignment of the small transcription.

    :param gold_data: mapping indexed by file name that gives the gold transcription
    :param transcriptions_dict: name of the file as a key, tuple with tiny and small transcription as value
    """
    for name, (tiny_text, small_text) in transcriptions_dict.items():
        for hypothesis in (tiny_text, small_text):
            alignment = jiwer.process_words(gold_data[name], [hypothesis])
            print(jiwer.visualize_alignment(alignment))


#gold data as a dict with only audio name and transcription
def process_json_gold_data(path):
    """
    Load the gold data from the json file and index the transcriptions by audio file name.

    Each item's "video_path" is searched for a "<digits>_<digits>.MP4" name; the
    matching part becomes the key "<digits>_<digits>.wav" so that it corresponds
    to the audio files. Items whose path does not match are skipped.

    :param path str: path to the json file
    :return: dictionary with the audio file name as key and the item's
        "transcription_timestamp" value as value ("" when the item has none)
    """
    # json files are UTF-8; do not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The dot is escaped so that only a literal ".MP4" extension matches
    pattern = re.compile(r'(\d+_\d+)\.MP4')

    result = {}
    for item in data:
        match = pattern.search(item.get("video_path", ""))
        if match:
            # .wav instead of .MP4 to correspond to the audio files
            result[match.group(1) + ".wav"] = item.get("transcription_timestamp", "")

    return result

def list_subfolders(mother_folder):
    """
    Collect the names of the directories located directly inside a folder.

    :param mother_folder: Path to the mother folder
    :return: List of subfolder names (names only, not full paths)
    """
    names = []
    for entry in os.scandir(mother_folder):
        if entry.is_dir():
            names.append(entry.name)
    return names

def compute_wer(json_file, gold_data):
    """
    Score the tiny and small transcriptions of a JSON file against the gold data.

    Files that have no entry in the gold data are left out of the result.

    :param json_file: Path to the JSON file containing transcriptions
    :param gold_data: Dictionary with filenames as keys and gold transcriptions as values
    :return: Dictionary with filenames as keys and {'tiny_wer': ..., 'small_wer': ...} as values
    """
    with open(json_file, 'r') as handle:
        content = json.load(handle)

    scores = {}
    for entry in content['files']:
        name = entry['filename']
        if name not in gold_data:
            continue

        reference = gold_data[name]
        hypotheses = entry['transcriptions']
        tiny_hypothesis, small_hypothesis = hypotheses['tiny'], hypotheses['small']

        scores[name] = {
            'tiny_wer': jiwer.wer(reference, tiny_hypothesis),
            'small_wer': jiwer.wer(reference, small_hypothesis)
        }

    return scores


# Function to calculate the arithmetic mean of 'tiny_wer' and 'small_wer' from a json file
def calculate_wer_means(filename):
    """
    Average the 'tiny_wer' and 'small_wer' values stored in a json file.

    :param filename: path to a json file whose values each hold a 'tiny_wer' and a 'small_wer'
    :return: tuple (mean of the 'tiny_wer' values, mean of the 'small_wer' values)
    """
    with open(filename, 'r') as handle:
        per_file = json.load(handle)

    # One list of scores per model, collected in the order tiny then small
    tiny_scores, small_scores = (
        [entry[key] for entry in per_file.values()]
        for key in ('tiny_wer', 'small_wer')
    )

    return mean(tiny_scores), mean(small_scores)



# Transcriptions of clean (non-noisy) speech data

In [None]:
# Original (clean) speech files: transcribe them with both models, then store the result as json
ori_transcriptions = transcribeAllFiles(
    tiny_model,
    small_model,
    path_to_folder='/content/drive/My Drive/wav_data/',
)
create_json_file(
    ori_transcriptions,
    "original_transcriptions",
    '/content/drive/My Drive/transcriptions/original_transcriptions.json',
)

#--------------------
#display_transcriptions(transcriptions)

#visualisation_evaluation(gold_data, transcriptions)

   is technology making our attention span shorter?     ah technology is definitely making our attention span shorter. um  with social media. with apps. uh you get your one minute video and you know these days if it's  anything longer than that we can't be bothered.   um we only wanna you know read the comments we only wanna you know go through and get the gist of things and  um I know that if  you know you   see a long paragraph on the internet. you will look and most people say well I'm not reading that.  can someone tell me what it says? it's too long I'm not reading it.   ah so I think that um different apps particularly Instagram where everything's gotta be   quick and you know you you gotta get the short version of something in order to care.  uh I think that's definitely making um our attention span shorter.  um  social media, Facebook  you know you make your short post. if the post's too long no one wants to read it.  um even YouTube videos you know once we see oh this video  f

# Transcription of noisy speech

In [None]:
# Noisy speech: transcribe every noise category of one SNR level with both models
# and measure the emissions of the whole run with codecarbon.

snr_level = '0dbSNR'  # change to '10dbSNR' for the other noise level

folder = f'/content/drive/My Drive/noisy_speech_{snr_level}'
output_folder = f'/content/drive/My Drive/transcriptions_{snr_level}'

# Categories that are not transcribed by this run
skipped_categories = ("cat", "chainsaw", "crickets", "crying_baby", "engine")

# Create the output folder through its absolute path (the same one the json
# files are written to), independently of the current working directory
os.makedirs(output_folder, exist_ok=True)

tracker = EmissionsTracker()
tracker.start()
try:
    for category in list_subfolders(folder):
        if category in skipped_categories:
            continue
        transcriptions = transcribeAllFiles(tiny_model, small_model, path_to_folder=f'{folder}/{category}')
        create_json_file(transcriptions, category, f'{output_folder}/{category}.json')
finally:
    # Stop the tracker even if a transcription fails, so the measurement is still reported
    emissions: float = tracker.stop()
    print(f"Emissions: {emissions} kg")

# WER computation

In [5]:
gold_data = process_json_gold_data('/content/drive/My Drive/gold/cleaned_transcriptions.json')

categories = ('original_transcriptions', 'cat', 'chainsaw', 'crickets', 'crying_baby', 'engine', 'glass_breaking', 'helicopter', 'keyboard_typing', 'laughing', 'vacuum_cleaner')
SNR_10db = {}
SNR_0db = {}

# Same pipeline for both noise levels, 0 dB first then 10 dB
for snr_suffix, wer_means in (('_0dbSNR', SNR_0db), ('_10dbSNR', SNR_10db)):
    wer_folder = f'/content/drive/My Drive/WERs/WERs{snr_suffix}'
    # Make sure the folder receiving the WER files exists
    os.makedirs(wer_folder, exist_ok=True)

    for cat in categories:
        # The clean transcriptions are stored in the plain "transcriptions" folder
        snr = '' if cat == 'original_transcriptions' else snr_suffix
        wer_file = f'{wer_folder}/wer_{cat}.json'

        create_json_file_WER(compute_wer(f'/content/drive/My Drive/transcriptions{snr}/{cat}.json', gold_data), wer_file)

        # Read the WER file back once and reuse both means
        tiny_mean, small_mean = calculate_wer_means(wer_file)
        print(snr + " " + cat + " tiny: " + str(tiny_mean) + " small: " + str(small_mean))
        wer_means[cat] = (tiny_mean, small_mean)


print(SNR_10db)
print(SNR_0db)



   is technology making our attention span shorter?     ah technology is definitely making our attention span shorter. um  with social media. with apps. uh you get your one minute video and you know these days if it's  anything longer than that we can't be bothered.   um we only wanna you know read the comments we only wanna you know go through and get the gist of things and  um I know that if  you know you   see a long paragraph on the internet. you will look and most people say well I'm not reading that.  can someone tell me what it says? it's too long I'm not reading it.   ah so I think that um different apps particularly Instagram where everything's gotta be   quick and you know you you gotta get the short version of something in order to care.  uh I think that's definitely making um our attention span shorter.  um  social media, Facebook  you know you make your short post. if the post's too long no one wants to read it.  um even YouTube videos you know once we see oh this video  f

"\nwer_cat = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/cat.json', gold_data)\nwer_chainsaw = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/chainsaw.json', gold_data)\nwer_crickets = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/crickets.json', gold_data)\nwer_crying_baby = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/crying_baby.json', gold_data)\nwer_original_transcriptions = compute_wer('/content/drive/My Drive/transcriptions/original_transcriptions.json', gold_data)\nwer_engine = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/engine.json', gold_data)\nwer_glass_breaking = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/glass_breaking.json', gold_data)\nwer_helicopter = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/helicopter.json', gold_data)\nwer_keyboard_typing = compute_wer('/content/drive/My Drive/transcriptions_0dbSNR/keyboard_typing.json', gold_data)\nwer_laughing = compute_wer('/conte