In [1]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import re
import os
import sys
import csv
import nltk
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Parameters Initialization

In [3]:
# Root folder of the project (relative path, resolved against the working directory)
base_path = 'drive/MyDrive/ml4science'

# Input folders: audio files and JSON files
audio_directory = f'{base_path}/wavs/'
json_directory = f'{base_path}/json/'

# Output CSV files
csv_file_path_1 = f'{base_path}/csv/transcripts_teams.csv'     # 1st CSV file
csv_file_path_2 = f'{base_path}/csv/transcripts_speakers.csv'  # 2nd CSV file
csv_file_path_3 = f'{base_path}/csv/speaking_time.csv'         # 3rd CSV file

# Sorted names of the regular files (sub-folders excluded) in the audio directory
audio_files = sorted(
    entry
    for entry in os.listdir(audio_directory)
    if os.path.isfile(os.path.join(audio_directory, entry))
)

# Sorted names of the regular files (sub-folders excluded) in the JSON directory
json_files = sorted(
    entry
    for entry in os.listdir(json_directory)
    if os.path.isfile(os.path.join(json_directory, entry))
)

# Make helpers.py (stored in the project folder) importable, then pull in its functions
sys.path.append(base_path)
from helpers import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creation of a CSV file `transcripts_teams.csv` from the JSON files, containing:
- `team_id` : id number of the team (name of the audio file)
- `initial_transcript` : the initial transcript from whisperX without any modifications
- `filtered_transcript` : the filtered transcript which contains only the words with a level of confidence (score) > 0.5
- `clean_final_transcript` : the final clean transcript filtered with the clean_text() function

See the helpers.py file for the : `extract_transcript_with_scores()` function.

*Note :* Running time of 12 s with the V100 GPU

In [4]:
# Build transcripts_teams.csv: one row per audio file that has a matching
# transcript_<name>.json file in the JSON directory.
# Both files are opened with an explicit UTF-8 encoding so that the result does
# not depend on the platform's default locale.
with open(csv_file_path_1, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row
    csv_writer.writerow(['team_id', 'initial_transcript', 'filtered_transcript', 'clean_final_transcript'])

    for audio_file in audio_files:
        # Audio file name without its extension, used as the team id
        file_name = os.path.splitext(os.path.basename(audio_file))[0]

        # JSON file expected for this audio file
        json_file_path = os.path.join(json_directory, f'transcript_{file_name}.json')

        # Nothing to extract when the JSON file is missing
        if not os.path.exists(json_file_path):
            print(f"JSON file not found for {audio_file}. Skipping...")
            continue

        with open(json_file_path, 'r', encoding='utf-8') as jsonfile:
            result = json.load(jsonfile)

        # Initial transcript (no score threshold passed)
        initial_transcript, _ = extract_transcript_with_scores(result)

        # Filtered transcript (score threshold of 0.5)
        filtered_transcript, _ = extract_transcript_with_scores(result, min_score=0.5)

        # Final transcript, cleaned with clean_text()
        clean_final_transcript = clean_text(filtered_transcript)

        csv_writer.writerow([file_name, initial_transcript, filtered_transcript, clean_final_transcript])

# Reported only after the `with` block has closed (and flushed) the CSV file
print(f"\nCSV file created at {csv_file_path_1}")


CSV file created at drive/MyDrive/ml4science/csv/transcripts_teams.csv


## Creation of a CSV file `transcripts_speakers.csv` from the JSON files, containing:
- `team_id` : id number of the team (name of the audio file)
- `speaker_id` : id number of the team speaker
- `speaker_initial_transcript` : initial transcript segments of the speaker
- `speaker_filtered_transcript` : filtered transcript segments of the speaker with a level of confidence (score) > 0.5
- `speaker_clean_final_transcript` : final clean transcript segments of the speaker filtered with the clean_text() function

See the helpers.py file for the : `extract_transcript_from_segments()` & `extract_speaker_info()` functions.

*Note :* Running time of 14 s with the V100 GPU

In [5]:
# Build transcripts_speakers.csv: one row per entry returned by
# extract_speaker_info() for every audio file that has a matching
# transcript_<name>.json file in the JSON directory.
# Both files are opened with an explicit UTF-8 encoding so that the result does
# not depend on the platform's default locale.
with open(csv_file_path_2, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row
    csv_writer.writerow(['team_id', 'speaker_id', 'speaker_initial_transcript', 'speaker_filtered_transcript', 'speaker_clean_final_transcript'])

    for audio_file in audio_files:
        # Audio file name without its extension
        file_name = os.path.splitext(os.path.basename(audio_file))[0]

        # JSON file expected for this audio file
        json_file_path = os.path.join(json_directory, f'transcript_{file_name}.json')

        # Nothing to extract when the JSON file is missing
        if not os.path.exists(json_file_path):
            print(f"JSON file not found for {audio_file}. Skipping...")
            continue

        with open(json_file_path, 'r', encoding='utf-8') as jsonfile:
            result = json.load(jsonfile)

        # Entries describing each speaker of this file
        speaker_info = extract_speaker_info(result, file_name)

        for info in speaker_info:
            # Clean the filtered transcript and keep it on the entry
            info['speaker_clean_final_transcript'] = clean_text(info['speaker_filtered_transcript'])

            csv_writer.writerow([
                info['team_id'],
                info['speaker_id'],
                info['speaker_initial_transcript'],
                info['speaker_filtered_transcript'],
                info['speaker_clean_final_transcript']
            ])

# Reported only after the `with` block has closed (and flushed) the CSV file
print(f"\nCSV file created at {csv_file_path_2}")


CSV file created at drive/MyDrive/ml4science/csv/transcripts_speakers.csv


## Creation of a CSV file `speaking_time.csv` from the JSON files, containing:
- `Team_ID` : id number of the team (name of the file)
- `speaker` : id number of the team speaker
- `length` : continuous speaking time for each speaker in seconds

See the helpers.py file for the : `extract_speaking_time()` function.



*Note :* Running time of 7 s with the V100 GPU

In [6]:
# Build speaking_time.csv from every file listed in the JSON directory.
# The per-file results are collected in a list and concatenated once at the end.
speaking_time_frames = []

# Iterate through each JSON file:
for json_file in json_files:

    # Get the filename (without extension)
    file_name = os.path.splitext(os.path.basename(json_file))[0]

    # Specify the JSON file path
    json_file_path = os.path.join(json_directory, f'{file_name}.json')

    # Skip the entry when no such JSON file exists; otherwise the `result` of the
    # previous iteration would be processed again under the wrong team id
    if not os.path.exists(json_file_path):
        print(f"JSON file not found for {json_file}. Skipping...")
        continue

    # Load the content from the JSON file
    with open(json_file_path, 'r') as jsonfile:
        result = json.load(jsonfile)

    s_cont = extract_speaking_time(result)

    # Add a 'Team_ID' column: the second '_'-separated part of the file name
    # NOTE(review): assumes names of the form transcript_<id>.json whose <id> has no '_' - confirm
    s_cont['Team_ID'] = file_name.split('_')[1]

    speaking_time_frames.append(s_cont)

# Concatenate the dataframes vertically (empty DataFrame when nothing was loaded)
if speaking_time_frames:
    data = pd.concat(speaking_time_frames, ignore_index=True)
else:
    data = pd.DataFrame()

# Deleting all the rows containing NaN values
data = data.dropna()
data = data.reset_index(drop=True)

# Save the DataFrame to CSV
data.to_csv(csv_file_path_3, index=False)
print(f"\nCSV file created at {csv_file_path_3}")


CSV file created at drive/MyDrive/ml4science/csv/transcripts_speakers.csv
