<a href="https://colab.research.google.com/github/ArielMobileLab/Archive/blob/main/whisper_timestamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip3 install whisper-timestamped

Collecting whisper-timestamped
  Downloading whisper_timestamped-1.15.4-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.5/53.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting dtw-python (from whisper-timestamped)
  Downloading dtw_python-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.5/770.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai-whisper (from whisper-timestamped)
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [

Cell 1: Mount Google Drive and Set Up Folders

In [2]:
# Import necessary libraries
import os
from google.colab import drive

# Mount Google Drive
drive.mount("/content/drive", force_remount=True)  # This will prompt for authorization

# Define paths for audio files and transcription files
audio_folder_path = "/content/drive/MyDrive/Whisper/Audio/"
transcription_folder_path = "/content/drive/MyDrive/Whisper/Transcriptions/"

# Create the "Transcriptions" folder if it doesn't exist.
# exist_ok=True makes this a single idempotent call instead of a separate
# exists-check followed by a create.
os.makedirs(transcription_folder_path, exist_ok=True)

print("Google Drive mounted successfully.")
print(f"Audio files will be read from: {audio_folder_path}")
print(f"Transcription files will be saved to: {transcription_folder_path}")


Mounted at /content/drive
Google Drive mounted successfully.
Audio files will be read from: /content/drive/MyDrive/Whisper/Audio/
Transcription files will be saved to: /content/drive/MyDrive/Whisper/Transcriptions/


Cell 2: List Audio Files

In [3]:
# File extensions (compared in lower case) that count as audio input
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a')

# Get a list of all the audio file paths in the folder.
# - The extension check is case-insensitive so files such as "REC.WAV" are kept.
# - The list is sorted because os.listdir() returns entries in arbitrary order;
#   sorting makes the processing order reproducible between runs.
audio_files = sorted(
    os.path.join(audio_folder_path, file)
    for file in os.listdir(audio_folder_path)
    if file.lower().endswith(AUDIO_EXTENSIONS)
)

# Print the file paths
for p in audio_files:
    print(p)

# Print the total number of audio files
print(f"\033[1mThere are {len(audio_files)} audio files to transcribe.")


/content/drive/MyDrive/Whisper/Audio/recording.wav
[1mThere are 1 audio files to transcribe.


Cell 3: Install Whisper Timestamp (the `whisper-timestamped` install command is the first code cell of this notebook)

Cell 4: Define the Transcription Function

In [14]:
# Import necessary libraries
import subprocess
import json

# Function to transcribe using a specific model and adjust the start time of the first word
def _run_whisper_timestamped(audio_file, model_name, output_dir):
    """Run the whisper_timestamped CLI with one model and return its parsed ``.words.json``."""
    # An argument list (no shell) passes the paths through verbatim, so file
    # names containing quotes, spaces or "$" cannot break or alter the command.
    subprocess.run(
        [
            "whisper_timestamped", audio_file,
            "--model", model_name,
            "--language", "he",
            "--output_dir", output_dir,
        ],
        check=True,
    )
    words_json_path = os.path.join(output_dir, os.path.basename(audio_file) + ".words.json")
    # The transcription text is Hebrew, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(words_json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _first_word_start(result):
    """Return the start time of the first word of the first segment, or None if there is none."""
    result_segments = result.get('segments') or []
    if result_segments and result_segments[0].get('words'):
        return result_segments[0]['words'][0]['start']
    return None


# Function to transcribe using a specific model and adjust the start time of the first word
def transcribe_and_adjust_timestamps(audio_file, transcription_folder_path):
    """Transcribe one audio file and save a copy with an adjusted first start time.

    The ``whisper_timestamped`` command is run twice on ``audio_file``: first
    with the ``medium`` model, then with the ``large`` model. The start time of
    the first word in the medium result is added to the start of the first
    segment, and to the start of the first word, of the large result. The
    adjusted large result is written to ``<basename>_adjusted.words.json`` in
    ``transcription_folder_path``.

    Both results are read from the same ``<basename>.words.json`` path, so the
    medium result is loaded before the large model is run.

    If the medium result has no segments or no words in its first segment
    (for example, silent audio), a warning is printed and the large result is
    saved without any adjustment.

    Args:
        audio_file: Path of the audio file to transcribe.
        transcription_folder_path: Directory that receives the output files.

    Raises:
        subprocess.CalledProcessError: If either ``whisper_timestamped`` run
            exits with a non-zero status.
    """
    base_name = os.path.basename(audio_file)

    # Medium model: only the start time of its first word is used
    medium_result = _run_whisper_timestamped(audio_file, "medium", transcription_folder_path)
    start_time_first_word = _first_word_start(medium_result)
    if start_time_first_word is None:
        print(f"Warning: medium model found no words in {audio_file}; timestamps left unadjusted")
        start_time_first_word = 0.0

    # Large model: this result is the one that gets saved
    large_result = _run_whisper_timestamped(audio_file, "large", transcription_folder_path)

    # NOTE(review): the medium start time is *added* to the large model's
    # start times (not substituted for them) - confirm this is the intent.
    large_segments = large_result.get('segments') or []
    if large_segments:
        first_segment = large_segments[0]
        first_segment['start'] += start_time_first_word
        # Adjust only the first word's start time in the first segment
        if first_segment.get('words'):
            first_segment['words'][0]['start'] += start_time_first_word

    # Save the adjusted transcription to a new JSON file
    adjusted_output_file = os.path.join(transcription_folder_path, base_name + "_adjusted.words.json")
    with open(adjusted_output_file, 'w', encoding='utf-8') as f:
        json.dump(large_result, f, ensure_ascii=False, indent=2)

    print(f"Transcription for {audio_file} saved to {adjusted_output_file}")


In [15]:
# Process each audio file
# Each call runs whisper_timestamped twice (medium, then large model) and
# writes "<basename>_adjusted.words.json" into transcription_folder_path.
# check=True inside the function means a failed run raises and stops the loop.
for audio_file in audio_files:
    transcribe_and_adjust_timestamps(audio_file, transcription_folder_path)

# "\033[1m" is the ANSI escape that switches the terminal output to bold
print(f"\033[1mProcessing completed. Transcriptions saved to {transcription_folder_path}")


Transcription for /content/drive/MyDrive/Whisper/Audio/recording.wav saved to /content/drive/MyDrive/Whisper/Transcriptions/recording.wav_adjusted.words.json
[1mProcessing completed. Transcriptions saved to /content/drive/MyDrive/Whisper/Transcriptions/


## Combine data

In [16]:
import json
from datetime import datetime, timedelta

# Define the calculate_recording_time function
def calculate_recording_time(recording_data):
    """Return each record's elapsed time, in seconds, since the first record.

    Args:
        recording_data: Sequence of dicts, each holding a ``'World_time'``
            string in ``%H:%M:%S.%f`` format (time of day with a fractional
            second, no date).

    Returns:
        A list of floats, one per record and in input order. The first entry
        is ``0.0``. An empty input yields an empty list.

    Raises:
        ValueError: If a ``World_time`` string does not match the format.
    """
    # Nothing to measure against when there are no records
    if not recording_data:
        return []

    # Extracting World_time values and converting them to datetime objects
    world_times = [datetime.strptime(record['World_time'], '%H:%M:%S.%f') for record in recording_data]
    first_time = world_times[0]
    one_day = timedelta(days=1)

    recording_times = []
    for world_time in world_times:
        elapsed = world_time - first_time
        # World_time has no date part, so a record taken after midnight parses
        # as earlier than the first one; treat that as a roll-over into the
        # next day instead of reporting a negative elapsed time.
        if elapsed < timedelta(0):
            elapsed += one_day
        recording_times.append(elapsed.total_seconds())

    return recording_times


In [17]:
# Define the file paths
adjusted_output_file = "/content/drive/MyDrive/Whisper/Transcriptions/recording.wav_adjusted.words.json"
recording_data_file = "/content/drive/MyDrive/Whisper/Transcriptions/recording_data.json"

# Largest gap (in seconds) allowed between a transcript segment's start time
# and the elapsed time of the record it is paired with.
MATCH_TOLERANCE_SECONDS = 1  # Adjust the tolerance as needed

# Load data from the adjusted transcription JSON file.
# It is written with ensure_ascii=False, so decode it explicitly as UTF-8.
with open(adjusted_output_file, "r", encoding="utf-8") as file:
    recording = json.load(file)

# Load data from recording_data.json
with open(recording_data_file, "r", encoding="utf-8") as file:
    recording_data = json.load(file)

# Initialize list to store the combined segments
segments = []

# Elapsed time (seconds since the first record) of every record in recording_data
recording_times = calculate_recording_time(recording_data)

# Iterate through each segment of the transcription
for segment in recording['segments']:
    # Extract start time from the segment
    start_time = segment['start']

    # Calculate duration of the segment
    duration = segment['end'] - segment['start']

    # Find the record whose elapsed time is NEAREST to the segment start.
    # Taking the first record inside the tolerance window instead would pair
    # the segment with a record up to a full tolerance earlier than its start.
    closest_index = min(
        range(len(recording_times)),
        key=lambda index: abs(start_time - recording_times[index]),
        default=None,  # recording_data may be empty
    )
    match_found = (
        closest_index is not None
        and abs(start_time - recording_times[closest_index]) < MATCH_TOLERANCE_SECONDS
    )

    # Report segments that have no record close enough in time
    if not match_found:
        print(f"No matching record time found for start time: {start_time}")
        continue

    audio_segment = recording_data[closest_index]

    # Extract world time and simulation time from the matched record
    world_time = audio_segment["World_time"]
    simulation_time = audio_segment["simulation_time"]

    # Calculate end simulation time
    end_simulation_time = simulation_time + duration

    # Create a dictionary for the segment
    segment_info = {
        "text": segment["text"],
        "start_time": str(timedelta(seconds=start_time)),
        "end_time": str(timedelta(seconds=start_time + duration)),
        "world_time": world_time,
        "simulation_time": simulation_time,
        "end_simulation_time": end_simulation_time
    }

    # Append segment to list of segments
    segments.append(segment_info)

# Write the segments to a new JSON file.
# ensure_ascii=False keeps the transcribed text readable in the file, matching
# how the adjusted transcription itself is saved.
combined_output_file = "/content/drive/MyDrive/Whisper/Transcriptions/combined_segments.json"
with open(combined_output_file, "w", encoding="utf-8") as file:
    json.dump(segments, file, indent=4, ensure_ascii=False)

print(f"Combined segments saved to {combined_output_file}")


Combined segments saved to /content/drive/MyDrive/Whisper/Transcriptions/combined_segments.json


In [18]:
# Load and display the combined output file
combined_output_file = "/content/drive/MyDrive/Whisper/Transcriptions/combined_segments.json"

# Read as UTF-8 explicitly rather than with the platform's default encoding,
# so any non-ASCII text in the file decodes the same way everywhere.
with open(combined_output_file, "r", encoding="utf-8") as file:
    combined_segments = json.load(file)

# Display the contents of the combined output file
# (ensure_ascii=False prints the Hebrew text itself instead of \uXXXX escapes)
print(json.dumps(combined_segments, indent=4, ensure_ascii=False))


[
    {
        "text": " אחד אני נוסע",
        "start_time": "0:00:02.560000",
        "end_time": "0:00:05.940000",
        "world_time": "15:38:37.168",
        "simulation_time": 251.33334644153032,
        "end_simulation_time": 254.71334644153032
    },
    {
        "text": " שמאלה",
        "start_time": "0:00:07.980000",
        "end_time": "0:00:08.760000",
        "world_time": "15:38:42.587",
        "simulation_time": 255.0666799695743,
        "end_simulation_time": 255.8466799695743
    },
    {
        "text": " חמישים ושלוש",
        "start_time": "0:00:10.120000",
        "end_time": "0:00:11.400000",
        "world_time": "15:38:44.729",
        "simulation_time": 256.56668004780624,
        "end_simulation_time": 257.8466800478062
    },
    {
        "text": " קדימה",
        "start_time": "0:00:12.840000",
        "end_time": "0:00:13.460000",
        "world_time": "15:38:47.462",
        "simulation_time": 258.46668014690005,
        "end_simulation_time": 259.0

## Combine through TSV file