In [1]:
import pandas as pd

In [2]:
def convert_rttm_to_csv(file):
    """
    Inputs:
    file: str
        file path of the rttm file to be converted

    Outputs:
    df: dataframe
        Dataframe containing the extracted information from the rttm file
    """
    # read the file
    df = pd.read_csv(file, delimiter=" ", header=None)
    df = df[[3, 4, 7]]
    df.columns = ['start_time', 'duration', 'speaker_name']
    # compute the end time
    df['end_time'] = df['start_time'] + df['duration']
    # convert time to miliseconds
    df['start_time'] *= 1000
    df['end_time'] *= 1000
    # sort the df based on the start_time
    df.sort_values(by=['start_time'], inplace=True)
    # return
    return df


In [3]:
check = convert_rttm_to_csv("diarizationoutput.rttm")

In [5]:
import torch
import whisper
import librosa

# load whisper model
torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = whisper.load_model("tiny.en", device='cuda')

In [6]:
audio_path = "C:\\Users\\akish\Documents\\ML_Projects\\whisper\Medium_model_results\\new_audio\output_audio_shgn7_s001_s001_t002.mp3"

In [7]:
from pydub import AudioSegment

#importing file from location by giving its path
sound = AudioSegment.from_mp3(audio_path)

#Selecting Portion we want to cut
# StrtTime = 650.0
# EndTime = 14606.0

# Opening file and extracting portion of it


In [14]:
# extract = sound[StrtTime:EndTime]

# # Saving file in required location
# extract.export("portion.mp3", format="mp3")

# new file portion.mp3 is saved at required location

<_io.BufferedRandom name='portion.mp3'>

In [15]:
#transcribe the audio using the model
# transcription = model.transcribe("portion.mp3")

In [13]:
check

Unnamed: 0,start_time,duration,speaker_name,end_time
0,650.0,13.956,SPEAKER_01,14606.0
1,2185.0,0.742,SPEAKER_00,2927.0
2,6168.0,0.540,SPEAKER_00,6708.0
3,10032.0,0.456,SPEAKER_00,10488.0
4,14723.0,5.096,SPEAKER_00,19819.0
...,...,...,...,...
291,1739517.0,8.994,SPEAKER_00,1748511.0
292,1749372.0,4.438,SPEAKER_00,1753810.0
293,1754806.0,0.979,SPEAKER_01,1755785.0
294,1756223.0,0.456,SPEAKER_00,1756679.0


In [22]:
# create a csv file where first column is the speaker name, second column is Start time, 
# third column is End time, fourth column is the transcription

with open('transcription.csv', 'a') as f:
    for i in range(len(check)):
        timings = [check.iloc[i]['start_time'], check.iloc[i]['end_time']]
        # get the transcription for the timings 
        extract = sound[timings[0]:timings[1]]
        extract.export("portion.mp3", format="mp3")
        transcription = model.transcribe("portion.mp3")

        #convert timings to hrs, mins, secs and ms format
        start_time = str(timings[0]//3600000) + ":" + str((timings[0]//60000)%60) \
                    + ":" + str((timings[0]//1000)%60) + ":" + str(timings[0]%1000)

        end_time = str(timings[1]//3600000) + ":" + str((timings[1]//60000)%60) \
                    + ":" + str((timings[1]//1000)%60) + ":" + str(timings[1]%1000)

        f.write(check.iloc[i]['speaker_name'] + "," + start_time + "," + end_time + "," + transcription['text']+"\n")

In [9]:
# create a list of the start and end time for each speaker form check dataframe

# get unique speakers from the dataframe
unique_speakers = check['speaker_name'].unique()
# crate a dataframe with start and end times where speaker_name = unique_speakers[0]
df_speaker00 = check[check['speaker_name'] == unique_speakers[0]]
# create a list of start and end times
speaker00 = df_speaker00[['start_time', 'end_time']].values.tolist()

#create a dataframe with start and end times where speaker_name = unique_speakers[1]
df_speaker01 = check[check['speaker_name'] == unique_speakers[1]]
# create a list of start and end times
speaker01 = df_speaker01[['start_time', 'end_time']].values.tolist()

In [12]:
speaker00, speaker01

([[650.0, 14606.0],
  [36914.0, 37859.0],
  [46567.0, 52608.00000000001],
  [53350.0, 60758.0],
  [75963.0, 83488.99999999999],
  [94475.0, 95454.0],
  [99250.0, 101410.0],
  [102406.0, 125997.00000000001],
  [127567.0, 130537.0],
  [138502.0, 139380.0],
  [142568.0, 143749.00000000003],
  [145943.0, 147428.00000000003],
  [153470.0, 154972.0],
  [163105.0, 168606.0],
  [194864.0, 196619.0],
  [224969.0, 227804.0],
  [261233.0, 263815.0],
  [299287.0, 302274.0],
  [335922.0, 337711.0],
  [354637.0, 356055.0],
  [370010.0, 373976.0],
  [394681.0, 396621.99999999994],
  [404738.0, 406021.0],
  [430827.0, 435788.0],
  [444243.0, 445694.0],
  [468880.0, 470129.0],
  [483747.0, 486076.0],
  [501432.0, 503136.0],
  [510950.0, 512789.0],
  [542675.0, 545257.0],
  [558250.0, 559668.0],
  [602142.0, 605129.0],
  [617701.0, 622662.0],
  [625885.0, 628247.0],
  [635858.0, 638794.0],
  [664647.0, 669473.0000000001],
  [676207.0, 680780.0],
  [710648.0, 714614.0],
  [738695.0, 740922.0],
  [773677.

In [10]:
#create a file with transcription for speaker00
sound = AudioSegment.from_mp3(audio_path)
for times in speaker00:
    timings = [times[0], times[1]]
    # get the transcription for the timings 
    extract = sound[timings[0]:timings[1]]
    extract.export("portion.mp3", format="mp3")
    transcription = model.transcribe("portion.mp3")
    # save the transcription in a file
    with open("speaker00.txt", "a") as f:
        f.write("<"+str(times[0])+"> " + "<"+str(times[1])+"> " + transcription['text'] + '\n')
    
        

In [11]:
for times in speaker01:
    timings = [times[0], times[1]]
    # get the transcription for the timings 
    extract = sound[timings[0]:timings[1]]
    extract.export("portion.mp3", format="mp3")
    transcription = model.transcribe("portion.mp3")
    # save the transcription in a file
    with open("speaker01.txt", "a") as f:
        f.write("<"+str(times[0])+"> " + "<"+str(times[1])+"> "+ transcription['text'] + '\n')

####Structre
# 
# |timestamps||speaker 0| |speaker 1|
# |---|---|---|---|
#for EDL file

---> export join csv
--->