In [1]:
from moviepy import VideoFileClip
import os

In [2]:
import sys
print(sys.executable)

c:\Users\BIT\Desktop\mmr\.venv\Scripts\python.exe


In [3]:
!uv pip list | grep -i moviepy

'grep' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
def extract_audio(video_path: str, output_dir: str = "../data/raw/", output_ext='.wav'):
    """
    Extracts audio from the video and saves it as a WAV file. 
    """
    # create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    
    # generate the file name
    filename = os.path.basename(video_path).split('.')[0]
    output_path = os.path.join(output_dir, f'{filename}{output_ext}')
    
    if os.path.exists(output_path):
        print(f"Audio already exists at : {output_path}")
        return output_path
    
    try:
        video_clip = VideoFileClip(video_path)
        
        if video_clip.audio is None:
            print("Error: This video has no sound")
            return None
        
        video_clip.audio.write_audiofile(output_path, logger='bar')
        video_clip.close()
        return output_path
    except Exception as e:
        print(f"Error: {e}")
        return None

In [None]:
path = extract_audio(video_path="../data/raw/Interview_2.mp4")

In [1]:
audio_path = "../data/raw/Interview_2.wav"

## Feature Engineering for Audio Data

In [7]:
import librosa
import numpy as np
import pandas as pd
import os

In [7]:
y, sr = librosa.load("../data/raw/Interview_1.wav", sr=None)

In [8]:
type(y)

numpy.ndarray

In [9]:
y.shape

(15259922,)

In [10]:
sr

44100

In [11]:
y.shape[0]/(sr*60) # this should give the duration of the visdeo

5.767166288737718

In [12]:
def analyze_audio_layers(audio_path: str, segment_length: float=0.5) -> pd.DataFrame:
    """
    Input: 
        audio_path: path to the audio file
        segment_length: time window in secs (same as video dataframe)
    Output:
        au_data: dataframe with TS features for analysis (Ready to go to the data analysis pipeline)
    """
    # check is the audio file exists
    if not os.path.exists(audio_path):
        print(f"Error: {audio_path} does not exit")
        return None
    
    # loading audio file
    y, sr = librosa.load(audio_path, sr=None)
    
    # total duration
    total_duration = librosa.get_duration(y=y, sr=sr)
    
    au_data = []
    
    # iterating through the audio chunks
    for t in np.arange(0, total_duration, segment_length):
        
        # calculating the starting and ending indexes for this chunk
        start_sample = int(t * sr)
        end_sample = int((t + segment_length)*sr)
        
        # getting the chunk for this iteration
        chunk = y[start_sample:end_sample]
        
        # check if the file ended
        if len(chunk) == 0: break
        
        # FEATURE - 1: AMPLITUDE (Confidence/Volume)
        rms = np.mean(librosa.feature.rms(y=chunk))
        
        # FEATURE - 2: SILENCE DETECTION
        # Threshold: 0.005 is a standard "noise floor" for webcams
        is_silent = rms < 0.005
        
        # FEATURE 3 & 4: PITCH TRACKING (Monotone vs Expressive)
        avg_pitch = 0
        pitch_var = 0
        
        # if not silent
        if not is_silent:
            f0, voiced_flag, _ = librosa.pyin(
                chunk,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C5'),
                sr=sr,
                frame_length=2048
            )
            
            # filtering out the NaNs (moments of unvoiced sound)
            valid_pitch = f0[~np.isnan(f0)]
            
            if len(valid_pitch) > 0:
                avg_pitch = np.mean(valid_pitch)
                # I think this is super cool this pitch var effectively measures you expressiveness
                pitch_var = np.std(valid_pitch) 
                
        # creating the row
        au_data.append({
            "Time": round(t, 2),
            "audio_rms(volumn)": round(rms, 4),
            "audio_pitch_avg": round(avg_pitch, 2),
            "audio_pitch_var(expressiveness)": round(pitch_var, 2),
            "is_silent": is_silent
        })
        
    # converting into a dataframe
    au_data = pd.DataFrame(au_data)
    au_data = au_data.sort_values('Time').reset_index(drop=True)
    
    return au_data
                  

In [13]:
df = analyze_audio_layers(audio_path=audio_path)

In [14]:
df.head(10)

Unnamed: 0,Time,audio_rms(volumn),audio_pitch_avg,audio_pitch_var(expressiveness),is_silent
0,0.0,0.0373,0.0,0.0,False
1,0.5,0.0533,169.25,11.45,False
2,1.0,0.0006,0.0,0.0,True
3,1.5,0.0593,179.94,14.03,False
4,2.0,0.0646,237.84,56.17,False
5,2.5,0.0497,220.1,60.33,False
6,3.0,0.0114,388.2,7.03,False
7,3.5,0.0705,220.93,24.25,False
8,4.0,0.0591,247.08,31.9,False
9,4.5,0.0461,243.6,65.59,False


In [15]:
df.to_csv("../data/processed/technical_data/Interview_2.csv", index=False)

In [16]:
df.tail()

Unnamed: 0,Time,audio_rms(volumn),audio_pitch_avg,audio_pitch_var(expressiveness),is_silent
1259,629.5,0.0014,0.0,0.0,True
1260,630.0,0.0991,318.96,73.03,False
1261,630.5,0.0959,274.32,2.1,False
1262,631.0,0.113,270.77,9.63,False
1263,631.5,0.0675,337.66,48.11,False


In [17]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print("Added project root to the system path")


Added project root to the system path


In [18]:
os.getcwd()

'c:\\Users\\BIT\\Desktop\\mmr\\notebooks'

In [None]:
from src.utils.plot_graphs import plot_beautiful

plot_beautiful(x=df['Time'], y=df['audio_rms(volumn)'], title="audio_rms(volumn)")

In [None]:
plot_beautiful(x=df['Time'], y=df['audio_pitch_var(expressiveness)'], title="audio_pitch_var(expressiveness)")

In [None]:
plot_beautiful(x=df['Time'], y=df['audio_pitch_avg'])

## Trying to Build the 3rd layer

In [None]:
import whisper

In [None]:
# loading model
model = whisper.load_model("base")

In [None]:
# audio = whisper.load_audio(file="../data/raw/Interview_1.wav")
# trim_audio = whisper.pad_or_trim(audio)

In [None]:
# mel = whisper.log_mel_spectrogram(trim_audio, n_mels=model.dims.n_mels).to(model.device)

In [None]:
# # detecting the language
# _, probs = model.detect_language(mel)
# print(f"Detected language: {max(probs, key=probs.get)}")

In [None]:
# # decoding the audio
# options = whisper.DecodingOptions()
# result = whisper.decode(model, mel=mel, options=options)
# print(result.text)

In [None]:
result = model.transcribe(audio="../data/raw/Interview_1.wav")
print(result["text"])

## Using Another version of whisper

In [2]:
import whisper_timestamped as wp

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [3]:
audio = wp.load_audio(file=audio_path)
model = wp.load_model('small', device='cpu')

In [4]:
result = wp.transcribe_timestamped(
    model=model,
    audio=audio,
    language=None,
    detect_disfluencies=True
)

Detected language: English


100%|██████████| 63183/63183 [06:40<00:00, 157.61frames/s]


In [5]:
result

{'text': " We're starting now. So welcome to the interview. Let's begin with a simple question. Can you tell me a little bit about your background and what got you interested in ML engineering? Yeah, so right now I'm in third year of my college. I mean, I'm going to be in the sixth semester, but yeah, from the start of my college, I think from my second semester towards the end of it, I got interested in like machine learning. I got to know about machine learning from some channels, from YouTube channels and all. And I thought, yeah, it is an interesting field. I got to know some things about it. I took a course, a very famous course from Andrew Ng, which is ML specialization course, which is on course era. So in the break, in the summer break, which I've got in the college, I actually completed that course. And actually I was very intrigued by how like we do all this stuff, what we do in the like machine learning field, right? So it was very interesting. And like after that, I was jus

In [10]:
import pandas as pd
wbyw = pd.DataFrame(result['segments'][0]['words'])

In [11]:
wbyw.head()

Unnamed: 0,text,start,end,confidence
0,We're,0.0,0.32,0.483
1,starting,0.32,0.62,0.997
2,now.,0.62,0.94,0.993
3,[*],0.94,1.72,0.0
4,So,1.72,1.76,0.708


In [12]:
len(wbyw)

18

In [13]:
len(result['segments'])

92

#### Let's create this wordbyword dataframe

In [15]:
dfs = []
for sen in result['segments']:
    df = pd.DataFrame(sen['words'])
    dfs.append(df)

In [16]:
wbyw = pd.concat(dfs)

In [17]:
wbyw.head()

Unnamed: 0,text,start,end,confidence
0,We're,0.0,0.32,0.483
1,starting,0.32,0.62,0.997
2,now.,0.62,0.94,0.993
3,[*],0.94,1.72,0.0
4,So,1.72,1.76,0.708


In [18]:
len(wbyw)

1873

In [19]:
wbyw.drop(columns=['confidence'], inplace=True)

In [20]:
wbyw.head()

Unnamed: 0,text,start,end
0,We're,0.0,0.32
1,starting,0.32,0.62
2,now.,0.62,0.94
3,[*],0.94,1.72
4,So,1.72,1.76


In [23]:
from pathlib import Path

current_dir = Path.cwd()
project_root = current_dir.parent

In [24]:
path = project_root / 'data' / 'processed' / 'whisper_data' / 'wbyw.csv'
wbyw.to_csv(str(path), index=False)

In [26]:
tr_df = pd.DataFrame(result["segments"])

In [27]:
tr_df.head()

Unnamed: 0,id,seek,start,end,text,tokens,temperature,avg_logprob,compression_ratio,no_speech_prob,confidence,words
0,0,0,0.0,5.69,We're starting now. So welcome to the intervi...,"[50364, 492, 434, 2891, 586, 13, 407, 2928, 28...",0.0,-0.14937,1.573222,0.105431,0.837,"[{'text': 'We're', 'start': 0.0, 'end': 0.32, ..."
1,1,0,5.69,9.36,me a little bit about your background and wha...,"[50648, 385, 257, 707, 857, 466, 428, 3678, 29...",0.0,-0.14937,1.573222,0.105431,0.973,"[{'text': 'me', 'start': 5.69, 'end': 5.86, 'c..."
2,2,0,11.18,18.06,"Yeah, so right now I'm in third year of my co...","[50916, 865, 11, 370, 558, 586, 286, 478, 294,...",0.0,-0.14937,1.573222,0.105431,0.887,"[{'text': 'Yeah,', 'start': 11.18, 'end': 11.3..."
3,3,0,18.76,25.44,"but yeah, from the start of my college, I thi...","[51304, 457, 1338, 11, 490, 264, 722, 295, 452...",0.0,-0.14937,1.573222,0.105431,0.906,"[{'text': 'but', 'start': 18.76, 'end': 19.12,..."
4,4,2584,25.94,33.45,I got interested in like machine learning. I ...,"[50364, 286, 658, 3102, 294, 411, 3479, 2539, ...",0.0,-0.120762,1.753488,0.026963,0.885,"[{'text': 'I', 'start': 25.94, 'end': 26.2, 'c..."


In [29]:
tr_df.seek.unique()

array([    0,  2584,  5152,  7336,  9864, 12616, 15328, 18008, 20224,
       23032, 25400, 27888, 30736, 33664, 35856, 38648, 41240, 43656,
       45936, 48568, 51432, 53864, 56648, 59232, 62160])

In [28]:
tr_df.columns

Index(['id', 'seek', 'start', 'end', 'text', 'tokens', 'temperature',
       'avg_logprob', 'compression_ratio', 'no_speech_prob', 'confidence',
       'words'],
      dtype='object')

In [30]:
tr_df.drop(columns=['tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob', 'confidence', 'words'], inplace=True)

In [31]:
path = project_root / 'data' / 'processed' / 'whisper_data' / 'Interview_2.csv'
tr_df.to_csv(str(path), index=False)

## Trying to get who is talking in the audio

In [26]:
import os
# import requests
from dotenv import load_dotenv
import assemblyai as aai
load_dotenv()
api_key=os.getenv("ASSEMBLYAI_API_KEY")

audio_path = "../data/raw/Interview_2.wav"

aai.settings.api_key = api_key

print("Uploading and transcribing...")
transcriber = aai.Transcriber()

# The SDK handles upload automatically
transcript = transcriber.transcribe(
    audio_path,  # Pass the file path directly
    config=aai.TranscriptionConfig(speaker_labels=True)
)

if transcript.status == aai.TranscriptStatus.error:
    print(f'Transcription failed: {transcript.error}')
else:
    print("Transcription complete!")
    
    # Save transcript
    with open("transcript_output.txt", 'w') as f:
        f.write(transcript.text)
    
    print(f"Preview: {transcript.text[:100]}")

Uploading and transcribing...
Transcription complete!
Preview: Alright, we're starting now, so welcome to the interview. Let's begin with a simple question. Can yo


In [34]:
transcript.utterances

[Utterance(text="Alright, we're starting now, so welcome to the interview. Let's begin with a simple question. Can you tell me a little bit about your background and what got you interested in ML Engineering?", start=80, end=9680, confidence=0.98873436, speaker='A', channel=None, words=[UtteranceWord(text='Alright,', start=80, end=280, confidence=0.82281494, speaker='A', channel=None), UtteranceWord(text="we're", start=280, end=480, confidence=0.9980469, speaker='A', channel=None), UtteranceWord(text='starting', start=480, end=800, confidence=0.9995117, speaker='A', channel=None), UtteranceWord(text='now,', start=800, end=1120, confidence=0.9980469, speaker='A', channel=None), UtteranceWord(text='so', start=1680, end=1960, confidence=0.9838867, speaker='A', channel=None), UtteranceWord(text='welcome', start=1960, end=2280, confidence=0.9998372, speaker='A', channel=None), UtteranceWord(text='to', start=2280, end=2440, confidence=1.0, speaker='A', channel=None), UtteranceWord(text='the'

In [35]:
# let's convert this into a dataframe
utt_data = []
for utt in transcript.utterances:
    row = {
        'text': utt.text,
        'start': utt.start,
        'end': utt.end,
        'confidence': utt.confidence,
        'speaker': utt.speaker,
        'channel': utt.channel,
        'words': utt.words,
        'translated_texts': utt.translated_texts
    }
    utt_data.append(row)
    
utt_data = pd.DataFrame(utt_data)
utt_data.head()

Unnamed: 0,text,start,end,confidence,speaker,channel,words,translated_texts
0,"Alright, we're starting now, so welcome to the...",80,9680,0.988734,A,,"[text='Alright,' start=80 end=280 confidence=0...",
1,"Yeah, so like right now I'm in third year of m...",11040,136520,0.975275,B,,"[text='Yeah,' start=11040 end=11600 confidence...",
2,That's a great background and it's always awes...,139650,162370,0.97751,A,,"[text=""That's"" start=139650 end=139890 confide...",
3,"Yeah, so handling or cleaning the data set is ...",163490,217580,0.963769,B,,"[text='Yeah,' start=163490 end=163930 confiden...",
4,Those are not always the right choices.,219100,221900,0.99893,B,,[text='Those' start=219100 end=219500 confiden...,


In [36]:
utt_data.to_csv("../data/processed/utterances_data/Interview_2.csv", index=False)