In [None]:
from moviepy import VideoFileClip
import os

In [None]:
import sys
# Print the path of the Python interpreter this kernel is running on
print(sys.executable)

In [None]:
!uv pip list | grep -i moviepy

In [None]:
def extract_audio(video_path: str, output_dir: str = "../data/raw/", output_ext='.wav'):
    """
    Extracts the audio track of a video and saves it as an audio file.

    Args:
        video_path: path to the source video file.
        output_dir: directory the audio file is written to (created if missing).
        output_ext: extension appended to the video's base name; the default
            produces a WAV file.

    Returns:
        The path of the audio file (already existing or newly written), or
        None when the video has no audio track or the extraction fails.
    """
    # create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # generate the file name: strip only the final extension so that dotted
    # names such as "take.01.mp4" keep their full stem ("take.01")
    filename = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.join(output_dir, f'{filename}{output_ext}')

    # skip the extraction when a previous run already produced the file
    if os.path.exists(output_path):
        print(f"Audio already exists at : {output_path}")
        return output_path

    video_clip = None
    try:
        video_clip = VideoFileClip(video_path)

        if video_clip.audio is None:
            print("Error: This video has no sound")
            return None

        video_clip.audio.write_audiofile(output_path, logger='bar')
        return output_path
    except Exception as e:
        # best-effort helper: report the problem and signal failure with None
        print(f"Error: {e}")
        return None
    finally:
        # always release the clip, including on the "no sound" return and on errors
        if video_clip is not None:
            video_clip.close()

In [None]:
path = extract_audio(video_path="../data/raw/Interview_2.mp4")

In [1]:
# Path extract_audio() produces for Interview_2.mp4 with its default output_dir and extension;
# the whisper-timestamped and AssemblyAI cells further down read the audio from this variable.
audio_path = "../data/raw/Interview_2.wav"

## Feature Engineering for Audio Data

In [1]:
import librosa
import numpy as np
import pandas as pd
import os


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/adi_6200_/mmr/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/adi_6200_/mmr/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/adi_6200_/mmr/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 758, in start
    self.io_loop.start()
  File "/h

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



In [None]:
# y: waveform samples, sr: sampling rate; sr=None keeps the file's native rate instead of resampling
y, sr = librosa.load("../data/raw/Interview_1.wav", sr=None)

In [None]:
type(y)

In [None]:
y.shape

In [None]:
sr

In [None]:
y.shape[0]/(sr*60) # duration of the audio in minutes: samples / (samples per second * 60)

In [None]:
def analyze_audio_layers(audio_path: str, segment_length: float=0.5, silence_threshold: float=0.005) -> pd.DataFrame:
    """
    Slices an audio file into fixed-length windows and computes features per window.

    Input:
        audio_path: path to the audio file
        segment_length: time window in secs (same as video dataframe); must be > 0
        silence_threshold: mean RMS below which a window is flagged as silent
            (0.005 is used as the "noise floor" for webcams)
    Output:
        au_data: dataframe with TS features for analysis (Ready to go to the data
            analysis pipeline). One row per window with the columns "Time",
            "audio_rms(volumn)", "audio_pitch_avg",
            "audio_pitch_var(expressiveness)" and "is_silent".
            An audio file without samples yields an empty dataframe with the
            same columns. None is returned when the audio file does not exist.
    Raises:
        ValueError: if segment_length is not a positive number
    """
    columns = [
        "Time",
        "audio_rms(volumn)",
        "audio_pitch_avg",
        "audio_pitch_var(expressiveness)",
        "is_silent",
    ]

    # a zero/negative (or NaN) window cannot tile the signal; fail early and clearly
    if not segment_length > 0:
        raise ValueError(f"segment_length must be > 0, got {segment_length}")

    # check if the audio file exists
    if not os.path.exists(audio_path):
        print(f"Error: {audio_path} does not exist")
        return None

    # loading audio file (sr=None: keep the sampling rate of the file)
    y, sr = librosa.load(audio_path, sr=None)

    # total duration
    total_duration = librosa.get_duration(y=y, sr=sr)

    # pitch search range is the same for every window, so compute it once
    pitch_fmin = librosa.note_to_hz('C2')
    pitch_fmax = librosa.note_to_hz('C5')

    au_data = []

    # iterating through the audio chunks
    for t in np.arange(0, total_duration, segment_length):

        # calculating the starting and ending indexes for this chunk
        start_sample = int(t * sr)
        end_sample = int((t + segment_length) * sr)

        # getting the chunk for this iteration
        chunk = y[start_sample:end_sample]

        # check if the file ended
        if len(chunk) == 0:
            break

        # FEATURE - 1: AMPLITUDE (Confidence/Volume)
        rms = np.mean(librosa.feature.rms(y=chunk))

        # FEATURE - 2: SILENCE DETECTION
        is_silent = rms < silence_threshold

        # FEATURE 3 & 4: PITCH TRACKING (Monotone vs Expressive)
        # both stay 0 for silent windows and for windows without any voiced frame
        avg_pitch = 0
        pitch_var = 0

        if not is_silent:
            f0, voiced_flag, _ = librosa.pyin(
                chunk,
                fmin=pitch_fmin,
                fmax=pitch_fmax,
                sr=sr,
                frame_length=2048
            )

            # filtering out the NaNs (moments of unvoiced sound)
            valid_pitch = f0[~np.isnan(f0)]

            if len(valid_pitch) > 0:
                avg_pitch = np.mean(valid_pitch)
                # spread of the pitch inside the window, used as the expressiveness measure
                pitch_var = np.std(valid_pitch)

        # creating the row
        au_data.append({
            "Time": round(t, 2),
            "audio_rms(volumn)": round(rms, 4),
            "audio_pitch_avg": round(avg_pitch, 2),
            "audio_pitch_var(expressiveness)": round(pitch_var, 2),
            "is_silent": is_silent
        })

    # converting into a dataframe; explicit columns keep the schema (and make the
    # sort below valid) even when no window was produced
    au_data = pd.DataFrame(au_data, columns=columns)
    au_data = au_data.sort_values('Time').reset_index(drop=True)

    return au_data
                  

In [None]:
df = analyze_audio_layers(audio_path="../data/raw/Interview_1.wav")

In [None]:
df.head(10)

In [None]:
df.tail()

In [None]:
import os
import sys

# The parent of the current working directory is treated as the project root;
# putting it first on sys.path lets the notebook import the `src` package.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
    print("Added project root to the system path")


In [None]:
os.getcwd()

In [None]:
from src.utils.plot_graphs import plot_beautiful

plot_beautiful(x=df['Time'], y=df['audio_rms(volumn)'], title="audio_rms(volumn)")

In [None]:
plot_beautiful(x=df['Time'], y=df['audio_pitch_var(expressiveness)'], title="audio_pitch_var(expressiveness)")

In [None]:
# Average pitch per time window; titled after the plotted column like the two plots above
plot_beautiful(x=df['Time'], y=df['audio_pitch_avg'], title="audio_pitch_avg")

## Trying to build the 3rd layer (speech transcription with Whisper)

In [None]:
import whisper

In [None]:
# loading model
model = whisper.load_model("base")

In [None]:
# audio = whisper.load_audio(file="../data/raw/Interview_1.wav")
# trim_audio = whisper.pad_or_trim(audio)

In [None]:
# mel = whisper.log_mel_spectrogram(trim_audio, n_mels=model.dims.n_mels).to(model.device)

In [None]:
# # detecting the language
# _, probs = model.detect_language(mel)
# print(f"Detected language: {max(probs, key=probs.get)}")

In [None]:
# # decoding the audio
# options = whisper.DecodingOptions()
# result = whisper.decode(model, mel=mel, options=options)
# print(result.text)

In [None]:
result = model.transcribe(audio="../data/raw/Interview_1.wav")
print(result["text"])

## Using another version of Whisper (whisper-timestamped)

In [None]:
import whisper_timestamped as wp

In [None]:
# Load the audio referenced by audio_path for whisper-timestamped
audio = wp.load_audio(file=audio_path)
# NOTE: this rebinds `model`, replacing the openai-whisper "base" model loaded earlier in the notebook
model = wp.load_model('small', device='cpu')

In [None]:
result = wp.transcribe_timestamped(
    model=model,
    audio=audio,
    language=None,
    detect_disfluencies=True
)

In [None]:
result

In [None]:
tr_df = pd.DataFrame(result["segments"])

In [None]:
tr_df.head()

In [None]:
tr_df["words"].iloc[2]

## Trying to identify who is talking in the audio (speaker diarization)

In [None]:
import os
from dotenv import load_dotenv
import assemblyai as aai
# Load variables from a local .env file into the environment, so the API key is read
# from there instead of being written into the notebook.
load_dotenv()
# os.getenv returns None when ASSEMBLYAI_API_KEY is not set
api_key=os.getenv("ASSEMBLYAI_API_KEY")

In [None]:
aai.settings.api_key=api_key

In [None]:
aai.settings.http_timeout = 600

In [None]:
# transcriber = aai.Transcriber()
# transcript = transcriber.transcribe(
#     data="../data/raw/Interview_2.wav",
#     config=aai.TranscriptionConfig(speaker_labels=True)
# )

In [None]:
# Transcription settings: request the "universal" speech model.
# NOTE(review): unlike the commented-out attempt in the previous cell, this config does not
# set speaker_labels=True, so speaker attribution is presumably not requested — confirm.
config = aai.TranscriptionConfig(speech_models=["universal"])

In [None]:
transcript = aai.Transcriber(config=config).transcribe(audio_path)
if transcript.status == 'error':
    raise RuntimeError(f"Transcription Failed: {transcript.error}")

In [None]:
# checking the size of the file
# size of the file in MiB (bytes / 1024**2)
filesize = os.path.getsize('../data/raw/Interview_2_com.m4a')/(1024*1024)
filesize

In [2]:
import torch
import whisperx

In [3]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is meant for the GPU backend; on CPU use int8 so the model can actually load
compute_type = "float16" if device == "cuda" else "int8"

In [4]:
model = whisperx.load_model(
    'small',
    device,
    compute_type=compute_type
)

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: Could not import module 'Pipeline'. Are this object's requirements defined correctly?

In [6]:
import torchvision  # Add this first

# Your existing code
import whisperx
device = "cpu"
# float16 is a GPU setting; with device="cpu" the matching compute type is int8
compute_type = "int8"  # e.g., "float16" (GPU) or "int8" (CPU)
model = whisperx.load_model('small', device, compute_type=compute_type)

AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [8]:
import sys
import importlib
# If a torchvision module object is already registered in sys.modules, re-run its
# import in place before importing it again.
# NOTE(review): the recorded output of this cell is the same AttributeError, so the reload
# did not help; presumably a kernel restart with matching torch/torchvision versions is
# needed — confirm.
if 'torchvision' in sys.modules:
    importlib.reload(sys.modules['torchvision'])
import torchvision
print("Success:", hasattr(torchvision, 'extension'))  # Should print True

AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)