<a href="https://colab.research.google.com/github/AbdulAhadSiddiqui-0786/Voice-AI/blob/main/Voice_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook relies on for audio handling (pydub),
# transcription (openai-whisper), diarization (pyannote.audio + torch stack)
# and sentiment analysis (transformers). `-q` keeps pip's output quiet.
# NOTE(review): pytube is installed here but is not imported by any cell
# visible in this notebook — confirm whether it is still needed.
!pip install pytube pydub openai-whisper pyannote.audio torch torchvision torchaudio transformers -q


In [None]:
!pip install yt-dlp -q
from pydub import AudioSegment

url = "https://www.youtube.com/watch?v=4ostqJD3Psc"

# Download audio with yt-dlp
!yt-dlp -x --audio-format mp3 -o "call.%(ext)s" {url}

# Convert to wav, mono, 16kHz
audio = AudioSegment.from_file("call.mp3")
audio = audio.set_channels(1).set_frame_rate(16000)
audio.export("call.wav", format="wav")

print(" Audio downloaded & converted to WAV using yt-dlp!")


In [None]:
from pyannote.audio import Pipeline
from google.colab import userdata
from tqdm import tqdm
import soundfile as sf
from pydub import AudioSegment
import math
import os
import pickle

# Hugging Face access token, read from the Colab secrets store.
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')

# 1. Load diarization model
print(" Loading diarization model...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=HUGGINGFACE_TOKEN
)
# pyannote signals a failed download of a gated/private pipeline by returning
# None instead of raising. Stop here with a clear message rather than printing
# "Model loaded!" and failing later with "'NoneType' object is not callable".
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that "
        "HUGGINGFACE_TOKEN is valid and that the model's user conditions "
        "have been accepted on Hugging Face."
    )
print(" Model loaded!")

# 2. Load audio and get duration (reported for information only)
audio_file = "call.wav"
info = sf.info(audio_file)
duration = info.duration  # in seconds
print(f" Audio duration: {duration:.1f} sec")

# 3. Split into 30s chunks
chunk_size = 30_000  # in ms (pydub slices and lengths are in milliseconds)
audio = AudioSegment.from_wav(audio_file)
chunks = math.ceil(len(audio) / chunk_size)
print(f" Splitting into {chunks} chunks of 30s each...")

# 4. Process chunks with caching
# NOTE(review): the chunk and cache files are keyed only by chunk index, so
# they are reused even if call.wav has since been replaced by different
# audio. Delete chunk_*.wav / chunk_*_diarization.pkl when changing source.
# NOTE(review): the pipeline runs independently on every chunk, so a label
# such as SPEAKER_00 presumably need not denote the same person in two
# different chunks — verify before trusting per-speaker totals.
speaker_segments = []
for i in tqdm(range(chunks), desc="Processing chunks"):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(audio))
    chunk_file = f"chunk_{i}.wav"
    cache_file = f"chunk_{i}_diarization.pkl"

    # Export chunk only if not exists
    if not os.path.exists(chunk_file):
        audio[start:end].export(chunk_file, format="wav")

    if os.path.exists(cache_file):
        # SECURITY: pickle.load can execute arbitrary code from the file.
        # Acceptable only because these cache files are written by this cell;
        # never load a .pkl obtained from somewhere else.
        with open(cache_file, "rb") as f:
            diarization = pickle.load(f)
    else:
        # Run diarization on this chunk and persist the result for re-runs.
        diarization = pipeline(chunk_file)
        with open(cache_file, "wb") as f:
            pickle.dump(diarization, f)

    # Turn times are relative to the chunk; shift them by the chunk's start
    # (converted from ms to seconds) to get positions in the full recording.
    offset = start / 1000
    speaker_segments.extend(
        {
            "speaker": speaker,
            "start": turn.start + offset,
            "end": turn.end + offset,
            "duration": turn.end - turn.start,
        }
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    )

print(" Diarization complete!")
print(" Speakers found:", {s["speaker"] for s in speaker_segments})


In [None]:
import whisper

# Pick the Whisper checkpoint; the author chose "small" as a speed/accuracy
# compromise.
model_name = "small"
model = whisper.load_model(model_name)

# Run speech-to-text over the converted recording and keep the plain text.
result = model.transcribe("call.wav")
transcript = result["text"]

# Show only the first 250 characters as a sanity check.
preview = transcript[:250]
print(" Transcription complete!")
print("Sample transcript:", preview, "...")


In [None]:
from transformers import pipeline
import re

# 1. Talk-time ratio: seconds of speech per diarization label, expressed as
#    a percentage of all diarized speech.
talk_time = {}
for seg in speaker_segments:
    talk_time[seg["speaker"]] = talk_time.get(seg["speaker"], 0) + seg["duration"]

total_time = sum(talk_time.values())
talk_ratios = {
    # Guard against a zero total (e.g. only zero-length segments).
    sp: round((dur / total_time) * 100, 2) if total_time else 0.0
    for sp, dur in talk_time.items()
}

# 2. Number of questions
# Explicit question marks are counted directly. Questions transcribed
# without a "?" are approximated as sentences that *open* with an
# interrogative word. Counting those words anywhere in the text (as was done
# before) matches ordinary uses of "is", "are", "do", "can", ... and inflates
# the count to roughly the word frequency, which made the "too few
# questions" insight below unreachable.
num_questions = transcript.count("?")
question_opener = re.compile(
    r"(what|why|how|when|where|can|do|is|are|does|did)\b", re.IGNORECASE
)
sentences = re.split(r"(?<=[.!?])\s+", transcript.strip())
extra_questions = sum(
    1
    for sentence in sentences
    if not sentence.endswith("?") and question_opener.match(sentence)
)
# The two counts cover disjoint sentences, so they are added.
question_count = num_questions + extra_questions

# 3. Longest monologue: the longest single diarized turn, in seconds
#    (0.0 when diarization produced no segments).
longest_monologue = max((seg["duration"] for seg in speaker_segments), default=0.0)

# 4. Sentiment
sentiment_analyzer = pipeline("sentiment-analysis")
sentiment = sentiment_analyzer(transcript[:500])[0]  # analyze first 500 chars

# 5. Actionable insight: first matching rule wins.
if max(talk_ratios.values(), default=0) > 70:
    insight = "One speaker dominated the conversation. Allow more balanced talk-time."
elif question_count < 3:
    insight = "Too few questions were asked. Ask more questions to engage the customer."
else:
    insight = "Good balance, but could improve listening."

# Identify roles: heuristic — the label with the largest talk share is taken
# to be the sales rep and the smallest the customer. Both are None when no
# speaker was detected, and identical when only one speaker was detected.
sales_rep = max(talk_ratios, key=talk_ratios.get, default=None)
customer = min(talk_ratios, key=talk_ratios.get, default=None)


In [None]:
# Print the call-quality summary: one row per metric, each row being the
# exact sequence of values handed to print().
print("\n===== CALL QUALITY REPORT =====")
report_rows = [
    ("Talk-time ratio:", talk_ratios),
    ("Questions asked:", question_count),
    ("Longest monologue (sec):", round(longest_monologue, 2)),
    ("Call sentiment:", sentiment["label"], "| Confidence:", round(sentiment["score"], 2)),
    ("Actionable Insight:", insight),
    ("Likely Sales Rep:", sales_rep, "| Likely Customer:", customer),
]
for row in report_rows:
    print(*row)
