<a href="https://colab.research.google.com/github/Athaxv/VoiceAI/blob/main/CallQualityAnalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Call Quality Analyzer

**By:** Atharv Gaur
**Date:** September 13, 2025

This notebook analyzes a sales call recording to extract key quality metrics, sentiment, and actionable insights.


In [1]:
print("Installing necessary packages...")
!pip install -q pytubefix ffmpeg-python assemblyai transformers torch

print("\nInstallations complete!")

Installing necessary packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.9/768.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h
Installations complete!


In [2]:
# @title Step 2: Configure API Key
import assemblyai as aai

# IMPORTANT: Replace "YOUR_API_KEY_HERE" with your actual AssemblyAI API key.
aai.settings.api_key = "2eea341f4de24170aa7b6a590d76c1c7"

print("AssemblyAI API key configured.")


AssemblyAI API key configured.


In [3]:
# @title Step 3: Extract Audio from YouTube
from pytubefix import YouTube
import os
import subprocess

# URL of the YouTube video to be analyzed
YOUTUBE_URL = "https://www.youtube.com/shorts/AwK-nzl1jHM"
AUDIO_FILE_MP4 = "call_audio.mp4"
AUDIO_FILE_WAV = "call_audio.wav"

print(f"Downloading audio from: {YOUTUBE_URL}")

# Download the best audio stream using pytube
yt = YouTube(YOUTUBE_URL)
audio_stream = yt.streams.filter(only_audio=True).first()
audio_stream.download(filename=AUDIO_FILE_MP4)

print(f"Successfully downloaded audio to {AUDIO_FILE_MP4}")

# Convert the downloaded mp4 file to a WAV file using ffmpeg.
# WAV is a standard format for many speech recognition models.
# We also convert it to mono channel and set a sample rate of 16kHz.
print("Converting MP4 to WAV...")
subprocess.run([
    'ffmpeg',
    '-i', AUDIO_FILE_MP4,
    '-ac', '1',          # Mono channel
    '-ar', '16000',      # 16kHz sample rate
    AUDIO_FILE_WAV,
    '-y'                 # Overwrite output file if it exists
], check=True)


print(f"Successfully converted audio to {AUDIO_FILE_WAV}")


Downloading audio from: https://www.youtube.com/shorts/AwK-nzl1jHM
Successfully downloaded audio to call_audio.mp4
Converting MP4 to WAV...
Successfully converted audio to call_audio.wav


In [4]:
# @title Step 4: Transcribe the Audio and Identify Speakers
import time

print("Starting transcription process... This may take a moment.")
start_time = time.time()

# Configure the transcription request with speaker identification (diarization) and sentiment analysis
config = aai.TranscriptionConfig(
    speaker_labels=True,
    sentiment_analysis=True,
    summarization=True,  # <-- THIS MUST BE TRUE
    summary_model=aai.SummarizationModel.informative, # You can choose different models
    summary_type=aai.SummarizationType.bullets      # 'bullets', 'paragraph', etc.
)

# Create a transcriber object
transcriber = aai.Transcriber()

# Start the transcription
transcript = transcriber.transcribe(AUDIO_FILE_WAV, config)

# Check for transcription errors
if isinstance(transcript, aai.RealtimeTranscript):
    if transcript.error:
        print(f"An error occurred: {transcript.error}")
else:
    processing_time = time.time() - start_time
    print(f"Transcription complete in {processing_time:.2f} seconds.")
    # The transcript object now contains the full text, timestamps, and speaker labels.

Starting transcription process... This may take a moment.
Transcription complete in 16.41 seconds.


In [5]:
# @title Step 5: Final Report
from collections import defaultdict
import operator

print("--- Call Quality Analysis Report ---\n")

# Initialize variables to avoid errors if transcript is empty
speaker_talk_time = defaultdict(float)
speaker_question_count = defaultdict(int)
sales_rep_speaker = "Unknown"
talk_time_ratio = {}
total_question_count = 0
longest_monologue_duration = 0
longest_monologue_speaker = "Unknown"
overall_sentiment_label = "NEUTRAL"
actionable_insight = "Not available."

if transcript.utterances:
    # --- Enhanced Role Identification (Bonus) ---
    first_speaker_label = transcript.utterances[0].speaker

    for utterance in transcript.utterances:
        speaker = utterance.speaker
        duration = utterance.end - utterance.start
        speaker_talk_time[speaker] += duration
        speaker_question_count[speaker] += utterance.text.count('?')

    heuristic_scores = defaultdict(float)
    for speaker, talk_time in speaker_talk_time.items():
        score = talk_time
        score += speaker_question_count[speaker] * 5000
        if speaker == first_speaker_label:
            score += 10000
        heuristic_scores[speaker] = score

    if heuristic_scores:
        sales_rep_speaker = max(heuristic_scores.items(), key=operator.itemgetter(1))[0]

    # --- 1. Calculate and Display Talk-Time Ratio (FIXED) ---
    # The total duration is on the main transcript object
    total_duration_ms = transcript.audio_duration if transcript.audio_duration else 1
    talk_time_ratio = {speaker: (duration / total_duration_ms) * 100 for speaker, duration in speaker_talk_time.items()}

    # --- 2. Count Number of Questions ---
    total_question_count = sum(speaker_question_count.values())

    # --- 3. Find Longest Monologue ---
    for utterance in transcript.utterances:
        duration = (utterance.end - utterance.start) / 1000
        if duration > longest_monologue_duration:
            longest_monologue_duration = duration
            longest_monologue_speaker = utterance.speaker

# --- 4. Analyze Call Sentiment (FIXED) ---
# The 'sentiment' attribute is a simple string, not an enum.
if transcript.sentiment_analysis:
    sentiments = [result.sentiment for result in transcript.sentiment_analysis]

    # Compare against strings "POSITIVE", "NEGATIVE", "NEUTRAL"
    positive_count = sentiments.count("POSITIVE")
    negative_count = sentiments.count("NEGATIVE")
    neutral_count = sentiments.count("NEUTRAL")

    if positive_count > negative_count and positive_count > neutral_count:
        overall_sentiment_label = "POSITIVE"
    elif negative_count > positive_count and negative_count > neutral_count:
        overall_sentiment_label = "NEGATIVE"

# --- 5. Generate Actionable Insight ---
if transcript.summary:
    actionable_insight = transcript.summary


# --- Display Final Report ---
print("1. Talk-Time Ratio:")
if talk_time_ratio:
    for speaker, ratio in talk_time_ratio.items():
        speaker_role = "Sales Rep" if speaker == sales_rep_speaker else "Customer"
        print(f"   - Speaker {speaker} ({speaker_role}): {ratio:.2f}%")
else:
    print("   - No speaker data available.")

print(f"\n2. Number of Questions Asked: {total_question_count}")
print(f"\n3. Longest Monologue: {longest_monologue_duration:.2f} seconds (by Speaker {longest_monologue_speaker})")
print(f"\n4. Overall Call Sentiment: {overall_sentiment_label}")
print(f"\n5. Actionable Insight:\n{actionable_insight}")

print("\n--- End of Report ---")


--- Call Quality Analysis Report ---

1. Talk-Time Ratio:
   - Speaker B (Customer): 30343.10%
   - Speaker A (Sales Rep): 61484.48%

2. Number of Questions Asked: 4

3. Longest Monologue: 15.46 seconds (by Speaker A)

4. Overall Call Sentiment: NEUTRAL

5. Actionable Insight:
- We are a social media production company. It was a quick call to see if you guys are looking for help with social media. Currently we're not looking for anyone, but thank you for the call.
- Have you worked in education? Yes, we've worked with. three companies already. The task with education is a different sector. If you could send me your company profile, I could have a look at it. We can probably hop on a quick call together and hopefully work this out.

--- End of Report ---
