<a href="https://colab.research.google.com/github/AvniChauhan03/CallQualityAnalyzer/blob/main/CallQualityAnalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip -q install yt-dlp youtube-transcript-api webrtcvad python_speech_features nltk openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.0/485.0 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone


In [2]:
!pip install python_speech_features




In [3]:
from pydub import AudioSegment

# put your downloaded file here
video_file = "Sales Call example 1.mp4"
wav_file = "audio_16k.wav"

# it Converts to WAV, 16kHz, mono
audio = AudioSegment.from_file(video_file)
audio = audio.set_frame_rate(16000).set_channels(1)
audio.export(wav_file, format="wav")

print("Audio saved as:", wav_file)


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Audio saved as: audio_16k.wav


In [5]:
import numpy as np
from pydub import AudioSegment
from dataclasses import dataclass
from sklearn.cluster import KMeans
import wave
import contextlib
import webrtcvad

@dataclass
class Segment:
    start: float
    end: float
    speaker: int

def diarize(wavfile: str, n_speakers: int = 2):
    vad = webrtcvad.Vad(2)
    with contextlib.closing(wave.open(wavfile,'rb')) as wf:
        sample_rate = wf.getframerate()
        pcm = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)

    window_size = int(0.03 * sample_rate)
    hop = int(0.01 * sample_rate)
    windows = [pcm[i:i+window_size].tobytes() for i in range(0, len(pcm)-window_size, hop)]
    voiced = [vad.is_speech(w, sample_rate) for w in windows]
    times = [i*hop/sample_rate for i in range(len(voiced))]

    labels = KMeans(n_clusters=n_speakers, random_state=0).fit_predict(np.array(voiced).reshape(-1,1))

    segs, cur, start = [], labels[0], times[0]
    for t, lab in zip(times, labels):
        if lab != cur:
            segs.append(Segment(start=start, end=t, speaker=int(cur)))
            cur, start = lab, t
    segs.append(Segment(start=start, end=times[-1]+0.5, speaker=int(cur)))
    return segs

def analyze_audio(wavfile: str):
    segs = diarize(wavfile)
    if not segs: return {}

    # calculate Talk-time %
    talk_time = {}
    for s in segs:
        talk_time[s.speaker] = talk_time.get(s.speaker,0) + (s.end - s.start)
    total = sum(talk_time.values())
    talk_ratio = {spk: round(100*t/total,1) for spk,t in talk_time.items()}

    # it finds out Longest monologue
    longest = max((s.end-s.start for s in segs))

    # it finds out approximate questions per speaker: short segments <1.5s
    questions = {s.speaker:0 for s in segs}
    for s in segs:
        if s.end - s.start <= 1.5:
            questions[s.speaker] += 1

    # finds out approximate sentiment from the audio energy
    audio = AudioSegment.from_file(wavfile)
    samples = np.array(audio.get_array_of_samples())
    energy = np.mean(np.abs(samples))
    if energy > 1000: sentiment = "Positive"
    elif energy < 300: sentiment = "Negative"
    else: sentiment = "Neutral"

    # Actionable insight
    if max(talk_ratio.values()) > 80: insight = "Conversation dominated by one speaker"
    elif sum(questions.values()) < 3: insight = "Try asking more questions"
    else: insight = "Balanced conversation"

    # Roles approximation: speaker with more talk-time = Sales
    roles = {}
    spk_sorted = sorted(talk_time.items(), key=lambda x:x[1], reverse=True)
    roles[spk_sorted[0][0]] = "Sales Rep"
    roles[spk_sorted[1][0]] = "Customer" if len(spk_sorted)>1 else "Unknown"

    return {
        "Talk-time %": talk_ratio,
        "Questions per speaker": questions,
        "Longest monologue (s)": round(longest,1),
        "Call sentiment": sentiment,
        "Actionable insight": insight,
        "Roles": roles
    }

# Example usage
report = analyze_audio("audio_16k.wav")
print(report)


{'Talk-time %': {0: 16.1, 1: 83.9}, 'Questions per speaker': {0: 145, 1: 126}, 'Longest monologue (s)': 6.8, 'Call sentiment': 'Positive', 'Actionable insight': 'Conversation dominated by one speaker', 'Roles': {1: 'Sales Rep', 0: 'Customer'}}


In [6]:
segments = diarize("audio_16k.wav", n_speakers=2)

for s in segments:
    print(f"Speaker {s.speaker}: {round(s.start,1)}s → {round(s.end,1)}s")


Speaker 0: 0.0s → 6.8s
Speaker 1: 6.8s → 9.8s
Speaker 0: 9.8s → 9.8s
Speaker 1: 9.8s → 10.1s
Speaker 0: 10.1s → 10.4s
Speaker 1: 10.4s → 10.7s
Speaker 0: 10.7s → 10.7s
Speaker 1: 10.7s → 11.1s
Speaker 0: 11.1s → 11.2s
Speaker 1: 11.2s → 11.2s
Speaker 0: 11.2s → 11.2s
Speaker 1: 11.2s → 13.0s
Speaker 0: 13.0s → 13.5s
Speaker 1: 13.5s → 14.8s
Speaker 0: 14.8s → 14.9s
Speaker 1: 14.9s → 15.2s
Speaker 0: 15.2s → 15.2s
Speaker 1: 15.2s → 18.2s
Speaker 0: 18.2s → 18.3s
Speaker 1: 18.3s → 19.1s
Speaker 0: 19.1s → 19.1s
Speaker 1: 19.1s → 20.3s
Speaker 0: 20.3s → 20.5s
Speaker 1: 20.5s → 20.6s
Speaker 0: 20.6s → 20.6s
Speaker 1: 20.6s → 21.4s
Speaker 0: 21.4s → 21.5s
Speaker 1: 21.5s → 21.9s
Speaker 0: 21.9s → 21.9s
Speaker 1: 21.9s → 23.1s
Speaker 0: 23.1s → 23.1s
Speaker 1: 23.1s → 23.9s
Speaker 0: 23.9s → 24.0s
Speaker 1: 24.0s → 25.6s
Speaker 0: 25.6s → 25.6s
Speaker 1: 25.6s → 26.4s
Speaker 0: 26.4s → 26.6s
Speaker 1: 26.6s → 30.4s
Speaker 0: 30.4s → 30.5s
Speaker 1: 30.5s → 30.6s
Speaker

In [7]:
# Talk time per speaker
talk_time = {}
for s in segments:
    talk_time[s.speaker] = talk_time.get(s.speaker, 0) + (s.end - s.start)

total = sum(talk_time.values())
ratio = {spk: round(100 * t / total, 1) for spk, t in talk_time.items()}

# Longest monologue
longest = max(s.end - s.start for s in segments)

print("Talk-time % per speaker:", ratio)
print("Longest monologue (s):", round(longest,1))


Talk-time % per speaker: {0: 16.1, 1: 83.9}
Longest monologue (s): 6.8


In [8]:
wav_file = "audio_16k.wav"
report = analyze_audio(wav_file)
print(report)


{'Talk-time %': {0: 16.1, 1: 83.9}, 'Questions per speaker': {0: 145, 1: 126}, 'Longest monologue (s)': 6.8, 'Call sentiment': 'Positive', 'Actionable insight': 'Conversation dominated by one speaker', 'Roles': {1: 'Sales Rep', 0: 'Customer'}}
