In [None]:
import whisper
import torch
import os
from openai import OpenAI
from dotenv import load_dotenv
import gradio as gr
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Load variables from the dotenv file into the process environment, then read
# the two credentials used by the rest of the notebook (None when unset).
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")
hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN")


In [None]:
class CallInteraction:
    """Transcribe a call recording with Whisper and summarise it with a chat model.

    Attributes:
        audio_model: Whisper "base" model used for transcription.
        Summary_model: OpenAI-compatible client used for the summary request.
        system_prompt: Instructions sent as the system message.
        model_name: Chat model identifier passed to the completions endpoint.
    """

    # Placeholder summaries returned when there is nothing to send to the chat model.
    NO_AUDIO_MESSAGE = "No audio provided. Please record or upload a call first."
    NO_SPEECH_MESSAGE = "No speech was detected in the audio."

    def __init__(self,api_key,base_url,model_name):
        """Load the Whisper model and create the chat client.

        Args:
            api_key: API key for the OpenAI-compatible endpoint.
            base_url: Base URL of the OpenAI-compatible endpoint.
            model_name: Name of the chat model used for summarisation.
        """
        self.audio_model = whisper.load_model("base")
        self.Summary_model = OpenAI(api_key=api_key,base_url=base_url)
        self.system_prompt = """You are a summarization model. Your task is to analyze the following conversation or call transcript and generate a concise summary that captures the most important takeaways, key decisions, action items, and any notable concerns or questions raised during the discussion.

            Guidelines:
            Focus only on relevant and impactful information.
            Do not include small talk or greetings.
            Clearly identify who made key points or decisions (if possible).
            Present the summary in bullet points or short paragraphs for clarity.
            Maintain a professional and objective tone.

            Expected Output:
            Summary of the most important points
            Action items (if any)
            Key decisions made
            Any questions or concerns raised"""
        self.model_name = model_name

    def transcribe(self,audio_path):
        """Return the plain-text Whisper transcription of the file at ``audio_path``."""
        result = self.audio_model.transcribe(audio_path)
        return result["text"]

    def summarize_call(self,audio):
        """Transcribe ``audio`` and summarise the transcript.

        Args:
            audio: Path of the audio file, or a falsy value (``None``/``""``)
                when no audio was supplied.

        Returns:
            A ``(transcript, summary)`` tuple of strings. When no audio was
            supplied, or the transcript is blank, the summary slot holds an
            explanatory message and the chat model is not called.
        """
        # This method is wired straight to a UI button, which can be pressed
        # before anything was recorded or uploaded; do not hand that to Whisper.
        if not audio:
            return "", self.NO_AUDIO_MESSAGE

        transcript = self.transcribe(audio)

        # Nothing was recognised (e.g. silence): skip the pointless model call.
        if not transcript.strip():
            return transcript, self.NO_SPEECH_MESSAGE

        messages = [
            {"role":"system","content":self.system_prompt},
            {"role":"user","content":transcript},
        ]
        response = self.Summary_model.chat.completions.create(messages=messages,model=self.model_name)
        summary = response.choices[0].message.content

        return transcript,summary


In [None]:
# Connection settings: the Gemini model is reached through the OpenAI-style
# endpoint below, authenticated with the key read from the environment.
google_api_key = os.environ.get("GOOGLE_API_KEY")
base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
model_name = "gemini-2.0-flash"

call_summarization = CallInteraction(
    google_api_key,
    base_url,
    model_name,
)
# Manual smoke test (kept disabled):
#transcript,summary =call_summarization.summarize_call("Joe Rogan Experience 1598 - The Undertaker.mp3")

In [None]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Call Summarizer: Record or Upload Audio")

    # Input widget: microphone recording or file upload, passed to the callback as a file path.
    audio_input = gr.Audio(
        type="filepath",
        label="Record or Upload Call",
        sources=["microphone", "upload"],
    )

    # Output widgets, in the order the callback returns its values.
    transcript_output = gr.Textbox(lines=8, label="Transcription")
    summary_output = gr.Textbox(lines=8, label="Summary")

    summarize_button = gr.Button("Summarize Call")

    # Clicking the button runs transcription + summarisation on the selected audio.
    summarize_button.click(
        outputs=[transcript_output, summary_output],
        inputs=audio_input,
        fn=call_summarization.summarize_call,
    )

demo.launch()


In [None]:
#

# Separation

In [None]:
from moviepy import *

def convert_mp4_to_mp3(mp4_path, mp3_path):
    """Extract the audio track of a video file and write it to ``mp3_path``.

    Args:
        mp4_path: Path of the video file to read.
        mp3_path: Path the extracted audio is written to.

    Returns:
        None. Any failure is reported as a printed ``Error: ...`` line
        instead of being raised.
    """
    video = None
    try:
        # Open the file the caller asked for, not a fixed file name.
        video = VideoFileClip(mp4_path)
        audio = video.audio
        if audio is None:
            # Give a readable message instead of an attribute error on None.
            raise ValueError(f"no audio track found in {mp4_path}")
        audio.write_audiofile(mp3_path)
        print(f"Conversion complete: {mp3_path}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Release the resources held by the clip, on success and on failure.
        if video is not None:
            video.close()

# Example usage: the extracted audio is written to output_audio.mp3; failures
# are printed by the function rather than raised.
convert_mp4_to_mp3("input_video.mp4", "output_audio.mp3")


# Separation time

In [None]:
import torch
from pyannote.audio import Pipeline
from functools import partial

# Temporarily patch torch.load so every call made while the pipeline loads
# uses weights_only=False.
# NOTE(review): weights_only=False allows full unpickling of the checkpoint;
# only do this for checkpoints from a trusted source.
original_load = torch.load
torch.load = partial(original_load, weights_only=False)

try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hugging_face_token
    # Use the GPU when present, otherwise fall back to the CPU so the cell
    # also runs on machines without CUDA.
    ).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
finally:
    # Restore original torch.load, even when loading failed.
    torch.load = original_load

In [None]:
# Run the loaded pipeline on the sample recording (path is relative to the
# current working directory) and keep the result for the cells below.
diarization = pipeline("English Speaking.mp3")

In [None]:
# One line per labelled track: start/end times to one decimal and the speaker label.
for segment, _, label in diarization.itertracks(yield_label=True):
    line = f"start={segment.start:.1f}s stop={segment.end:.1f}s speaker_{label}"
    print(line)

In [None]:
import whisper

# Load the Whisper "base" model; use the GPU when one is available and fall
# back to the CPU otherwise so the cell does not fail on machines without CUDA.
model = whisper.load_model("base").to(
    torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
# Transcribe


In [None]:
# Transcribe the sample as English with verbose output enabled, then keep the
# list stored under the "segments" key of the result.
result = model.transcribe(
    "BBC.mp3",
    verbose=True,
    language="en",
)
segments = result["segments"]

In [None]:
# Testing

# Testing

In [None]:
import torch
from pyannote.audio import Pipeline
from functools import partial
import whisper
from datetime import timedelta

# -- Settings --
AUDIO_FILE = "AudioFiles/English Speaking.mp3"
# Use the GPU when present, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Patch for pyannote: make torch.load use weights_only=False while the
# pipeline is being loaded.
original_load = torch.load
torch.load = partial(original_load, weights_only=False)

try:
    # Load diarization pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hugging_face_token
    ).to(DEVICE)
finally:
    # Restore original torch.load even if loading raised, so the patch never
    # leaks into the rest of the session.
    torch.load = original_load

# Run diarization
diarization = pipeline(AUDIO_FILE)

# Convert diarization into list of dicts
speaker_segments = [
    {"start": turn.start, "end": turn.end, "speaker": speaker}
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

# Load Whisper model
model = whisper.load_model("base").to(DEVICE)

# Transcribe
result = model.transcribe(AUDIO_FILE, language="en", verbose=False)
segments = result["segments"]

# Match each Whisper segment to a speaker
final_output = []

def find_speaker(start_time, end_time, speaker_segments):
    """Return the speaker label for the time span ``[start_time, end_time]``.

    The span is attributed to the first entry of ``speaker_segments`` whose
    interval contains the span's midpoint. When the midpoint falls between
    entries, the entry with the largest time overlap is used instead, so a
    span straddling a short gap is still attributed to a speaker.

    Args:
        start_time: Start of the span.
        end_time: End of the span, in the same unit as ``start_time``.
        speaker_segments: Iterable of dicts with "start", "end" and "speaker" keys.

    Returns:
        The "speaker" value of the chosen entry, or "UNKNOWN" when no entry
        contains the midpoint or overlaps the span.
    """
    # The midpoint does not depend on the candidate entry: compute it once.
    mid_point = (start_time + end_time) / 2

    best_speaker = "UNKNOWN"
    best_overlap = 0
    for segment in speaker_segments:
        if segment["start"] <= mid_point <= segment["end"]:
            return segment["speaker"]

        # Length of the intersection of the two intervals (<= 0 when disjoint).
        overlap = min(end_time, segment["end"]) - max(start_time, segment["start"])
        # Strict comparison keeps the earliest entry when overlaps tie.
        if overlap > best_overlap:
            best_overlap = overlap
            best_speaker = segment["speaker"]

    return best_speaker

# Build one record per Whisper segment: speaker label, start/end rendered via
# str(timedelta(...)) on whole seconds, and the stripped text.
for seg in segments:
    speaker = find_speaker(seg["start"], seg["end"], speaker_segments)
    text = seg["text"].strip()
    start_time, end_time = (
        str(timedelta(seconds=int(seg[bound]))) for bound in ("start", "end")
    )
    record = dict(speaker=speaker, start=start_time, end=end_time, text=text)
    final_output.append(record)

# -- Merge consecutive records of the same speaker into "Speaker N: text" blocks --

# Raw diarization label -> "Speaker N", numbered in order of first appearance.
speaker_map = {}

# Each turn is (display label, [texts spoken in that uninterrupted turn]).
speaker_turns = []

for entry in final_output:
    speaker_raw = entry["speaker"]
    if speaker_raw not in speaker_map:
        speaker_map[speaker_raw] = f"Speaker {len(speaker_map) + 1}"
    speaker_label = speaker_map[speaker_raw]

    piece = entry["text"]
    if speaker_turns and speaker_turns[-1][0] == speaker_label:
        # Same speaker keeps talking: extend the open turn.
        speaker_turns[-1][1].append(piece)
    else:
        # Speaker changed (or very first record): open a new turn.
        speaker_turns.append((speaker_label, [piece]))

grouped_transcript = [
    f"{label}: {' '.join(pieces)}" for label, pieces in speaker_turns
]

# Turns are separated by a blank line in the text handed to the summariser.
summarization_input = "\n\n".join(grouped_transcript)

# Print the result
print(summarization_input)

# Optional: save to a text file
# with open("formatted_transcript_for_llm.txt", "w", encoding="utf-8") as f:
#     f.write(summarization_input)


In [None]:
# Debug aid: dump the raw list of per-segment records for inspection.
print(final_output)

# OOP

In [58]:
import torch
import gradio as gr
from pyannote.audio import Pipeline
from functools import partial
import whisper
from datetime import timedelta

class SpeakerDiarization:
    """Run the pyannote speaker-diarization pipeline on one audio file.

    The pretrained pipeline is loaded in the constructor and moved to
    ``device``; ``diarize()`` then processes ``audio_file``.
    """

    def __init__(self, audio_file, device, hugging_face_token):
        """Store the settings and load the diarization pipeline.

        Args:
            audio_file: Path of the audio file to diarize.
            device: Device the pipeline is moved to.
            hugging_face_token: Token passed as ``use_auth_token`` when the
                pretrained pipeline is fetched.
        """
        self.audio_file = audio_file
        self.device = device
        self.hugging_face_token = hugging_face_token
        self.pipeline = self._load_diarization_pipeline()

    def _load_diarization_pipeline(self):
        """Load "pyannote/speaker-diarization-3.1" and move it to ``self.device``.

        ``torch.load`` is patched to use ``weights_only=False`` only for the
        duration of the load and is always restored afterwards.

        Raises:
            RuntimeError: If ``Pipeline.from_pretrained`` returns ``None``.
        """
        # Patch for pyannote
        # NOTE(review): weights_only=False allows full unpickling of the
        # checkpoint; only do this for checkpoints from a trusted source.
        original_load = torch.load
        torch.load = partial(original_load, weights_only=False)

        try:
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=self.hugging_face_token
            )
            # NOTE(review): from_pretrained appears to return None (rather than
            # raise) when the model cannot be fetched; surface that clearly.
            if pipeline is None:
                raise RuntimeError(
                    "Could not load 'pyannote/speaker-diarization-3.1'; "
                    "check the Hugging Face token and model access."
                )
            return pipeline.to(self.device)
        finally:
            # Restore original torch.load on success and on failure, so the
            # patch never leaks into the rest of the session.
            torch.load = original_load

    def diarize(self):
        """Diarize ``self.audio_file``.

        Returns:
            A list of ``{"start", "end", "speaker"}`` dicts, one per track
            yielded by the pipeline result, in iteration order.
        """
        diarization = self.pipeline(self.audio_file)
        return [
            {"start": turn.start, "end": turn.end, "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

class SpeechToText:
    """Whisper transcription of a single audio file."""

    def __init__(self, audio_file, device):
        """Remember the file and device, then load the "base" Whisper model onto ``device``."""
        self.audio_file, self.device = audio_file, device
        base_model = whisper.load_model("base")
        self.model = base_model.to(self.device)

    def transcribe(self):
        """Transcribe ``self.audio_file`` as English and return the result's "segments" entry."""
        options = {"language": "en", "verbose": False}
        return self.model.transcribe(self.audio_file, **options)["segments"]

class SpeakerTextMapper:
    """Attach a speaker label to every transcribed segment.

    Attributes:
        speaker_segments: Dicts with "start", "end" and "speaker" keys.
        transcribed_segments: Dicts with "start", "end" and "text" keys.
    """

    def __init__(self, speaker_segments, transcribed_segments):
        self.speaker_segments = speaker_segments
        self.transcribed_segments = transcribed_segments

    def find_speaker(self, start_time, end_time):
        """Return the speaker label for the span ``[start_time, end_time]``.

        The first speaker segment containing the span's midpoint wins. When the
        midpoint falls between speaker segments, the segment with the largest
        time overlap is used instead, so a span straddling a short gap is still
        attributed to a speaker.

        Returns:
            The chosen segment's "speaker" value, or "UNKNOWN" when no segment
            contains the midpoint or overlaps the span.
        """
        # The midpoint does not depend on the candidate segment: compute it once.
        mid_point = (start_time + end_time) / 2

        best_speaker = "UNKNOWN"
        best_overlap = 0
        for segment in self.speaker_segments:
            if segment["start"] <= mid_point <= segment["end"]:
                return segment["speaker"]

            # Length of the intersection of the two intervals (<= 0 when disjoint).
            overlap = min(end_time, segment["end"]) - max(start_time, segment["start"])
            # Strict comparison keeps the earliest segment when overlaps tie.
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = segment["speaker"]

        return best_speaker

    def map_speakers(self):
        """Build one record per transcribed segment.

        Returns:
            A list of dicts with "speaker", "start", "end" and "text" keys;
            start/end are ``str(timedelta(...))`` of the times truncated to
            whole seconds, and the text is stripped of surrounding whitespace.
        """
        return [
            {
                "speaker": self.find_speaker(seg['start'], seg['end']),
                "start": str(timedelta(seconds=int(seg['start']))),
                "end": str(timedelta(seconds=int(seg['end']))),
                "text": seg['text'].strip(),
            }
            for seg in self.transcribed_segments
        ]

class TranscriptFormatter:
    """Render per-segment records as a speaker-labelled transcript."""

    def __init__(self, final_output):
        self.final_output = final_output

    def format_transcript(self):
        """Group consecutive records by speaker and join them into one string.

        Raw speaker values are renamed "Speaker 1", "Speaker 2", ... in order
        of first appearance. Texts of consecutive records from the same speaker
        are joined with a space into one "Speaker N: ..." block, and blocks are
        separated by a blank line. An empty input yields an empty string.
        """
        labels = {}
        turns = []  # (display label, [texts of that uninterrupted turn])

        for record in self.final_output:
            raw_id = record["speaker"]
            if raw_id not in labels:
                labels[raw_id] = f"Speaker {len(labels) + 1}"
            label = labels[raw_id]

            text = record["text"]
            if turns and turns[-1][0] == label:
                # Same speaker continues: extend the open turn.
                turns[-1][1].append(text)
            else:
                # First record or a change of speaker: start a new turn.
                turns.append((label, [text]))

        return "\n\n".join(f"{label}: {' '.join(texts)}" for label, texts in turns)

# class AudioProcessor:
#     def __init__(self, audio_file, device, hugging_face_token):
#         self.audio_file = audio_file
#         self.device = device
#         self.hugging_face_token = hugging_face_token
#
#     def process_audio(self):
#         # Diarization
#         diarization = SpeakerDiarization(self.audio_file, self.device, self.hugging_face_token)
#         speaker_segments = diarization.diarize()
#
#         # Transcription
#         speech_to_text = SpeechToText(self.audio_file, self.device)
#         transcribed_segments = speech_to_text.transcribe()
#
#         # Map speakers to transcribed segments
#         speaker_text_mapper = SpeakerTextMapper(speaker_segments, transcribed_segments)
#         final_output = speaker_text_mapper.map_speakers()
#
#         # Format the transcript
#         transcript_formatter = TranscriptFormatter(final_output)
#         Full_transcript = transcript_formatter.format_transcript()
#
#         return Full_transcript

class SummaryEvaluator:
    """Ask a chat model to judge a generated summary against its transcript.

    NOTE(review): ``Evaluation`` (the structured response type used by
    ``evaluate``) is not defined in the visible part of this notebook. Code in
    this file reads ``.is_acceptable`` and ``.feedback`` from the result, so it
    is presumably a model class exposing those two fields; it must be defined
    before ``evaluate`` is called.
    """

    def __init__(self, api_key: str, base_url: str, model_name: str):
        """Create the OpenAI-compatible client and build the fixed system prompt."""
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model_name = model_name
        self.system_prompt = self._build_system_prompt()

    def _build_system_prompt(self) -> str:
        """Return the evaluation instructions and criteria sent as the system message."""
        return (
            "You are an expert assistant for evaluating the quality of generated summaries from audio or meeting transcripts. "
            "Below is the original transcript of a conversation, followed by a summary generated by a model. Your task is to "
            "objectively evaluate the summary based on the following criteria:\n\n"
            "Evaluation Criteria:\n\n"
            "Coverage: Does the summary capture the most important points, decisions, and action items from the transcript?\n\n"
            "Faithfulness: Is the summary factually accurate and consistent with the content of the transcript (i.e., no hallucinations or distortions)?\n\n"
            "Clarity: Is the summary easy to read, well-organized, and written in a professional tone?\n\n"
            "Conciseness: Is the summary concise without omitting critical information?\n\n"
            "Speaker Attribution (optional): Where relevant, does the summary correctly attribute key statements or decisions to the right speaker?\n\n"
            "With these criteria, please evaluate the summary, replying with whether the summary is acceptable and your feedback."
        )

    def _build_user_prompt(self, transcript: str, summary: str) -> str:
        """Return the user message embedding the transcript and the summary to judge."""
        return (
            f"Here is the full transcript of the call:\n\n{transcript}\n\n"
            f"Here is the summary of the call:\n\n{summary}\n\n"
            "Please evaluate the summary, replying with whether it is acceptable and your feedback."
        )

    # The return annotation is a string so that defining this class does not
    # require ``Evaluation`` to exist yet; the name is only looked up when
    # ``evaluate`` actually runs.
    def evaluate(self, transcript: str, summary: str) -> "Evaluation":
        """Request a structured evaluation of ``summary`` for ``transcript``.

        Returns:
            The parsed ``Evaluation`` object from the first response choice.

        Raises:
            ValueError: If the response carries no parsed evaluation.
        """
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self._build_user_prompt(transcript, summary)}
        ]

        response = self.client.beta.chat.completions.parse(
            messages=messages,
            model=self.model_name,
            response_format=Evaluation
        )

        parsed = response.choices[0].message.parsed
        # Fail here with a clear message instead of handing None to callers
        # that immediately read attributes from the result.
        if parsed is None:
            raise ValueError("The evaluation model did not return a parsable evaluation.")
        return parsed

class CallInteraction:
    """Diarize, transcribe, summarise and quality-check one call recording.

    NOTE(review): this class reuses the name of the simpler ``CallInteraction``
    defined earlier in the notebook and replaces it once this cell has run.

    Attributes:
        audio_file: Path of the recording to process.
        device: Device handed to the diarization and transcription helpers.
        hugging_face_token: Token handed to the diarization helper.
        Summary_model: OpenAI-compatible client used for the summary request.
        system_prompt: System message for the summary request.
        model_name: Chat model used for the summary request.
        evaluator: ``SummaryEvaluator`` that judges each generated summary.
    """

    # Placeholder summaries returned when there is nothing to summarise.
    NO_AUDIO_MESSAGE = "No audio provided. Please record or upload a call first."
    NO_SPEECH_MESSAGE = "No speech was detected in the audio."

    def __init__(
        self,
        audio_file,
        device,
        hugging_face_token,
        api_key,
        base_url,
        model_name,
        prompt_template=None,
        evaluation_api_key=None,
        evaluation_base_url=None,
        evaluation_model_name=None
    ):
        """Store the settings and create the summary client and the evaluator.

        Args:
            audio_file: Path of the recording to process.
            device: Device for the diarization and transcription models.
            hugging_face_token: Token used when loading the diarization pipeline.
            api_key: API key for the summary endpoint.
            base_url: Base URL of the summary endpoint.
            model_name: Chat model used for summarisation.
            prompt_template: Optional system prompt; ``default_prompt()`` is
                used when it is falsy.
            evaluation_api_key: Optional evaluator key; defaults to ``api_key``.
            evaluation_base_url: Optional evaluator URL; defaults to ``base_url``.
            evaluation_model_name: Optional evaluator model; defaults to ``model_name``.
        """
        self.audio_file = audio_file
        self.device = device
        self.hugging_face_token = hugging_face_token
        self.Summary_model = OpenAI(api_key=api_key,base_url=base_url)
        self.system_prompt = prompt_template or self.default_prompt()
        self.model_name = model_name

        self.evaluator = SummaryEvaluator(
            api_key=evaluation_api_key or api_key,
            base_url=evaluation_base_url or base_url,
            model_name=evaluation_model_name or model_name
        )

    def default_prompt(self):
        """Return the built-in summarisation system prompt."""
        return (
            "You are a summarization model. Your task is to analyze the following conversation "
            "or call transcript and generate a concise summary that captures the most important takeaways, "
            "key decisions, action items, and any notable concerns or questions raised during the discussion.\n\n"
            "Guidelines:\n"
            "- Focus only on relevant and impactful information.\n"
            "- Do not include small talk or greetings.\n"
            "- Clearly identify who made key points or decisions (if possible).\n"
            "- Present the summary in bullet points or short paragraphs for clarity.\n"
            "- Maintain a professional and objective tone.\n\n"
            "Expected Output:\n"
            "- Summary of the most important points\n"
            "- Action items (if any)\n"
            "- Key decisions made\n"
            "- Any questions or concerns raised\n\n"
            "Transcript:\n\n"
        )

    def updated_system_prompt(self,transcript,summary,feedback):
        """Return the system prompt extended with a rejected summary and its feedback.

        Args:
            transcript: Accepted for interface compatibility; not used.
            summary: The summary that was rejected.
            feedback: The evaluator's reason for rejecting it.
        """
        updated_prompt = self.system_prompt + "\n\n## Previous summary was rejected\nYou just tried to reply, but the quality control rejected your summary\n"
        updated_prompt += f"## Your attempted summary:\n{summary}\n\n"
        updated_prompt+=f"## Reason for rejection:\n{feedback}\n\n"
        return updated_prompt

    def _build_transcript(self):
        """Diarize and transcribe ``self.audio_file`` and return the speaker-labelled transcript."""
        diarization = SpeakerDiarization(self.audio_file, self.device, self.hugging_face_token)
        speaker_segments = diarization.diarize()

        # Transcription
        speech_to_text = SpeechToText(self.audio_file, self.device)
        transcribed_segments = speech_to_text.transcribe()

        # Map speakers to transcribed segments
        speaker_text_mapper = SpeakerTextMapper(speaker_segments, transcribed_segments)
        final_output = speaker_text_mapper.map_speakers()

        # Format the transcript
        transcript_formatter = TranscriptFormatter(final_output)
        return transcript_formatter.format_transcript()

    def _generate_summary(self, system_prompt, transcript):
        """Send one summary request and return the text of the first choice."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": transcript},
        ]
        response = self.Summary_model.chat.completions.create(messages=messages, model=self.model_name)
        return response.choices[0].message.content

    def summarize_call(self):
        """Produce a transcript, a summary and the evaluator's feedback.

        The summary is evaluated once; if it is rejected, a single retry is
        made with the rejection feedback appended to the system prompt, and the
        retried summary is returned together with its own evaluation feedback.

        Returns:
            A ``(transcript, summary, feedback)`` tuple. When no audio file is
            set, or the transcript is blank, the summary slot holds an
            explanatory message, the feedback is empty and no model is called.
        """
        # The UI can trigger this before anything was recorded or uploaded;
        # do not hand a missing path to the audio models.
        if not self.audio_file:
            return "", self.NO_AUDIO_MESSAGE, ""

        full_transcript = self._build_transcript()

        # Nothing was recognised (e.g. silence): skip the summary/evaluation calls.
        if not full_transcript.strip():
            return full_transcript, self.NO_SPEECH_MESSAGE, ""

        summary = self._generate_summary(self.system_prompt, full_transcript)
        evaluation = self.evaluator.evaluate(full_transcript, summary)

        if not evaluation.is_acceptable:
            # Retry with updated prompt
            updated_prompt = self.updated_system_prompt(full_transcript, summary, evaluation.feedback)
            summary = self._generate_summary(updated_prompt, full_transcript)
            evaluation = self.evaluator.evaluate(full_transcript, summary)

        return full_transcript, summary, evaluation.feedback



In [59]:
# Endpoint settings for the summary/evaluation model and the compute device.
google_api_key = os.environ.get("GOOGLE_API_KEY")
base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
model_name = "gemini-2.0-flash"

# Prefer CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def process_uploaded_audio(audio_file):
    """Run the full call pipeline on ``audio_file`` and return its result.

    A new ``CallInteraction`` is built for every call from the module-level
    settings (device, Hugging Face token, API key, base URL, model name), and
    whatever its ``summarize_call()`` returns is passed straight back.
    """
    settings = {
        "audio_file": audio_file,
        "device": device,
        "hugging_face_token": hugging_face_token,
        "api_key": google_api_key,
        "base_url": base_url,
        "model_name": model_name,
    }
    return CallInteraction(**settings).summarize_call()

with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Call Summarizer: Record or Upload Audio")

    # Input widget: microphone recording or file upload, passed to the callback as a file path.
    audio_input = gr.Audio(
        type="filepath",
        label="Record or Upload Call",
        sources=["microphone", "upload"],
    )

    # Output widgets, in the order the callback returns its three values.
    transcript_output = gr.Textbox(lines=8, label="Transcription")
    summary_output = gr.Textbox(lines=8, label="Summary")
    feedback_output = gr.Textbox(lines=5, label="LLM Evaluation Feedback")

    summarize_button = gr.Button("Summarize Call")

    # Clicking the button runs the whole pipeline on the selected audio.
    summarize_button.click(
        outputs=[transcript_output, summary_output, feedback_output],
        inputs=audio_input,
        fn=process_uploaded_audio,
    )

demo.launch()


* Running on local URL:  http://127.0.0.1:7866
* To create a public link, set `share=True` in `launch()`.


