<a href="https://colab.research.google.com/github/Alex-Zeo/DMO-Meeting-Notes/blob/main/DMO_Whisper_Large_Meeting_Notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transcribe Youtube Videos
This script automates transcribing Youtube videos by trying two approaches:

1) First it checks if Youtube has a transcript file available for download

2) Next it downloads the video audio and uses OpenAi Whisper model to generate a transcript from the audio

This script will save both transcripts (if one exists on youtube), and logs the results.

In [1]:
# Install required packages
!pip install --quiet youtube_transcript_api
!pip install --quiet yt_dlp
!pip install --quiet whisper-openai
!apt-get update && apt-get install -y ffmpeg

# URL of the video to transcribe
video_url = "https://youtu.be/6qO1cHnJTz8"

import os
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
import whisper
import re
from datetime import timedelta
import time
import json

def get_video_id(url):
    """Extract video ID from YouTube URL"""
    print("Extracting video ID from URL...")
    patterns = [
        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
        r'(?:be\/)([0-9A-Za-z_-]{11}).*'
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            video_id = match.group(1)
            print(f"Video ID extracted: {video_id}")
            return video_id
    print("Failed to extract video ID.")
    return None

def download_audio(video_url, output_path='audio.mp3'):
    """Download audio from YouTube video"""
    print("Downloading audio...")
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': output_path.replace('.mp3', ''),
        # Add rate limiting
        'socket_timeout': 30,
        'retries': 10,
        'fragment_retries': 10,
        'retry_sleep_functions': {'fragment': lambda n: 5 * (n + 1)},
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
        print("Audio downloaded successfully.")
        return True
    except Exception as e:
        print(f"Error downloading audio: {str(e)}")
        return False

def format_timestamp(seconds):
    """Convert seconds to timestamp format"""
    td = timedelta(seconds=float(seconds))
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    if hours > 0:
        return f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
    return f"[{minutes:02d}:{seconds:02d}]"

def get_youtube_transcript(video_id):
    """Get transcript from YouTube"""
    try:
        print("Checking available transcript languages...")
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        available_languages = [tr.language_code for tr in transcript_list._transcripts.values()]
        print("Available languages:", available_languages)

        print("Attempting to retrieve English transcript...")
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        print("English transcript retrieved successfully from YouTube.")

        formatted_transcript = []
        for entry in transcript:
            timestamp = format_timestamp(entry['start'])
            formatted_transcript.append(f"{timestamp} {entry['text']}")

        return "\n".join(formatted_transcript)
    except Exception as e:
        print(f"Error getting YouTube transcript: {str(e)}")
        return None

def generate_whisper_transcript(audio_path):
    """Generate transcript using Whisper"""
    try:
        print("Loading Whisper large model...")
        model = whisper.load_model("large")

        print("Transcribing audio with Whisper...")
        # Use chunks to handle long audio
        result = model.transcribe(
            audio_path,
            task="transcribe",
            language="en",
            initial_prompt="This is an English transcript.",  # Helps with English recognition
            verbose=True
        )

        formatted_transcript = []
        for segment in result["segments"]:
            timestamp = format_timestamp(segment["start"])
            formatted_transcript.append(f"{timestamp} {segment['text']}")

        return "\n".join(formatted_transcript)
    except Exception as e:
        print(f"Error in Whisper transcription: {str(e)}")
        return None

def save_transcript(transcript, filename):
    """Save transcript to file"""
    if transcript:
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(transcript)
            print(f"Transcript saved to {filename}")
            return True
        except Exception as e:
            print(f"Error saving transcript: {str(e)}")
    return False

def process_video(video_url):
    """Process video and generate both types of transcripts"""
    print("Starting video processing...")

    # Get video ID
    video_id = get_video_id(video_url)
    if not video_id:
        return "Invalid YouTube URL"

    results = {
        "youtube_transcript": None,
        "whisper_transcript": None
    }

    # Get YouTube transcript
    print("\n=== Getting YouTube Transcript ===")
    yt_transcript = get_youtube_transcript(video_id)
    if yt_transcript:
        save_transcript(yt_transcript, 'YTtranscript_en.txt')
        results["youtube_transcript"] = yt_transcript

    # Generate Whisper transcript
    print("\n=== Generating Whisper Transcript ===")
    if download_audio(video_url):
        whisper_transcript = generate_whisper_transcript('audio.mp3')
        if whisper_transcript:
            save_transcript(whisper_transcript, 'Whisper_transcript_en.txt')
            results["whisper_transcript"] = whisper_transcript

        # Clean up audio file
        if os.path.exists("audio.mp3"):
            os.remove("audio.mp3")
            print("Cleaned up audio file")

    # Save combined results to JSON for comparison
    with open('transcripts_info.json', 'w', encoding='utf-8') as f:
        json.dump({
            "video_id": video_id,
            "youtube_transcript_available": bool(results["youtube_transcript"]),
            "whisper_transcript_available": bool(results["whisper_transcript"]),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }, f, indent=4)

    return results

# Process video and generate both transcripts
print("Starting script...")
results = process_video(video_url)

# Print summary
print("\n=== Processing Complete ===")
print("YouTube Transcript:", "Generated" if results["youtube_transcript"] else "Failed")
print("Whisper Transcript:", "Generated" if results["whisper_transcript"] else "Failed")
print("\nCheck YTtranscript_en.txt and Whisper_transcript_en.txt for the results.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [6

100%|█████████████████████████████████████| 2.87G/2.87G [01:02<00:00, 49.5MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcribing audio with Whisper...




[00:00.000 --> 00:16.000]  Los Angeles is doing a $1000 gift card giveaway for a two night stay at a Fairmont.
[00:16.000 --> 00:24.000]  If you sign up for their, you get put into a raffle for their newsletter.
[00:24.000 --> 00:32.000]  Wow, that sounds a little... $1000 raffle.
[00:32.000 --> 00:34.000]  For a two night stay.
[00:34.000 --> 00:44.000]  We used to do sweepstakes to generate email subscriptions.
[00:44.000 --> 01:02.000]  And that was the way we did it for many, many years. And it led to almost 200,000 names in our database.
[01:02.000 --> 01:16.000]  And then we stopped doing it and when we converted over to MailChimp, we had to purge over 80,000 of them.
[01:16.000 --> 01:20.000]  The problem with these sweepstakes is that you don't get quality signups.
[01:20.000 --> 01:26.000]  Yeah. Like I just signed up for it, I'll probably never...
[01:26.000 --> 01:31.000]  Exactly. The last thing we need is more Andrew Clarke signing up.
[01:31.000 --> 01:34.000]  Yep. They'

#Meeting Note Generator


In [18]:
# Install necessary packages
!pip install -q openai tiktoken pandas requests python-docx

# Mount Google Drive
print("Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# Define folders
transcripts_folder = '/content/'  # Update this path if your transcripts are in a different location
meeting_notes_folder = '/content/drive/MyDrive/DMO/'  # Update this path as needed

# Import necessary libraries
import os
import math
import pandas as pd
import logging
from typing import List, Dict
import tiktoken
import json
import requests
import re
from datetime import timedelta, datetime  # Added datetime for dynamic date
import time
from docx import Document  # For creating Word documents
import getpass  # For secure API key input

# Set up logger for debugging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_NAME = 'gpt-4o-mini'  # Ensure this is the correct model name
MAX_MODEL_TOKENS = 128000
MAX_RESPONSE_TOKENS = 10000

# Retrieve API key
from google.colab import userdata
api_key = userdata.get('api_key')
if not api_key:
    api_key = input("Please enter your OpenAI API key: ")

# Client Wrapper Class
class ClientWrapper:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.endpoint = "https://api.openai.com/v1/chat/completions"

    def create_chat_completion(self, model: str, messages: List[Dict], temperature: float, max_tokens: int):
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        response = requests.post(self.endpoint, headers=headers, json=payload)
        if response.status_code != 200:
            raise Exception(
                f"OpenAI API request failed: {response.status_code} {response.text}"
            )
        return response.json()

client = ClientWrapper(api_key=api_key)

# Token Counting Function
def num_tokens_from_messages(messages: List[Dict], model: str) -> int:
    """Count the tokens of given messages using tiktoken."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback encoding if the model is not recognized
        encoding = tiktoken.get_encoding("cl100k_base")

    if model.startswith("gpt-4"):
        tokens_per_message = 3
        tokens_per_name = 1
    elif model.startswith("gpt-3.5-turbo"):
        tokens_per_message = 4
        tokens_per_name = -1
    else:
        tokens_per_message = 3
        tokens_per_name = 1

    total_tokens = 0
    for message in messages:
        total_tokens += tokens_per_message
        for key, value in message.items():
            total_tokens += len(encoding.encode(value))
            if key == "name":
                total_tokens += tokens_per_name
    total_tokens += 2
    return total_tokens

# Content Chunking Function
def chunk_text_content(text_content: str, max_chunk_tokens: int, model: str) -> List[str]:
    """Chunk text content into pieces that fit within the token limit."""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text_content)
    total_tokens = len(tokens)
    logger.debug(f"Total tokens in text content: {total_tokens}")
    chunks = []
    start_idx = 0
    while start_idx < total_tokens:
        end_idx = min(start_idx + max_chunk_tokens, total_tokens)
        chunk_tokens = tokens[start_idx:end_idx]
        chunk = encoding.decode(chunk_tokens)
        chunks.append(chunk)
        start_idx = end_idx
    return chunks

# Mappings Combination Function (Unused in this context)
def combine_mappings(mappings_list: List[Dict]) -> Dict:
    """Combine multiple mappings into one."""
    combined_mapping = {}
    for mapping in mappings_list:
        for key, value in mapping.items():
            if key == 'content_type':
                combined_mapping[key] = value
            else:
                if isinstance(value, dict) and isinstance(combined_mapping.get(key), dict):
                    # Merge nested dicts
                    for sub_key, sub_val in value.items():
                        if (sub_key not in combined_mapping[key] or
                            not combined_mapping[key][sub_key] or
                            combined_mapping[key][sub_key] == "not found"):
                            combined_mapping[key][sub_key] = sub_val
                else:
                    if (key not in combined_mapping or
                        not combined_mapping[key] or
                        combined_mapping[key] == "not found"):
                        combined_mapping[key] = value
    return combined_mapping

# System Prompt for Meeting Notes Generation
system_prompt = """
You are an advanced AI assistant specialized in processing meeting transcripts to generate comprehensive and structured meeting notes. Your primary task is to convert raw transcript text from weekly marketing meetings of a destination marketing organization into organized meeting notes. These meetings involve team members discussing their current projects, schedules, agency collaborations, and other relevant topics.

**Requirements for the Meeting Notes:**

1. **Project Updates**
   - **Project Name:** Clearly identify and state the name of each project being discussed to ensure all team members are aligned on which initiatives are being referenced.
   - **Updates:** Provide a concise summary of the latest developments, progress, or changes related to each project. Include any new strategies implemented, milestones achieved, or adjustments made to project plans.
   - **Milestones Tracking:** Highlight if any significant milestones that have been reached since the last meeting. This could include completed phases, successful launches, or noteworthy accomplishments that indicate project advancement. Outline the next set of milestones or actions required to move the project forward. Specify expected dates, responsible team members, and any preparatory steps needed to achieve these upcoming goals. If none were discussed simply enter 'Not Discussed' in this section.

2. **To-Dos**
   - **Action Items:** List specific tasks or actions that need to be completed. Clearly assign each task to the responsible team member or department to ensure accountability and ownership.
   - **Teamwork Dependencies:** Identify any dependencies between tasks, team members, or departments that are necessary to accomplish the action items effectively. Highlight how different parts of the team need to coordinate to achieve these tasks. If none were mentioned simply enter 'None Discussed'
   - **Deadlines and Timeframes:** Specify deadlines or target dates for each action item to ensure timely completion. Include any interim deadlines for sub-tasks if applicable to maintain progress tracking. If none were mentioned simply enter 'Not Discussed'.

**Formatting Guidelines:**

- **Title:** Begin with a clear title that includes the meeting name and date. For example: "Marketing Standup"

- **Sections:** Organize the notes into the relevant main sections: Project Updates and To-Dos using bold headings for each section.

- **Project Details:** Under Project Updates, list each project as a subheading followed by bullet points detailing the updates.

- **Action Items:** Under To-Dos, list each action item with clear attribution to the responsible team member. Indicate any dependencies, collaborations, and deadlines mentioned.

**Processing Instructions:**

1. **Input:** You will receive transcript text data with timestamps from a weekly marketing meeting.

2. **Extraction:**
   - Identify and extract discussions related to project updates, ensuring each project is clearly named and its updates are succinctly summarized.
   - Determine action items and assign them to the appropriate team members, noting any dependencies or required collaborations.
   - Summarize the overall meeting and list any meetings scheduled for the upcoming week with their respective times.

3. **Output:** Generate the meeting notes following the structure and formatting guidelines provided above. Ensure clarity, conciseness, and organization to facilitate easy reference and action by team members.

**Additional Notes:**

- Maintain consistency in formatting to ensure readability.
- Use bullet points for clarity and brevity.
- Ensure all relevant information from the transcript is captured accurately in the meeting notes.
- Omit any irrelevant or redundant information that does not pertain to the three main sections.

By adhering to these guidelines, you will produce effective and actionable meeting notes that enhance team collaboration and project management.
"""

# User Prompt Template
prompt_user_template = """
Analyze this transcript and generate the meeting notes as specified:

<Transcript Here>
"""

# Function to Generate Meeting Notes from Transcript
def generate_meeting_notes(transcript: str, meeting_date: str, meeting_name: str = "Marketing Standup") -> str:
    """Generate structured meeting notes from a transcript."""
    system_msg = system_prompt
    user_msg = prompt_user_template.replace("<Transcript Here>", transcript)

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]

    total_tokens = num_tokens_from_messages(messages, MODEL_NAME)
    max_prompt_tokens = MAX_MODEL_TOKENS - MAX_RESPONSE_TOKENS

    if total_tokens > max_prompt_tokens:
        logger.debug(f"Transcript exceeds token limit. Chunking necessary.")
        max_chunk_tokens = max_prompt_tokens // 2  # Adjust as needed
        chunks = chunk_text_content(transcript, max_chunk_tokens, MODEL_NAME)
    else:
        chunks = [transcript]

    meeting_notes_sections = []

    for i, chunk in enumerate(chunks):
        logger.debug(f"Processing chunk {i+1}/{len(chunks)}")
        current_user_msg = prompt_user_template.replace("<Transcript Here>", chunk)
        current_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": current_user_msg}
        ]
        try:
            response = client.create_chat_completion(
                model=MODEL_NAME,
                messages=current_messages,
                temperature=0.3,
                max_tokens=MAX_RESPONSE_TOKENS
            )
            content = response['choices'][0]['message']['content'].strip()
            meeting_notes_sections.append(content)
        except Exception as e:
            logger.error(f"Error generating meeting notes for chunk {i+1}: {e}")
            meeting_notes_sections.append(f"Error processing chunk {i+1}")

    # Combine all sections
    final_meeting_notes = "\n\n".join(meeting_notes_sections)

    # Add Title
    final_meeting_notes = f"{meeting_name} - {meeting_date}\n\n" + final_meeting_notes

    return final_meeting_notes

# Function to Load Existing Transcript
def load_existing_transcript(transcripts_folder: str) -> str:
    """Load the existing transcript from the transcripts folder."""
    yt_transcript_path = os.path.join(transcripts_folder, 'YTtranscript_en.txt')
    whisper_transcript_path = os.path.join(transcripts_folder, 'Whisper_transcript_en.txt')

    if os.path.exists(yt_transcript_path):
        print("Loading YouTube transcript...")
        with open(yt_transcript_path, 'r', encoding='utf-8') as f:
            transcript = f.read()
        return transcript
    elif os.path.exists(whisper_transcript_path):
        print("Loading Whisper transcript...")
        with open(whisper_transcript_path, 'r', encoding='utf-8') as f:
            transcript = f.read()
        return transcript
    else:
        raise FileNotFoundError("No transcript file found in the transcripts folder.")

# Function to Save Meeting Notes as a Word Document
def save_meeting_notes_as_word(meeting_notes: str, meeting_notes_path: str):
    """Save the meeting notes as a Word document."""
    try:
        document = Document()
        lines = meeting_notes.split('\n')
        for line in lines:
            stripped_line = line.strip()
            if stripped_line.startswith("**") and stripped_line.endswith("**"):
                # Add as a heading
                heading_text = stripped_line.strip("*")
                document.add_heading(heading_text, level=1)
            elif stripped_line.startswith("*") and stripped_line.endswith("*"):
                # Add as a subheading
                subheading_text = stripped_line.strip("*")
                document.add_heading(subheading_text, level=2)
            else:
                # Add as a paragraph
                document.add_paragraph(line)
        document.save(meeting_notes_path)
        print(f"Meeting notes saved to {meeting_notes_path}")
    except Exception as e:
        print(f"Error saving meeting notes as Word document: {str(e)}")

# Main Processing Function
def main():
    # Dynamically assign today's date in MM/DD/YYYY format
    meeting_date = datetime.now().strftime("%m/%d/%Y")

    try:
        # Load existing transcript
        print("Loading existing transcript...")
        transcript = load_existing_transcript(transcripts_folder)
    except FileNotFoundError as e:
        print(str(e))
        return
    except Exception as e:
        print(f"An error occurred while loading the transcript: {str(e)}")
        return

    # Generate meeting notes
    print("\n=== Generating Meeting Notes ===")
    meeting_notes = generate_meeting_notes(transcript, meeting_date)

    # Define meeting notes filename
    meeting_notes_filename = f"Marketing_Standup_{meeting_date.replace('/', '-')}.docx"
    meeting_notes_path = os.path.join(meeting_notes_folder, meeting_notes_filename)

    # Save meeting notes as a Word document
    save_meeting_notes_as_word(meeting_notes, meeting_notes_path)

    # Optionally, display the meeting notes
    print("\n=== Generated Meeting Notes ===")
    print(meeting_notes)

if __name__ == "__main__":
    main()


Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
Loading existing transcript...
Loading Whisper transcript...

=== Generating Meeting Notes ===
Meeting notes saved to /content/drive/MyDrive/DMO/Marketing_Standup_12-16-2024.docx

=== Generated Meeting Notes ===
Marketing Standup - 12/16/2024

# Marketing Standup - [Date Not Specified]

## Project Updates

### Google Ads
- **Updates:** Working on the statement of work for Google Ads.
- **Milestones Tracking:** Follow-up needed with Joanne regarding the Explore Georgia ad. New ads are available in the correct dimensions.

### Clip Trips
- **Updates:** Meeting held with Clip Trips representatives. Awaiting further communication from Travis to discuss next steps.

### Max Award Submission
- **Updates:** Currently gathering numbers from Telemundo and the website. A video script is being developed to pres